diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 8f9495d83cde2..5160851f8c442 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -75,6 +75,13 @@ static cl::opt InlineMaxBB( cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)")); +// This default unroll factor is based on microbenchmarks on gfx1030. +static cl::opt MemcpyLoopUnroll( + "amdgpu-memcpy-loop-unroll", + cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " + "operations when lowering memcpy as a loop"), + cl::init(16), cl::Hidden); + static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth = 0) { const Instruction *I = dyn_cast(Cond); @@ -409,13 +416,8 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { return 1024; } -// FIXME: Really we would like to issue multiple 128-bit loads and stores per -// iteration. Should we report a larger size and let it legalize? -// // FIXME: Should we use narrower types for local/region, or account for when // unaligned access is legal? -// -// FIXME: This could use fine tuning and microbenchmarks. Type *GCNTTIImpl::getMemcpyLoopLoweringType( LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, @@ -442,9 +444,22 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType( return FixedVectorType::get(Type::getInt32Ty(Context), 2); } - // Global memory works best with 16-byte accesses. Private memory will also - // hit this, although they'll be decomposed. - return FixedVectorType::get(Type::getInt32Ty(Context), 4); + // Global memory works best with 16-byte accesses. + // If the operation has a fixed known length that is large enough, it is + // worthwhile to return an even wider type and let legalization lower it into + // multiple accesses, effectively unrolling the memcpy loop. Private memory + // also hits this, although accesses may be decomposed. + // + // Don't unroll if Length is not a constant, since unrolling leads to worse + // performance for length values that are smaller or slightly larger than the + // total size of the type returned here. Mitigating that would require a more + // complex lowering for variable-length memcpy and memmove. + unsigned I32EltsInVector = 4; + if (MemcpyLoopUnroll > 0 && isa(Length)) + return FixedVectorType::get(Type::getInt32Ty(Context), + MemcpyLoopUnroll * I32EltsInVector); + + return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector); } void GCNTTIImpl::getMemcpyLoopResidualLoweringType( @@ -452,7 +467,6 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType( unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional AtomicCpySize) const { - assert(RemainingBytes < 16); if (AtomicCpySize) BaseT::getMemcpyLoopResidualLoweringType( @@ -462,6 +476,12 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType( Align MinAlign = std::min(SrcAlign, DestAlign); if (MinAlign != Align(2)) { + Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4); + while (RemainingBytes >= 16) { + OpsOut.push_back(I32x4Ty); + RemainingBytes -= 16; + } + Type *I64Ty = Type::getInt64Ty(Context); while (RemainingBytes >= 8) { OpsOut.push_back(I64Ty); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index 7f23434c9dfdd..75d4d8816fb30 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s -; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg) @@ -14,104 +14,176 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: v_mov_b32_e32 v4, s0 ; LOOP-NEXT: .LBB0_1: ; %load-store-loop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 +; LOOP-NEXT: s_waitcnt expcnt(2) +; LOOP-NEXT: v_add_i32_e32 v29, vcc, v2, v4 +; LOOP-NEXT: v_addc_u32_e32 v30, vcc, v3, v5, vcc +; LOOP-NEXT: buffer_load_ubyte v24, v[29:30], s[0:3], 0 addr64 +; LOOP-NEXT: buffer_load_ubyte v27, v[29:30], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_load_ubyte v34, v[29:30], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_load_ubyte v35, v[29:30], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_load_ubyte v36, v[29:30], s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: buffer_load_ubyte v37, v[29:30], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_load_ubyte v38, v[29:30], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_load_ubyte v39, v[29:30], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_load_ubyte v6, v[29:30], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: buffer_load_ubyte v9, v[29:30], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_load_ubyte v10, v[29:30], s[0:3], 0 addr64 offset:10 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc -; LOOP-NEXT: v_add_i32_e32 v8, vcc, v0, v4 -; LOOP-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc -; LOOP-NEXT: v_add_i32_e32 v4, vcc, 16, v4 -; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 -; LOOP-NEXT: buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:8 -; LOOP-NEXT: buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:10 -; LOOP-NEXT: buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_load_ubyte v6, v[6:7], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 16, v4 +; LOOP-NEXT: buffer_load_ubyte v11, v[29:30], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_load_ubyte v7, v[29:30], s[0:3], 0 addr64 offset:12 +; LOOP-NEXT: buffer_load_ubyte v13, v[29:30], s[0:3], 0 addr64 offset:13 +; LOOP-NEXT: buffer_load_ubyte v14, v[29:30], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_load_ubyte v15, v[29:30], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_load_ubyte v8, v[29:30], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_load_ubyte v17, v[29:30], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_load_ubyte v18, v[29:30], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_load_ubyte v19, v[29:30], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_load_ubyte v12, v[29:30], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_load_ubyte v21, v[29:30], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_load_ubyte v22, v[29:30], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_load_ubyte v23, v[29:30], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_load_ubyte v16, v[29:30], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_load_ubyte v25, v[29:30], s[0:3], 0 addr64 offset:25 +; LOOP-NEXT: buffer_load_ubyte v26, v[29:30], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_load_ubyte v28, v[29:30], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_load_ubyte v20, v[29:30], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: buffer_load_ubyte v31, v[29:30], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_load_ubyte v32, v[29:30], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_load_ubyte v33, v[29:30], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_waitcnt vmcnt(14) -; LOOP-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; LOOP-NEXT: v_or_b32_e32 v24, v27, v24 +; LOOP-NEXT: v_lshlrev_b32_e32 v27, 24, v35 +; LOOP-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; LOOP-NEXT: v_or_b32_e32 v27, v27, v29 +; LOOP-NEXT: v_lshlrev_b32_e32 v29, 8, v37 +; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v39 +; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v38 +; LOOP-NEXT: v_or_b32_e32 v29, v29, v36 +; LOOP-NEXT: v_or_b32_e32 v30, v30, v34 +; LOOP-NEXT: v_add_i32_e32 v34, vcc, v0, v4 +; LOOP-NEXT: v_addc_u32_e32 v35, vcc, v1, v5, vcc +; LOOP-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v4 +; LOOP-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; LOOP-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; LOOP-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; LOOP-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; LOOP-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; LOOP-NEXT: s_waitcnt vmcnt(12) -; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v13 -; LOOP-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; LOOP-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; LOOP-NEXT: s_waitcnt vmcnt(10) -; LOOP-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; LOOP-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; LOOP-NEXT: s_waitcnt vmcnt(8) -; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v17 -; LOOP-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; LOOP-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; LOOP-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; LOOP-NEXT: s_waitcnt vmcnt(6) -; LOOP-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; LOOP-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; LOOP-NEXT: s_waitcnt vmcnt(4) -; LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v21 -; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; LOOP-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; LOOP-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; LOOP-NEXT: s_waitcnt vmcnt(2) -; LOOP-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; LOOP-NEXT: v_lshlrev_b32_e32 v31, 8, v31 ; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; LOOP-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; LOOP-NEXT: v_or_b32_e32 v7, v7, v10 -; LOOP-NEXT: v_or_b32_e32 v10, v11, v12 -; LOOP-NEXT: v_or_b32_e32 v11, v13, v14 -; LOOP-NEXT: v_or_b32_e32 v12, v15, v16 -; LOOP-NEXT: v_or_b32_e32 v13, v17, v18 -; LOOP-NEXT: v_or_b32_e32 v14, v19, v20 -; LOOP-NEXT: v_or_b32_e32 v15, v21, v22 -; LOOP-NEXT: v_or_b32_e32 v6, v6, v23 +; LOOP-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; LOOP-NEXT: v_or_b32_e32 v6, v9, v6 +; LOOP-NEXT: v_or_b32_e32 v9, v11, v10 +; LOOP-NEXT: v_or_b32_e32 v7, v13, v7 +; LOOP-NEXT: v_or_b32_e32 v10, v15, v14 +; LOOP-NEXT: v_or_b32_e32 v8, v17, v8 +; LOOP-NEXT: v_or_b32_e32 v11, v19, v18 +; LOOP-NEXT: v_or_b32_e32 v12, v21, v12 +; LOOP-NEXT: v_or_b32_e32 v13, v23, v22 +; LOOP-NEXT: v_or_b32_e32 v14, v25, v16 +; LOOP-NEXT: v_or_b32_e32 v15, v28, v26 +; LOOP-NEXT: v_or_b32_e32 v16, v31, v20 +; LOOP-NEXT: v_or_b32_e32 v17, v33, v32 +; LOOP-NEXT: v_or_b32_e32 v18, v27, v24 +; LOOP-NEXT: v_or_b32_e32 v19, v30, v29 +; LOOP-NEXT: v_or_b32_e32 v6, v9, v6 ; LOOP-NEXT: v_or_b32_e32 v7, v10, v7 -; LOOP-NEXT: v_or_b32_e32 v10, v12, v11 -; LOOP-NEXT: v_or_b32_e32 v11, v14, v13 -; LOOP-NEXT: v_or_b32_e32 v6, v6, v15 -; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; LOOP-NEXT: v_bfe_u32 v13, v7, 8, 8 -; LOOP-NEXT: buffer_store_byte v7, v[8:9], s[0:3], 0 addr64 +; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 +; LOOP-NEXT: v_or_b32_e32 v9, v13, v12 +; LOOP-NEXT: v_or_b32_e32 v10, v15, v14 +; LOOP-NEXT: v_or_b32_e32 v11, v17, v16 +; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; LOOP-NEXT: v_bfe_u32 v13, v18, 8, 8 +; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 +; LOOP-NEXT: v_lshrrev_b32_e32 v14, 24, v18 +; LOOP-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; LOOP-NEXT: v_bfe_u32 v16, v19, 8, 8 +; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: v_lshrrev_b32_e32 v17, 24, v19 +; LOOP-NEXT: s_waitcnt expcnt(1) +; LOOP-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_bfe_u32 v19, v6, 8, 8 +; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; LOOP-NEXT: v_bfe_u32 v21, v7, 8, 8 +; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:12 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; LOOP-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; LOOP-NEXT: v_bfe_u32 v15, v10, 8, 8 -; LOOP-NEXT: buffer_store_byte v10, v[8:9], s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; LOOP-NEXT: v_bfe_u32 v23, v8, 8, 8 +; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v8 +; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; LOOP-NEXT: v_bfe_u32 v25, v9, 8, 8 +; LOOP-NEXT: buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v9, 24, v9 +; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; LOOP-NEXT: v_bfe_u32 v27, v10, 8, 8 +; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:24 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; LOOP-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; LOOP-NEXT: v_bfe_u32 v17, v11, 8, 8 -; LOOP-NEXT: buffer_store_byte v11, v[8:9], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; LOOP-NEXT: v_bfe_u32 v29, v11, 8, 8 +; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:28 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; LOOP-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; LOOP-NEXT: v_bfe_u32 v19, v6, 8, 8 -; LOOP-NEXT: buffer_store_byte v6, v[8:9], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; LOOP-NEXT: buffer_store_byte v13, v[8:9], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: buffer_store_byte v12, v[8:9], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_store_byte v7, v[8:9], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_store_byte v15, v[8:9], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_store_byte v14, v[8:9], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_store_byte v10, v[8:9], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_store_byte v17, v[8:9], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_store_byte v16, v[8:9], s[0:3], 0 addr64 offset:10 -; LOOP-NEXT: buffer_store_byte v11, v[8:9], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: buffer_store_byte v19, v[8:9], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_store_byte v18, v[8:9], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_store_byte v6, v[8:9], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_store_byte v13, v[34:35], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_store_byte v12, v[34:35], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_store_byte v14, v[34:35], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_store_byte v16, v[34:35], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_store_byte v15, v[34:35], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_store_byte v17, v[34:35], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_store_byte v21, v[34:35], s[0:3], 0 addr64 offset:13 +; LOOP-NEXT: buffer_store_byte v20, v[34:35], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_store_byte v23, v[34:35], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_store_byte v22, v[34:35], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_store_byte v25, v[34:35], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_store_byte v24, v[34:35], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_store_byte v27, v[34:35], s[0:3], 0 addr64 offset:25 +; LOOP-NEXT: buffer_store_byte v26, v[34:35], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_store_byte v29, v[34:35], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_store_byte v28, v[34:35], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_cbranch_vccnz .LBB0_1 ; LOOP-NEXT: ; %bb.2: ; %memcpy-split ; LOOP-NEXT: s_mov_b32 s2, 0 ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: s_mov_b64 s[0:1], 0 -; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:17 -; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:19 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:33 +; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:35 +; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:34 +; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:32 ; LOOP-NEXT: s_waitcnt vmcnt(3) ; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; LOOP-NEXT: s_waitcnt vmcnt(2) @@ -124,12 +196,12 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: v_or_b32_e32 v2, v3, v2 ; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8 -; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:17 -; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:33 +; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:34 +; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35 ; LOOP-NEXT: s_endpgm ; ; UNROLL-LABEL: memcpy_p1i8: @@ -212,11 +284,75 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:18 ; UNROLL-NEXT: s_waitcnt vmcnt(0) ; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18 -; UNROLL-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:19 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:19 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:20 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:20 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:21 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:21 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:22 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:22 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:23 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:23 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:24 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:24 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:25 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:25 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:26 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:26 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:27 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:27 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:28 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:28 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:29 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:29 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:30 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:30 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:31 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:31 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:32 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:32 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:33 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:33 +; UNROLL-NEXT: s_waitcnt expcnt(0) +; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:34 +; UNROLL-NEXT: s_waitcnt vmcnt(0) +; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:34 +; UNROLL-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:35 ; UNROLL-NEXT: s_waitcnt vmcnt(0) -; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19 +; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35 ; UNROLL-NEXT: s_endpgm - call void @llvm.memcpy.p1.p1.i32(ptr addrspace(1) %dst, ptr addrspace(1) %src, i32 20, i1 false) + call void @llvm.memcpy.p1.p1.i32(ptr addrspace(1) %dst, ptr addrspace(1) %src, i32 36, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index a95f22507eece..ffe9e06c04ae4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -46,10 +46,10 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) ; ALL: load-store-loop: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1 +; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 -; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 +; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; ALL: memcpy-split: @@ -66,10 +66,10 @@ define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1) ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -93,20 +93,20 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1 ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 16 +; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 256 ; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 -; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 +; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1024 ; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -128,20 +128,20 @@ define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1 ; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]] ; OPT: memmove_bwd_loop: ; OPT-NEXT: [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ] -; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP4]], 16 +; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP4]], 256 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] -; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 +; OPT-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] -; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1 +; OPT-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1 ; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; OPT: memmove_fwd_loop: ; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ] ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] -; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1 +; OPT-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP8]], align 1 ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] -; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1 -; OPT-NEXT: [[TMP10]] = add i64 [[FWD_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1 +; OPT-NEXT: [[TMP10]] = add i64 [[FWD_INDEX]], 256 ; OPT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 1024 ; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]] ; OPT: memmove_fwd_residual: @@ -421,17 +421,30 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; ALL-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] ; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] ; ALL: post-loop-memcpy-expansion: -; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: -; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP19:%.*]], [[LOAD_STORE_LOOP]] ] -; ALL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]] +; ALL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 0 ; ALL-NEXT: [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP16]], align 1 -; ALL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]] +; ALL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 0 ; ALL-NEXT: store <4 x i32> [[TMP17]], ptr addrspace(1) [[TMP18]], align 1 -; ALL-NEXT: [[TMP19]] = add i64 [[LOOP_INDEX]], 16 -; ALL-NEXT: [[TMP20:%.*]] = icmp ult i64 [[TMP19]], 96 -; ALL-NEXT: br i1 [[TMP20]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 16 +; ALL-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP33]], align 1 +; ALL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 16 +; ALL-NEXT: store <4 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 1 +; ALL-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 32 +; ALL-NEXT: [[TMP35:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP34]], align 1 +; ALL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 32 +; ALL-NEXT: store <4 x i32> [[TMP35]], ptr addrspace(1) [[TMP36]], align 1 +; ALL-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 48 +; ALL-NEXT: [[TMP38:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP37]], align 1 +; ALL-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 48 +; ALL-NEXT: store <4 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 1 +; ALL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 64 +; ALL-NEXT: [[TMP28:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP40]], align 1 +; ALL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 64 +; ALL-NEXT: store <4 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 1 +; ALL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 80 +; ALL-NEXT: [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP30]], align 1 +; ALL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 80 +; ALL-NEXT: store <4 x i32> [[TMP31]], ptr addrspace(1) [[TMP32]], align 1 ; ALL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 96 ; ALL-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 1 ; ALL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 96 @@ -456,10 +469,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -479,10 +492,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -502,10 +515,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -525,10 +538,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -548,10 +561,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -575,10 +588,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -606,10 +619,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -633,10 +646,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -691,10 +704,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -764,10 +777,10 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -814,10 +827,10 @@ define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspa ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1 -; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1 +; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -864,10 +877,10 @@ define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspa ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4 -; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: @@ -1194,17 +1207,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_16(ptr addrspace(1 ; MAX1024-NEXT: ret void ; ; ALL-LABEL: @memcpy_global_align4_global_align4_16( -; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: -; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] -; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] +; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 -; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] +; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 ; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 -; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16 -; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: ; ALL-NEXT: ret void ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 16, i1 false) @@ -1326,20 +1332,20 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 16 +; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP6]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP6]], align 1 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 -; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 +; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -1360,20 +1366,20 @@ define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %d ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 16 +; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1 -; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1 +; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -1394,20 +1400,20 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 16 +; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP3]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP6]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP6]], align 1 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 -; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 +; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -1428,20 +1434,20 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) % ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 16 +; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1 -; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1 +; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -1461,10 +1467,10 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) ; ALL: load-store-loop: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]] +; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]] ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]] -; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]] +; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; ALL: memcpy-split: @@ -1484,10 +1490,10 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) ; ALL: load-store-loop: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]] +; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]] ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]] -; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]] +; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; ALL: memcpy-split: @@ -2144,20 +2150,20 @@ define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5 ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 16 +; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256 ; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP2]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP2]], align 1 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP5]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP5]], align 1 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]] -; ALL-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1 -; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 16 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1 +; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256 ; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -2231,27 +2237,27 @@ define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrs ; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]] ; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; OPT: memmove_bwd_loop: -; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1040, [[TMP0:%.*]] ] -; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 16 +; OPT-NEXT: [[TMP11:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1280, [[TMP0:%.*]] ] +; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP11]], 256 ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] -; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1 +; OPT-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] -; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 +; OPT-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 ; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; OPT: memmove_fwd_loop: ; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] -; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 +; OPT-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] -; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 -; OPT-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 16 -; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1040 +; OPT-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 +; OPT-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256 +; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1280 ; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; OPT: memmove_done: ; OPT-NEXT: ret void ; - call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1040, i1 false) + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1280, i1 false) ret void } @@ -2279,20 +2285,20 @@ define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrsp ; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]] ; OPT: memmove_bwd_loop: ; OPT-NEXT: [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ] -; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP13]], 16 +; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP13]], 256 ; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] -; OPT-NEXT: [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP14]], align 1 +; OPT-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP14]], align 1 ; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] -; OPT-NEXT: store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1 +; OPT-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1 ; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; OPT: memmove_fwd_loop: ; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ] ; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] -; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP17]], align 1 +; OPT-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP17]], align 1 ; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] -; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1 -; OPT-NEXT: [[TMP19]] = add i64 [[FWD_INDEX]], 16 +; OPT-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1 +; OPT-NEXT: [[TMP19]] = add i64 [[FWD_INDEX]], 256 ; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 1024 ; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]] ; OPT: memmove_fwd_residual: @@ -2363,40 +2369,40 @@ entry: define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; MAX1024-LABEL: @memmove_volatile( -; MAX1024-NEXT: call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 64, i1 true) +; MAX1024-NEXT: call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true) ; MAX1024-NEXT: ret void ; ; ALL-LABEL: @memmove_volatile( ; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]] ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: -; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 16 +; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 512, [[TMP0:%.*]] ] +; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 256 ; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP2]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP2]], align 1 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] -; ALL-NEXT: store volatile <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 +; ALL-NEXT: store volatile <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP5]], align 1 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] -; ALL-NEXT: store volatile <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 -; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 16 -; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64 +; ALL-NEXT: store volatile <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 +; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256 +; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 512 ; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: ; ALL-NEXT: ret void ; - call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 64, i1 true) + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true) ret void } define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; MAX1024-LABEL: @memcpy_volatile( -; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 64, i1 true) +; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true) ; MAX1024-NEXT: ret void ; ; ALL-LABEL: @memcpy_volatile( @@ -2404,16 +2410,16 @@ define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace( ; ALL: load-store-loop: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP2:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP1]], align 1 +; ALL-NEXT: [[TMP2:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; ALL-NEXT: store volatile <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 -; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16 -; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64 +; ALL-NEXT: store volatile <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 +; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 +; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 512 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; ALL: memcpy-split: ; ALL-NEXT: ret void ; - call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 64, i1 true) + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll new file mode 100644 index 0000000000000..565fce0e7abde --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -0,0 +1,16049 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -mattr=-unaligned-access-mode %s -o - | FileCheck -check-prefix=ALIGNED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-memcpy-loop-unroll=3 %s -o - | FileCheck -check-prefix=UNROLL3 %s + +; For checking that LowerMemIntrinsics lowers memcpy and memmove with large +; constant copy-sizes into loops with multiple load/store pairs. + + +; memcpy for address spaces 0, 1, 4, 5 + +define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p0_p0_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB0_1: ; %load-store-loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memcpy_p0_p0_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 +; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 +; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 +; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memcpy_p0_p0_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB0_1: ; %load-store-loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16 +; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB0_1 +; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 +; UNROLL3-NEXT: flat_load_dwordx4 v[2:5], v[2:3] offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p1_p1_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB1_1: ; %load-store-loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memcpy_p1_p1_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: .LBB1_1: ; %load-store-loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:254 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:250 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:246 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:244 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:242 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:238 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:234 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:230 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:226 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:222 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:218 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:214 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:210 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:206 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:202 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:198 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:194 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:190 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:186 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:182 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:178 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:174 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:170 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:166 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:162 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:158 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:154 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:150 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:146 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:142 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:138 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:134 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:130 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:122 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:118 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:114 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:110 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:106 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:102 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:98 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:90 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:86 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:82 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:78 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:74 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:70 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:66 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:62 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:58 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:54 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:50 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:147 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:145 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:143 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:141 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:139 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:137 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:133 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:131 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:129 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:127 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:123 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:121 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:117 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:115 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:113 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:111 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:109 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:107 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:105 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:101 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:99 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:97 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:95 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:91 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:89 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:85 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:83 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:81 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:79 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:77 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:75 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:73 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:71 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:69 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:67 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:65 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:63 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:61 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:59 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:57 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:19 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 +; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 +; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 +; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 +; ALIGNED-NEXT: global_store_byte v[16:17], v4, off +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15 +; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11 +; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7 +; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 +; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 +; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memcpy_p1_p1_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB1_1: ; %load-store-loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB1_1 +; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 +; UNROLL3-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p0_p4_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB2_1: ; %load-store-loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memcpy_p0_p4_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 +; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 +; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 +; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 +; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 +; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 +; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 +; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 +; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 +; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 +; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 +; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memcpy_p0_p4_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB2_1: ; %load-store-loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB2_1 +; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: s_clause 0x1 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2032 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p5_p5_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB3_1: ; %load-store-loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3e +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(62) +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 +; CHECK-NEXT: s_waitcnt vmcnt(61) +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 +; CHECK-NEXT: s_waitcnt vmcnt(60) +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(59) +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 +; CHECK-NEXT: s_waitcnt vmcnt(58) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 +; CHECK-NEXT: s_waitcnt vmcnt(57) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 +; CHECK-NEXT: s_waitcnt vmcnt(56) +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(55) +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:220 +; CHECK-NEXT: s_waitcnt vmcnt(54) +; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; CHECK-NEXT: s_waitcnt vmcnt(53) +; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:212 +; CHECK-NEXT: s_waitcnt vmcnt(52) +; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(51) +; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; CHECK-NEXT: s_waitcnt vmcnt(50) +; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 +; CHECK-NEXT: s_waitcnt vmcnt(49) +; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:196 +; CHECK-NEXT: s_waitcnt vmcnt(48) +; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(47) +; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:188 +; CHECK-NEXT: s_waitcnt vmcnt(46) +; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 +; CHECK-NEXT: s_waitcnt vmcnt(45) +; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:180 +; CHECK-NEXT: s_waitcnt vmcnt(44) +; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(43) +; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:172 +; CHECK-NEXT: s_waitcnt vmcnt(42) +; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 +; CHECK-NEXT: s_waitcnt vmcnt(41) +; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:164 +; CHECK-NEXT: s_waitcnt vmcnt(40) +; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(39) +; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:156 +; CHECK-NEXT: s_waitcnt vmcnt(38) +; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 +; CHECK-NEXT: s_waitcnt vmcnt(37) +; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:148 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:116 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:104 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:88 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:84 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:76 +; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:68 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:56 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:52 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen +; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 +; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memcpy_p5_p5_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: s_clause 0x34 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_clause 0xa +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen +; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 +; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v124, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v123, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v122, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v121, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v120, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v111, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v110, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v109, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v108, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v107, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v106, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v105, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v104, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v95, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v94, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v93, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v92, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v91, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v90, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v89, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v88, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v79, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_load_dword v78, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v77, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v76, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v75, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_load_dword v74, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v73, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memcpy_p5_p5_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: v_mov_b32_e32 v2, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .LBB3_1: ; %load-store-loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_clause 0xb +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 48, v2 +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: s_waitcnt vmcnt(11) +; UNROLL3-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: s_waitcnt vmcnt(10) +; UNROLL3-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: s_waitcnt vmcnt(9) +; UNROLL3-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: s_waitcnt vmcnt(8) +; UNROLL3-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: s_waitcnt vmcnt(7) +; UNROLL3-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: s_waitcnt vmcnt(6) +; UNROLL3-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: s_waitcnt vmcnt(5) +; UNROLL3-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB3_1 +; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p0_p5_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB4_1: ; %load-store-loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3e +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(41) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(37) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; CHECK-NEXT: s_cbranch_vccnz .LBB4_1 +; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memcpy_p0_p5_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill +; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v125 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v123, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 8, v121 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v93, 8, v105 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 8, v89 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v79 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v59 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v57, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v46 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v42 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v98, 8, v100 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v96 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v97, 8, v99 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v83 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v70 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v68 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v67 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v53, 8, v66 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v50 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v34 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v28, 8, v32 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x17 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: v_lshl_or_b32 v124, v4, 16, v3 +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v27 +; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v24, 8, v26 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v44, v12, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v58, v8, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v104, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v19 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v101, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v15, 8, v18 +; ALIGNED-NEXT: v_lshl_or_b32 v84, v44, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v44, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v58, 16, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v44, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v7, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v58, 16, v44 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v44, v44, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v58, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v44, v90, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v95, 8, v92 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v44, v111, 8, v122 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v58, v110, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v44, v92, 8, v95 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v58, v94, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v44 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v127, v58, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v0, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:250 +; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:249 +; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:255 +; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:253 +; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 +; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:252 +; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:248 +; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:242 +; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 +; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:241 +; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:247 +; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:245 +; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 +; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:240 +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:234 +; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:235 +; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:233 +; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:239 +; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:237 +; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:238 +; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:236 +; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:232 +; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:226 +; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:227 +; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:225 +; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:231 +; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:229 +; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:230 +; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:228 +; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:224 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213 +; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:215 +; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:209 +; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211 +; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:210 +; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:214 +; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218 +; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:219 +; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:217 +; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:223 +; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221 +; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:222 +; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:220 +; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:216 +; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:208 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:202 +; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:203 +; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:201 +; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:207 +; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:205 +; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:206 +; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:204 +; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:194 +; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:195 +; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:193 +; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:199 +; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:197 +; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:198 +; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:192 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:186 +; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:187 +; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:185 +; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:191 +; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:189 +; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:190 +; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:188 +; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:184 +; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:178 +; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:179 +; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:177 +; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:183 +; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:181 +; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:182 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:180 +; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:170 +; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:171 +; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:169 +; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:175 +; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:173 +; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:174 +; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:172 +; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:168 +; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:162 +; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 +; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:161 +; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 +; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:165 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:164 +; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:159 +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:157 +; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:158 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:156 +; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:152 +; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:146 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:151 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 +; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 +; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:17 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 +; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:11 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:13 +; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:9 +; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:15 +; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:14 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:12 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 +; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1 +; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v124, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v123, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v122, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v121, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v120, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v111, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v110, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v109, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v108, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v107, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v106, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v105, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v104, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v95, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v94, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v93, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v92, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v91, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v90, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v89, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v88, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v79, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_load_dword v78, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v77, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v76, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v75, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_load_dword v74, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v73, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memcpy_p0_p5_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: v_mov_b32_e32 v3, v2 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: s_inst_prefetch 0x1 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB4_1: ; %load-store-loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_clause 0xb +; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v8, v3, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v13, v3, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v14, v3, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v15, v3, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 +; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] +; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_vccnz .LBB4_1 +; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: s_inst_prefetch 0x2 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2016 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + + +; memmove for address spaces 0, 1, 4, 5 + +define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) { +; CHECK-LABEL: memmove_p0_p0_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execz .LBB5_3 +; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: s_cbranch_scc1 .LBB5_2 +; CHECK-NEXT: .LBB5_3: ; %Flow5 +; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: s_cbranch_execz .LBB5_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; CHECK-NEXT: s_movk_i32 s6, 0xff00 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 +; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 +; CHECK-NEXT: .LBB5_6: ; %Flow6 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memmove_p0_p0_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b32 s4, exec_lo +; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 +; ALIGNED-NEXT: s_cbranch_execz .LBB5_3 +; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[98:101], v[20:21] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[20:21] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[20:21] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:254 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:250 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:246 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:244 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:242 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:240 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:238 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:234 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:230 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:224 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:222 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:218 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:214 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:206 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:202 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:198 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:190 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:186 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:182 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:174 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:170 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:158 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:154 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:150 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:146 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:142 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:138 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:134 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:130 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:128 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:122 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:118 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:114 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:112 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:110 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:106 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:102 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:96 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:94 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:90 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:78 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:74 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:64 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:62 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:58 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:147 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:145 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:143 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:141 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:139 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:137 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:133 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:131 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:129 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:127 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:123 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:121 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:117 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:115 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:113 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:111 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:109 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:107 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:105 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:95 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:93 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:91 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:89 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:79 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:77 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:75 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:73 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:63 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:61 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:59 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:57 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 +; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 +; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_2 +; ALIGNED-NEXT: .LBB5_3: ; %Flow5 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: s_cbranch_execz .LBB5_6 +; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 +; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 +; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 +; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 +; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 +; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 +; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 +; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 +; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 +; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 +; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 +; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 +; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 +; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 +; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 +; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 +; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 +; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 +; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 +; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 +; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 +; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 +; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 +; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 +; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 +; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 +; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 +; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 +; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 +; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 +; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 +; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 +; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 +; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 +; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 +; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 +; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 +; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 +; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 +; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 +; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 +; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 +; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 +; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 +; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 +; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 +; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 +; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 +; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 +; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 +; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 +; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 +; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 +; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 +; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 +; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 +; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 +; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 +; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 +; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 +; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 +; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 +; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 +; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 +; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 +; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 +; ALIGNED-NEXT: .LBB5_6: ; %Flow6 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memmove_p0_p0_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, exec_lo +; UNROLL3-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 +; UNROLL3-NEXT: s_cbranch_execz .LBB5_4 +; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB5_2: ; %memmove_fwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16 +; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: s_cbranch_scc1 .LBB5_2 +; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual +; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 +; UNROLL3-NEXT: flat_load_dwordx4 v[2:5], v[2:3] offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032 +; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; UNROLL3-NEXT: .LBB5_4: ; %Flow3 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: s_cbranch_execz .LBB5_7 +; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual +; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2032 +; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 +; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032 +; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB5_6: ; %memmove_bwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16 +; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32 +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: s_cbranch_scc0 .LBB5_6 +; UNROLL3-NEXT: .LBB5_7: ; %Flow4 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { +; CHECK-LABEL: memmove_p1_p1_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execz .LBB6_3 +; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB6_2: ; %memmove_fwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: s_cbranch_scc1 .LBB6_2 +; CHECK-NEXT: .LBB6_3: ; %Flow9 +; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: s_cbranch_execz .LBB6_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; CHECK-NEXT: s_movk_i32 s6, 0xff00 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 +; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .LBB6_5: ; %memmove_bwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16 +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB6_5 +; CHECK-NEXT: .LBB6_6: ; %Flow10 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memmove_p1_p1_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b32 s4, exec_lo +; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 +; ALIGNED-NEXT: s_cbranch_execz .LBB6_3 +; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB6_2: ; %memmove_fwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[20:21], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[20:21], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[20:21], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[20:21], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[20:21], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[20:21], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[20:21], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[20:21], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[20:21], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[20:21], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[50:53], v[20:21], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[20:21], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[20:21], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[30:33], v[20:21], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[20:21], off offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:254 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:250 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:246 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:244 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:242 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:238 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:234 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:230 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:226 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:222 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:218 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:214 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:210 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:206 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:202 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:198 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:194 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:190 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:186 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:182 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:178 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:174 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:170 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:166 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:162 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:158 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:154 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:150 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:146 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:142 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:138 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:134 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:130 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:122 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:118 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:114 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:110 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:106 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:102 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:98 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:94 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:90 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:86 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:82 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:78 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:74 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:70 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:66 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:62 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:58 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:54 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:50 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:147 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:145 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:143 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:141 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:139 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:137 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:133 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:131 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:129 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:127 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:123 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:121 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:117 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:115 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:113 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:111 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:109 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:107 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:105 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:101 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:99 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:97 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:95 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:93 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:91 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:89 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:85 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:83 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:81 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:79 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:77 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:75 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:73 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:71 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:69 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:67 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:65 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:63 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:61 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:59 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:57 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:53 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:51 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:49 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:43 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:47 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:35 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:39 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:31 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:27 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:19 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 +; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 +; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 +; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 +; ALIGNED-NEXT: global_store_byte v[16:17], v4, off +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15 +; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11 +; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7 +; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 +; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 +; ALIGNED-NEXT: s_cbranch_scc1 .LBB6_2 +; ALIGNED-NEXT: .LBB6_3: ; %Flow9 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: s_cbranch_execz .LBB6_6 +; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 +; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .LBB6_5: ; %memmove_bwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:254 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:250 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:246 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:244 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:242 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:238 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:234 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:230 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:226 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:222 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:218 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:214 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:210 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:206 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:202 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:198 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:194 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:190 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:186 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:182 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:178 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:174 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:170 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:166 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:162 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:158 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:154 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:150 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:146 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:142 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:138 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:134 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:130 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:122 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:118 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:114 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:110 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:106 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:102 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:98 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:90 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:86 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:82 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:78 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:74 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:70 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:66 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:62 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:58 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:54 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:50 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:147 +; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:145 +; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:143 +; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:141 +; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:139 +; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:137 +; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 +; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:133 +; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:131 +; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:129 +; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:127 +; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 +; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:123 +; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:121 +; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 +; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:117 +; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:115 +; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:113 +; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:111 +; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:109 +; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:107 +; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:105 +; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 +; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:101 +; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:99 +; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:97 +; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:95 +; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93 +; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:91 +; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:89 +; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 +; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:85 +; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:83 +; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:81 +; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:79 +; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:77 +; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:75 +; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:73 +; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:71 +; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:69 +; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:67 +; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:65 +; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:63 +; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:61 +; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:59 +; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:57 +; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 +; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53 +; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51 +; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49 +; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43 +; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 +; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47 +; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 +; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35 +; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 +; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39 +; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 +; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 +; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 +; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 +; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:19 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 +; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 +; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 +; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 +; ALIGNED-NEXT: global_store_byte v[16:17], v4, off +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15 +; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13 +; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11 +; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9 +; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7 +; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5 +; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 +; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 +; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 +; ALIGNED-NEXT: .LBB6_6: ; %Flow10 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memmove_p1_p1_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, exec_lo +; UNROLL3-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 +; UNROLL3-NEXT: s_cbranch_execz .LBB6_4 +; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB6_2: ; %memmove_fwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: s_cbranch_scc1 .LBB6_2 +; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 +; UNROLL3-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032 +; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; UNROLL3-NEXT: .LBB6_4: ; %Flow7 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: s_cbranch_execz .LBB6_7 +; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 +; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 +; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2032 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB6_6: ; %memmove_bwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: s_cbranch_scc0 .LBB6_6 +; UNROLL3-NEXT: .LBB6_7: ; %Flow8 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) { +; CHECK-LABEL: memmove_p0_p4_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execz .LBB7_3 +; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: .LBB7_2: ; %memmove_fwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 +; CHECK-NEXT: .LBB7_3: ; %Flow6 +; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: s_cbranch_execz .LBB7_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; CHECK-NEXT: s_movk_i32 s6, 0xff00 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 +; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .LBB7_5: ; %memmove_bwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_5 +; CHECK-NEXT: .LBB7_6: ; %Flow7 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memmove_p0_p4_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_mov_b32 s4, exec_lo +; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 +; ALIGNED-NEXT: s_cbranch_execz .LBB7_3 +; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: .LBB7_2: ; %memmove_fwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:254 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:242 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:243 +; ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v82 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:239 +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v69 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:155 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:153 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:159 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:157 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:135 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:133 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:127 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:125 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:119 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:117 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v6 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:107 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:95 +; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:93 +; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:87 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 +; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 +; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:63 +; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:61 +; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 +; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 +; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 +; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:17 +; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:15 +; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:13 +; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:1 +; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 +; ALIGNED-NEXT: .LBB7_3: ; %Flow6 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: s_cbranch_execz .LBB7_6 +; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 +; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .LBB7_5: ; %memmove_bwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo +; ALIGNED-NEXT: s_clause 0xf +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 +; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 +; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 +; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 +; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 +; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 +; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 +; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 +; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 +; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 +; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 +; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 +; ALIGNED-NEXT: .LBB7_6: ; %Flow7 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memmove_p0_p4_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, exec_lo +; UNROLL3-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 +; UNROLL3-NEXT: s_cbranch_execz .LBB7_4 +; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB7_2: ; %memmove_fwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: s_cbranch_scc1 .LBB7_2 +; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual +; UNROLL3-NEXT: s_clause 0x1 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2032 +; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2032 +; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; UNROLL3-NEXT: .LBB7_4: ; %Flow4 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: s_cbranch_execz .LBB7_7 +; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual +; UNROLL3-NEXT: s_clause 0x1 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2016 +; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 +; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2016 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB7_6: ; %memmove_bwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_clause 0x2 +; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: s_cbranch_scc0 .LBB7_6 +; UNROLL3-NEXT: .LBB7_7: ; %Flow5 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) { +; CHECK-LABEL: memmove_p5_p5_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0 +; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execz .LBB8_3 +; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 +; CHECK-NEXT: .LBB8_2: ; %memmove_fwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3e +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen +; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: s_waitcnt vmcnt(62) +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 +; CHECK-NEXT: s_waitcnt vmcnt(61) +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 +; CHECK-NEXT: s_waitcnt vmcnt(60) +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(59) +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 +; CHECK-NEXT: s_waitcnt vmcnt(58) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 +; CHECK-NEXT: s_waitcnt vmcnt(57) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 +; CHECK-NEXT: s_waitcnt vmcnt(56) +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(55) +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:220 +; CHECK-NEXT: s_waitcnt vmcnt(54) +; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; CHECK-NEXT: s_waitcnt vmcnt(53) +; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:212 +; CHECK-NEXT: s_waitcnt vmcnt(52) +; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(51) +; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; CHECK-NEXT: s_waitcnt vmcnt(50) +; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 +; CHECK-NEXT: s_waitcnt vmcnt(49) +; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:196 +; CHECK-NEXT: s_waitcnt vmcnt(48) +; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(47) +; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:188 +; CHECK-NEXT: s_waitcnt vmcnt(46) +; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 +; CHECK-NEXT: s_waitcnt vmcnt(45) +; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:180 +; CHECK-NEXT: s_waitcnt vmcnt(44) +; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(43) +; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:172 +; CHECK-NEXT: s_waitcnt vmcnt(42) +; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 +; CHECK-NEXT: s_waitcnt vmcnt(41) +; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:164 +; CHECK-NEXT: s_waitcnt vmcnt(40) +; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(39) +; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:156 +; CHECK-NEXT: s_waitcnt vmcnt(38) +; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 +; CHECK-NEXT: s_waitcnt vmcnt(37) +; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:148 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:116 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:104 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:88 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:84 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:76 +; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:68 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:56 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:52 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen +; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 +; CHECK-NEXT: .LBB8_3: ; %Flow18 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 +; CHECK-NEXT: s_cbranch_execz .LBB8_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 +; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3e +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen +; CHECK-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(62) +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 +; CHECK-NEXT: s_waitcnt vmcnt(61) +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 +; CHECK-NEXT: s_waitcnt vmcnt(60) +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(59) +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 +; CHECK-NEXT: s_waitcnt vmcnt(58) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 +; CHECK-NEXT: s_waitcnt vmcnt(57) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 +; CHECK-NEXT: s_waitcnt vmcnt(56) +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(55) +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:220 +; CHECK-NEXT: s_waitcnt vmcnt(54) +; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; CHECK-NEXT: s_waitcnt vmcnt(53) +; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:212 +; CHECK-NEXT: s_waitcnt vmcnt(52) +; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(51) +; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; CHECK-NEXT: s_waitcnt vmcnt(50) +; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 +; CHECK-NEXT: s_waitcnt vmcnt(49) +; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:196 +; CHECK-NEXT: s_waitcnt vmcnt(48) +; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(47) +; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:188 +; CHECK-NEXT: s_waitcnt vmcnt(46) +; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 +; CHECK-NEXT: s_waitcnt vmcnt(45) +; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:180 +; CHECK-NEXT: s_waitcnt vmcnt(44) +; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(43) +; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:172 +; CHECK-NEXT: s_waitcnt vmcnt(42) +; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 +; CHECK-NEXT: s_waitcnt vmcnt(41) +; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:164 +; CHECK-NEXT: s_waitcnt vmcnt(40) +; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(39) +; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:156 +; CHECK-NEXT: s_waitcnt vmcnt(38) +; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 +; CHECK-NEXT: s_waitcnt vmcnt(37) +; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:148 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:116 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:104 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:88 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:84 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:76 +; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:68 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(15) +; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(14) +; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:56 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:52 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen +; CHECK-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB8_5 +; CHECK-NEXT: .LBB8_6: ; %Flow19 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memmove_p5_p5_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b32 s4, exec_lo +; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v1, v0 +; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 +; ALIGNED-NEXT: s_cbranch_execz .LBB8_3 +; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 +; ALIGNED-NEXT: .LBB8_2: ; %memmove_fwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen +; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; ALIGNED-NEXT: s_cbranch_scc1 .LBB8_2 +; ALIGNED-NEXT: .LBB8_3: ; %Flow18 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 +; ALIGNED-NEXT: s_cbranch_execz .LBB8_6 +; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 +; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 +; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: s_clause 0xa +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x34 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen +; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0 +; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5 +; ALIGNED-NEXT: .LBB8_6: ; %Flow19 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v124, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v123, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v122, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v121, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v120, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v111, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v110, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v109, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v108, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v107, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v106, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v105, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v104, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v95, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v94, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v93, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v92, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v91, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v90, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v89, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v88, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v79, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_load_dword v78, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v77, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v76, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v75, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_load_dword v74, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v73, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memmove_p5_p5_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, exec_lo +; UNROLL3-NEXT: v_cmpx_ge_u32_e64 v1, v0 +; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 +; UNROLL3-NEXT: s_cbranch_execz .LBB8_4 +; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; UNROLL3-NEXT: v_mov_b32_e32 v2, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 +; UNROLL3-NEXT: .LBB8_2: ; %memmove_fwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_clause 0xb +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen +; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 48, v2 +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(11) +; UNROLL3-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: s_waitcnt vmcnt(10) +; UNROLL3-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: s_waitcnt vmcnt(9) +; UNROLL3-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: s_waitcnt vmcnt(8) +; UNROLL3-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: s_waitcnt vmcnt(7) +; UNROLL3-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: s_waitcnt vmcnt(6) +; UNROLL3-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: s_waitcnt vmcnt(5) +; UNROLL3-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; UNROLL3-NEXT: s_cbranch_scc1 .LBB8_2 +; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: ; implicit-def: $vgpr1 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB8_4: ; %Flow16 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 +; UNROLL3-NEXT: s_cbranch_execz .LBB8_7 +; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0 +; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: .LBB8_6: ; %memmove_bwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_clause 0xb +; UNROLL3-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen +; UNROLL3-NEXT: v_subrev_nc_u32_e32 v1, 48, v1 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(11) +; UNROLL3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: s_waitcnt vmcnt(10) +; UNROLL3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: s_waitcnt vmcnt(9) +; UNROLL3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: s_waitcnt vmcnt(8) +; UNROLL3-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: s_waitcnt vmcnt(7) +; UNROLL3-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: s_waitcnt vmcnt(6) +; UNROLL3-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: s_waitcnt vmcnt(5) +; UNROLL3-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(3) +; UNROLL3-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: s_waitcnt vmcnt(2) +; UNROLL3-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: s_waitcnt vmcnt(1) +; UNROLL3-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2 +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 +; UNROLL3-NEXT: s_cbranch_scc0 .LBB8_6 +; UNROLL3-NEXT: .LBB8_7: ; %Flow17 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + +define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) { +; CHECK-LABEL: memmove_p0_p5_sz2048: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo +; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v3 +; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 +; CHECK-NEXT: s_cbranch_execz .LBB9_2 +; CHECK-NEXT: .LBB9_1: ; %memmove_fwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3e +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 +; CHECK-NEXT: .LBB9_2: ; %Flow10 +; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: s_cbranch_execz .LBB9_5 +; CHECK-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x700, v2 +; CHECK-NEXT: s_movk_i32 s6, 0xff00 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 +; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: .LBB9_4: ; %memmove_bwd_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3e +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: s_waitcnt vmcnt(41) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(37) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB9_4 +; CHECK-NEXT: .LBB9_5: ; %Flow11 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memmove_p0_p5_sz2048: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b32 s6, exec_lo +; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 +; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 +; ALIGNED-NEXT: s_cbranch_execz .LBB9_2 +; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v39, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v49 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v124, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v111 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v104, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v92, 8, v93 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v95, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v59 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v56 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v45, 8, v47 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v41 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v44 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v117, 8, v119 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v116, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v115 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v112 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v100, 8, v102 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v97 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v98 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v99, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v80 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v81, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v68 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v53, 8, v66 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v67 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v52 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v39 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v34, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v32 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x17 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(27) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: s_waitcnt vmcnt(25) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v26 +; ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v13, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v91, v9, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v94, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v78, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v86, v77, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v71, v91, 16, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v6, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v7, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v91, 16, v77 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v0, 8, v91 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v123, 8, v106 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v3, 8, v125 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v3, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v91, v91, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v125, 8, v1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v91, v126, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 +; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v77 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v127, v91, 8, v106 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v4, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 +; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 +; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 +; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 +; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 +; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 +; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 +; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 +; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 +; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 +; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 +; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 +; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 +; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 +; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 +; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:235 +; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:233 +; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:239 +; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:237 +; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:238 +; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:236 +; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:226 +; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:227 +; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:225 +; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:231 +; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:229 +; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:230 +; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:228 +; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:224 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213 +; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:215 +; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:209 +; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211 +; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:210 +; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:214 +; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:212 +; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218 +; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:219 +; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:217 +; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:223 +; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221 +; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:222 +; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:220 +; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:216 +; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:208 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:202 +; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:203 +; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:201 +; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:207 +; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:205 +; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:206 +; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:204 +; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:200 +; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:194 +; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:195 +; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:193 +; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:199 +; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:197 +; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:198 +; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:192 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:186 +; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:187 +; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:185 +; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:191 +; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:189 +; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:190 +; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:188 +; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:178 +; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:179 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:177 +; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:183 +; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:181 +; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:182 +; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:180 +; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:170 +; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:171 +; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:169 +; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:175 +; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:173 +; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:174 +; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:172 +; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:168 +; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:162 +; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 +; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:161 +; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 +; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:165 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:164 +; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:159 +; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:157 +; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:158 +; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:156 +; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:152 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:146 +; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:147 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:145 +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:151 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:149 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:150 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 +; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 +; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:17 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 +; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:9 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 +; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1 +; ALIGNED-NEXT: .LBB9_2: ; %Flow10 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: s_cbranch_execz .LBB9_5 +; ALIGNED-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader +; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0x700, v2 +; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 +; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v2, v9, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) +; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(29) +; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(27) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(25) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(18) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v106, v4, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v122 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v94, 8, v105 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v73, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v88, v4, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v77 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v59 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v45, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v44, 8, v46 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v42 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v116, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v114, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v98, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v98, 8, v100 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v97, 8, v99 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v96 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v81, 8, v83 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v80, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v69, 8, v70 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v68 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v54, 8, v67 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v53, 8, v66 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v51, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v38 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v35 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v31, 8, v34 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v28, 8, v32 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x17 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: v_lshl_or_b32 v123, v3, 16, v2 +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen +; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v25, 8, v27 +; ALIGNED-NEXT: s_waitcnt vmcnt(21) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v43, v12, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: v_lshl_or_b32 v57, v8, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v104, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v21, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v101, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 +; ALIGNED-NEXT: v_lshl_or_b32 v84, v43, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v43, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v57, 16, v43 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v43, v5, 8, v6 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v57, v7, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v57, 16, v43 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: v_lshl_or_b32 v43, v43, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v57, v57, 8, v127 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 +; ALIGNED-NEXT: v_lshl_or_b32 v43, v90, 8, v78 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v57, v124, 8, v91 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v43, v107, 8, v121 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v110 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v43, v91, 8, v95 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v57, v93, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 +; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v43 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v126, v57, 8, v78 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v3, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[2:3], v1 offset:250 +; ALIGNED-NEXT: flat_store_byte v[2:3], v7 offset:251 +; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249 +; ALIGNED-NEXT: flat_store_byte v[2:3], v8 offset:255 +; ALIGNED-NEXT: flat_store_byte v[2:3], v9 offset:253 +; ALIGNED-NEXT: flat_store_byte v[2:3], v10 offset:254 +; ALIGNED-NEXT: flat_store_byte v[2:3], v11 offset:252 +; ALIGNED-NEXT: flat_store_byte v[2:3], v6 offset:248 +; ALIGNED-NEXT: flat_store_byte v[2:3], v13 offset:242 +; ALIGNED-NEXT: flat_store_byte v[2:3], v14 offset:243 +; ALIGNED-NEXT: flat_store_byte v[2:3], v17 offset:241 +; ALIGNED-NEXT: flat_store_byte v[2:3], v12 offset:247 +; ALIGNED-NEXT: flat_store_byte v[2:3], v15 offset:245 +; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 +; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 +; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte v[2:3], v20 offset:234 +; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 +; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:233 +; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:239 +; ALIGNED-NEXT: flat_store_byte v[2:3], v25 offset:237 +; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:238 +; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:236 +; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:232 +; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:226 +; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:227 +; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:225 +; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:231 +; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:229 +; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:230 +; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:228 +; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:224 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:213 +; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:215 +; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:209 +; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:211 +; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:210 +; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:214 +; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[2:3], v49 offset:218 +; ALIGNED-NEXT: flat_store_byte v[2:3], v48 offset:219 +; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:217 +; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:223 +; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:221 +; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:222 +; ALIGNED-NEXT: flat_store_byte v[2:3], v64 offset:220 +; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:216 +; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:208 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:202 +; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:203 +; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:201 +; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:207 +; ALIGNED-NEXT: flat_store_byte v[2:3], v81 offset:205 +; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:206 +; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:204 +; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:194 +; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:195 +; ALIGNED-NEXT: flat_store_byte v[2:3], v98 offset:193 +; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:199 +; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:197 +; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:198 +; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:196 +; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:192 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:186 +; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:187 +; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:185 +; ALIGNED-NEXT: flat_store_byte v[2:3], v114 offset:191 +; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:189 +; ALIGNED-NEXT: flat_store_byte v[2:3], v116 offset:190 +; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:188 +; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:184 +; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:178 +; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:179 +; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:177 +; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:183 +; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:181 +; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:182 +; ALIGNED-NEXT: flat_store_byte v[2:3], v45 offset:180 +; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:170 +; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:171 +; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:169 +; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:175 +; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:173 +; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:174 +; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:172 +; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:168 +; ALIGNED-NEXT: flat_store_byte v[2:3], v73 offset:162 +; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:163 +; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:161 +; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:167 +; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:165 +; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:166 +; ALIGNED-NEXT: flat_store_byte v[2:3], v88 offset:164 +; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:160 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v106 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:159 +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:157 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:158 +; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:156 +; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:152 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 +; ALIGNED-NEXT: flat_store_byte v[2:3], v125 offset:151 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:138 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:139 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:137 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:141 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:140 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:136 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:130 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:131 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:129 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:135 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:133 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:134 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:132 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:128 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:122 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:123 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:121 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:126 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:124 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:120 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:114 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:115 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:113 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:119 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:117 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:118 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:116 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:112 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:106 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:107 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:105 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:111 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:109 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:110 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:108 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:104 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:98 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:99 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:97 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:103 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:101 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:102 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:100 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:96 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:90 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:91 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:89 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:95 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:93 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:94 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:92 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:88 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:82 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:83 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:81 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:87 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:85 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:86 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:84 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:80 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:74 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:75 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:73 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:79 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:77 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:78 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:76 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:72 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:66 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:67 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:65 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:71 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:69 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:70 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:68 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:64 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:58 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:59 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:57 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:63 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:62 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:56 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:50 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:51 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:49 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:55 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:54 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:52 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:48 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:42 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:47 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:45 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:34 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:38 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:640 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:27 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:25 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:31 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:29 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:30 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:28 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 +; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 +; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:17 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:21 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:22 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:20 +; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 +; ALIGNED-NEXT: flat_store_byte v[2:3], v90 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:9 +; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:15 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:14 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:12 +; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:8 +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 +; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:7 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 +; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 +; ALIGNED-NEXT: .LBB9_5: ; %Flow11 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v124, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v123, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v122, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v121, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v120, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v111, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v110, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v109, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v108, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v107, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v106, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v105, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v104, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v95, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_load_dword v94, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v93, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v92, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v91, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v90, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v89, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v88, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v79, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_load_dword v78, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v77, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v76, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v75, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_load_dword v74, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v73, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memmove_p0_p5_sz2048: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: s_mov_b32 s6, exec_lo +; UNROLL3-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo +; UNROLL3-NEXT: v_cmpx_ge_u32_e64 v2, v3 +; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s6 +; UNROLL3-NEXT: s_cbranch_execz .LBB9_4 +; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; UNROLL3-NEXT: v_mov_b32_e32 v3, v2 +; UNROLL3-NEXT: s_inst_prefetch 0x1 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB9_2: ; %memmove_fwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_clause 0xb +; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v8, v3, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v13, v3, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v14, v3, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v15, v3, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: s_cbranch_scc1 .LBB9_2 +; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual +; UNROLL3-NEXT: s_inst_prefetch 0x2 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2016 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: ; implicit-def: $vgpr2 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 +; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; UNROLL3-NEXT: .LBB9_4: ; %Flow8 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: s_cbranch_execz .LBB9_7 +; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2032 +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044 +; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 +; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 +; UNROLL3-NEXT: s_clause 0x3 +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v2 +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2016 +; UNROLL3-NEXT: s_inst_prefetch 0x1 +; UNROLL3-NEXT: .p2align 6 +; UNROLL3-NEXT: .LBB9_6: ; %memmove_bwd_loop +; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 +; UNROLL3-NEXT: s_clause 0xb +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: v_add_co_u32 v15, vcc_lo, v0, s4 +; UNROLL3-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s5, v1, vcc_lo +; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2 +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[7:10] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; UNROLL3-NEXT: s_waitcnt vmcnt(0) +; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[11:14] offset:32 +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: s_cbranch_scc0 .LBB9_6 +; UNROLL3-NEXT: .LBB9_7: ; %Flow9 +; UNROLL3-NEXT: s_inst_prefetch 0x2 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false) + ret void +} + + +declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2 + +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }