diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c0920e3e71bee..733cea8ac69ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -159,10 +159,17 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
 }
 
 void AMDGPUAsmPrinter::emitFunctionBodyStart() {
-  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
   const Function &F = MF->getFunction();
 
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+  if (MAI->hasFunctionAlignment()) {
+    Align Alignment = MF->getAlignment();
+    MFI.Alignment = Alignment.value();
+    MFI.Offset = 0;
+  }
+
   // TODO: We're checking this late, would be nice to check it earlier.
   if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
     reportFatalUsageError(
@@ -298,6 +305,18 @@ void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
       HexLines.emplace_back("");
     }
   AsmPrinter::emitBasicBlockStart(MBB);
+
+  Align Alignment = MBB.getAlignment();
+  if (Alignment != Align(1)) {
+    const MachineFunction *MF = MBB.getParent();
+    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+        MF->getInfo<SIMachineFunctionInfo>());
+    unsigned BlockAlignment = Alignment.value();
+    // Do not decrease known Alignment. Increment Offset to satisfy
+    // BlockAlignment.
+    MFI->Alignment = std::max(MFI->Alignment, BlockAlignment);
+    MFI->Offset += (BlockAlignment - (MFI->Offset % BlockAlignment));
+  }
 }
 
 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
@@ -640,6 +659,12 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
   return KernelDescriptor;
 }
 
+cl::opt<bool> PreventHalfCacheLineStraddling(
+    "amdgpu-prevent-half-cache-line-straddling", cl::Hidden,
+    cl::desc(
+        "Add NOPs to prevent instructions from straddling half a cache-line"),
+    cl::init(false));
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   // Init target streamer lazily on the first function so that previous passes
   // can set metadata.
@@ -654,7 +679,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   // The starting address of all shader programs must be 256 bytes aligned.
   // Regular functions just need the basic required instruction alignment.
-  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
+  // However, if PreventHalfCacheLineStraddling is enabled, align regular
+  // functions to half a cache-line (64 / 2 = 32 bytes).
+  MF.setAlignment(MFI->isEntryFunction()           ? Align(256)
+                  : PreventHalfCacheLineStraddling ? Align(32)
+                                                   : Align(4));
 
   SetupMachineFunction(MF);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2dec16de940d1..6d8c7b72b0e2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
   OS.emitRawComment(" transferring at most " + TransferredRegs);
 }
 
+extern cl::opt<bool> PreventHalfCacheLineStraddling;
+
+static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
+                                     const GCNSubtarget &STI,
+                                     MCContext &OutContext) {
+  SmallVector<MCFixup, 4> Fixups;
+  SmallVector<char, 16> CodeBytes;
+
+  std::unique_ptr<MCCodeEmitter> InstEmitter(
+      createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
+  InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
+  return CodeBytes.size();
+}
+
 void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
   // FIXME: Enable feature predicate checks once all the test pass.
   // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
   //                                        getSubtargetInfo().getFeatureBits());
 
+  auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
+                                           const MachineFunction *MF,
+                                           const MCInst &LoweredMCI) -> void {
+    const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
+    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+        MF->getInfo<SIMachineFunctionInfo>());
+
+    unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
+
+    // getInstSizeInBytes conservatively overestimates the size of branches
+    // due to a NOP added for the 0x3f offset bug. Any inaccuracy in
+    // instruction sizes causes problems when avoiding straddling half
+    // cache-line boundaries. A NOP is usually not added, so remove the +4.
+    if (MI->isBranch() && STI.hasOffset3fBug())
+      InstSizeInBytes -= 4;
+    // Rarely, some MachineInstrs do not have accurate instruction sizes. Try
+    // to calculate the size from the lowered MCInst.
+    else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
+             !(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
+               MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
+      InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);
+
+    // FIXME: Workaround bug in V_MADMK_F32 size.
+    if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
+      InstSizeInBytes = 8;
+
+    unsigned Alignment = MFI->Alignment;
+    unsigned Offset = MFI->Offset;
+    constexpr unsigned HalfCacheLineBoundary = 32;
+
+    unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
+    Offset %= Boundary;
+
+    if (Offset + InstSizeInBytes > Boundary) {
+      emitAlignment(Align(HalfCacheLineBoundary));
+      // Do not decrease known Alignment. Increment Offset to satisfy
+      // HalfCacheLineBoundary.
+      MFI->Alignment = std::max(Alignment, HalfCacheLineBoundary);
+      MFI->Offset +=
+          (HalfCacheLineBoundary - (MFI->Offset % HalfCacheLineBoundary));
+    }
+    MFI->Offset += InstSizeInBytes;
+  };
+
   if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    if (PreventHalfCacheLineStraddling)
+      AvoidHalfCacheLineBoundary(MI, MF, OutInst);
     EmitToStreamer(*OutStreamer, OutInst);
     return;
   }
@@ -370,6 +430,8 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
+    if (PreventHalfCacheLineStraddling)
+      AvoidHalfCacheLineBoundary(MI, MF, TmpInst);
     EmitToStreamer(*OutStreamer, TmpInst);
 
 #ifdef EXPENSIVE_CHECKS
@@ -382,16 +444,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
     //
     // We also overestimate branch sizes with the offset bug.
     if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
-        (!STI.hasOffset3fBug() || !MI->isBranch())) {
-      SmallVector<MCFixup, 4> Fixups;
-      SmallVector<char, 16> CodeBytes;
-
-      std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
-          *STI.getInstrInfo(), OutContext));
-      InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);
-
-      assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
-    }
+        (!STI.hasOffset3fBug() || !MI->isBranch()))
+      assert(getMCInstSizeInBytes(TmpInst, STI, OutContext) ==
+             STI.getInstrInfo()->getInstSizeInBytes(*MI));
 #endif
 
     if (DumpCodeInstEmitter) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60adb8d07..d3def4ce51319 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -525,6 +525,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;
 
 public:
+  // Current known instruction alignment and offset in bytes.
+  // Used to prevent instructions from straddling half cache-line boundaries
+  // for performance.
+  unsigned Alignment = 1;
+  unsigned Offset = 0;
+
   struct VGPRSpillToAGPR {
     SmallVector<MCPhysReg, 32> Lanes;
     bool FullyAllocated = false;
diff --git a/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py b/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
new file mode 100755
index 0000000000000..19d6e8d3e86c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+
+if len(sys.argv) != 2:
+    print("Usage: has_cache_straddle.py <disassembly file>")
+    sys.exit(1)
+
+inputFilename = sys.argv[1]
+address_and_encoding_regex = r"// (\S{12}):(( [0-9A-F]{8})+)"
+
+file = open(inputFilename)
+
+for line in file:
+    match = re.search(address_and_encoding_regex, line)
+    if match:
+        hexaddress = match.group(1)
+        encoding = match.group(2)
+        dwords = encoding.split()
+        address = int(hexaddress, 16)
+        address_end = address + len(dwords) * 4 - 1
+        # Cache-line is 64 bytes. Check for half cache-line straddle.
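+        # Worked example (illustrative comment, not part of the original
+        # patch): a 12-byte instruction starting at 0x5C ends at 0x67;
+        # 0x5C//32 == 2 but 0x67//32 == 3, so it crosses the half cache-line
+        # (32-byte) boundary at 0x60 and is reported by the check below.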
+ if address//32 != address_end//32: + print("Straddling instruction found at:") + print(line) + sys.exit(1) + +sys.exit(0) diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle.ll b/llvm/test/CodeGen/AMDGPU/no_straddle.ll new file mode 100644 index 0000000000000..b5010a03a60cb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle.ll @@ -0,0 +1,153 @@ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load <2 x i32>, ptr addrspace(1) %in0 + %b = load <2 x i32>, ptr addrspace(1) %in1 + %result = xor <2 x i32> %a, %b + store <2 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load <4 x i32>, ptr addrspace(1) %in0 + %b = load <4 x i32>, ptr addrspace(1) %in1 + %result = xor <4 x i32> %a, %b + store <4 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load float, ptr addrspace(1) %in0 + %b = load float, ptr addrspace(1) %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 1.000000e+00 + %xor = xor i1 %acmp, %bcmp + %result = select i1 %xor, float %a, float %b + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load volatile i1, ptr addrspace(1) %in0 + %b = load volatile i1, ptr addrspace(1) %in1 + %xor = xor i1 %a, %b + store i1 %xor, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load i32, ptr addrspace(1) %in0 + %b = load i32, ptr addrspace(1) %in1 + %result = xor i32 %a, %b + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { + %result = xor i32 %a, %b + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { + %result = xor i32 %a, -1 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load i32, ptr addrspace(1) %in0 + %b = load i32, ptr addrspace(1) %in1 + %result = xor i32 %a, -1 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load i64, ptr addrspace(1) %in0 + %b = load i64, ptr addrspace(1) %in1 + %result = xor i64 %a, %b + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { + %result = xor i64 %a, %b + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { + %result = xor i64 %a, -1 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { + %a = load i64, ptr addrspace(1) %in0 + %b = load i64, ptr addrspace(1) %in1 + 
%result = xor i64 %a, -1 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = xor i64 %a, %b + br label %endif + +else: + %2 = load i64, ptr addrspace(1) %in + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { + %or = xor i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { + %or = xor i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + + %foo = add i64 %b, 4261135838621753 + store volatile i64 %foo, ptr addrspace(1) poison + ret void +} + +define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { + %or = xor i64 %a, 63 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { + %or = xor i64 %a, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = xor i64 %loada, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = xor i64 %loada, 22470723082367 + store i64 %or, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll new file mode 100644 index 0000000000000..d4096e2be2b54 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll @@ -0,0 +1,2024 @@ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=tonga -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=tonga -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx900 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx900 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1100 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1100 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 
= bitcast <32 x i32> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + 
%a2 = bitcast <32 x i32> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 
%cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> 
@bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <32 x float> @bitcast_v16i64_to_v32f32(<16 x i64> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v16i64_to_v32f32_scalar(<16 x i64> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x double> + br label %end + +end: + %phi = 
phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + 
+cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x float> + br label %end + 
+end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v16i64_to_v16f64_scalar(<16 x i64> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat 
(double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 
%cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x 
i16> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 
x bfloat> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, 
i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, 
%cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast 
<64 x bfloat> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i32 inreg %b) { + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, 
label %cmp.false
+
+cmp.true:
+ %a1 = fadd <64 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <64 x half> %a1 to <64 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <64 x half> %a to <64 x i16>
+ br label %end
+
+end:
+ %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x i16> %phi
+}
+
+define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <64 x i16> %a, splat (i16 3)
+ %a2 = bitcast <64 x i16> %a1 to <64 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <64 x i16> %a to <64 x half>
+ br label %end
+
+end:
+ %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x half> %phi
+}
+
+define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i32 inreg %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <64 x i16> %a, splat (i16 3)
+ %a2 = bitcast <64 x i16> %a1 to <64 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <64 x i16> %a to <64 x half>
+ br label %end
+
+end:
+ %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x half> %phi
+}
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_fcanonicalize.f16.ll
new file mode 100644
index 0000000000000..c1a1ceb129625
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_fcanonicalize.f16.ll
@@ -0,0 +1,452 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1100 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+declare half @llvm.fabs.f16(half) #0
+declare half @llvm.canonicalize.f16(half) #0
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
+declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
+declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
+declare <6 x half> @llvm.canonicalize.v6f16(<6 x half>) #0
+declare <8 x half> @llvm.canonicalize.v8f16(<8 x half>) #0
+declare <12 x half> @llvm.canonicalize.v12f16(<12 x half>) #0
+declare <16 x half> @llvm.canonicalize.v16f16(<16 x half>) #0
+declare <32 x half> @llvm.canonicalize.v32f16(<32 x half>) #0
+declare <64 x half> @llvm.canonicalize.v64f16(<64 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 {
+ %canonicalized = call half @llvm.canonicalize.f16(half undef)
+ store half %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 {
+ %val = load half, ptr addrspace(1) %out
+ %canonicalized = call half @llvm.canonicalize.f16(half %val)
+ store half %canonicalized, ptr addrspace(1) poison
+ ret void
+}
+
+define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
+ %val = bitcast i16 %val.arg to half
+ %canonicalized = call half @llvm.canonicalize.f16(half %val)
+ store half %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
+define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
+ %ins0 = insertelement <2 x half> poison, half %lo, i32 0
+ %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
+ ret <2 x half> %canonicalized
+}
+
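The RUN lines in these no_straddle tests pipe the llvm-objdump disassembly into %p/has_cache_straddle.py, which is referenced by the tests but not included in this part of the patch. The sketch below is a hypothetical stand-in for that checker, not the actual script: it assumes the AMDGPU llvm-objdump output form in which each disassembled instruction carries a trailing "// <hex address>: <32-bit encoding words>" comment, that the script receives the .dis path as its only argument, and that it reports failure through a non-zero exit code. The core predicate is addr % 32 + size > 32, i.e. the encoding starts in one 32-byte half cache line and ends in the next.

#!/usr/bin/env python3
"""Hypothetical sketch of the straddle checker (the real has_cache_straddle.py
is not shown in this hunk). Scans an llvm-objdump listing and fails if any
instruction encoding crosses a 32-byte (half cache-line) boundary."""
import re
import sys

HALF_CACHE_LINE = 32  # bytes; half of a 64-byte cache line

# Assumed AMDGPU disassembly form, e.g.:
#   v_mov_b32_e32 v1, 0    // 000000000108: 7E020280
ENCODING_RE = re.compile(r"//\s*([0-9A-Fa-f]+):((?:\s+[0-9A-Fa-f]{8})+)\s*$")

def find_straddles(lines):
    """Yield (address, size, text) for encodings that cross a 32-byte boundary."""
    for line in lines:
        m = ENCODING_RE.search(line)
        if not m:
            continue  # label, section header, or blank line
        addr = int(m.group(1), 16)
        size = 4 * len(m.group(2).split())  # each encoding word is 4 bytes
        if addr % HALF_CACHE_LINE + size > HALF_CACHE_LINE:
            yield addr, size, line.strip()

def main():
    with open(sys.argv[1]) as dis:
        straddles = list(find_straddles(dis))
    for addr, size, text in straddles:
        print(f"straddles half cache line at {addr:#x} (size {size}): {text}")
    return 1 if straddles else 0

if __name__ == "__main__":
    sys.exit(main())

With that predicate, an 8-byte encoding at offset 28 is flagged (bytes 28..35 cross byte 32), while the same encoding at offset 24 or 32 is not.
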
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 { + %val = load half, ptr addrspace(1) %out + %val.fabs = call half @llvm.fabs.f16(half %val) + %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 { + %val = load half, ptr addrspace(1) %out + %val.fabs = call half @llvm.fabs.f16(half %val) + %val.fabs.fneg = fneg half %val.fabs + %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 { + %val = load half, ptr addrspace(1) %out + %val.fneg = fneg half %val + %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 { + %val = load half, ptr addrspace(1) %out + %val.fneg = fneg half %val + %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 { + %val = load half, ptr addrspace(1) %out + %val.fabs = call half @llvm.fabs.f16(half %val) + %val.fabs.fneg = fneg half %val.fabs + %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0.0) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half -0.0) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 1.0) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half -1.0) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 16.0) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr 
addrspace(1) %out) #3 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) + store half %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x half>, ptr addrspace(1) %gep + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x half>, ptr addrspace(1) %gep + %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x half>, ptr addrspace(1) %gep + %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %val.fabs.fneg = fneg <2 x half> %val.fabs + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x half>, ptr addrspace(1) %gep + %fneg.val = fneg <2 x half> %val + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) 
+ store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { + %val = bitcast i32 %val.arg to <2 x half> + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 { + %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val) + ret <3 x half> %canonicalized +} + +define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val) + ret <4 x half> %canonicalized +} + +define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) + store <2 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { + %vec = insertelement <2 x half> poison, half %val, i32 0 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { + %vec = insertelement <2 x half> poison, half %val, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 1.0, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 1.0, i32 0 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 16.0, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 16.0, i32 0 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 { + %vec0 = insertelement <2 x half> poison, half %val, i32 0 + %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) + ret <2 x half> %canonicalized +} + +define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { + %vec0 = insertelement <2 x half> poison, half 2.0, i32 0 + %vec1 = insertelement <2 x half> %vec0, half %val, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) + ret <2 x half> %canonicalized +} + +define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 { + %canonicalized = call <4 x half> 
@llvm.canonicalize.v4f16(<4 x half> undef) + store <4 x half> %canonicalized, ptr addrspace(1) %out + ret void +} + +define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 { + %vec = insertelement <4 x half> poison, half %val, i32 0 + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec) + ret <4 x half> %canonicalized +} + +define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 { + %vec0 = insertelement <4 x half> poison, half %val0, i32 0 + %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1 + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1) + ret <4 x half> %canonicalized +} + +define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 { + %vec0 = insertelement <4 x half> poison, half %val0, i32 0 + %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2 + %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3 + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2) + ret <4 x half> %canonicalized +} + +define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { + %canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val) + ret <6 x half> %canonicalized +} + +define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { + %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val) + ret <8 x half> %canonicalized +} + +define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { + %canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val) + ret <12 x half> %canonicalized +} + +define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { + %canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val) + ret <16 x half> %canonicalized +} + +define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { + %canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val) + ret <32 x half> %canonicalized +} + +define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { + %canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val) + ret <64 x half> %canonicalized +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_global_store_short_saddr_t16.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_global_store_short_saddr_t16.ll new file mode 100644 index 0000000000000..bfc3f490ad7c4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle_global_store_short_saddr_t16.ll @@ -0,0 +1,19 @@ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1100 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +declare half @llvm.canonicalize.f16(half) #0 + +define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out, half %value) #1 { + %canonA = call half @llvm.canonicalize.f16(half %value) + %canonB = call half @llvm.canonicalize.f16(half undef) + store half %canonA, ptr addrspace(1) %out + %out2 = getelementptr half, ptr addrspace(1) %out, i64 10 + store half %canonB, ptr addrspace(1) %out2 + %out3 = getelementptr half, ptr 
addrspace(1) %out, i64 3333158 + store half %canonB, ptr addrspace(1) %out3 + ret void +} + + +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } + diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll new file mode 100644 index 0000000000000..aff9464bfc30b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -0,0 +1,665 @@ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn-amd-amdhsa --mcpu=gfx950 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 1, i32 %scale0, i32 1, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 2, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 3, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 3, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 
%scale0, i32 3, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 2, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, 
i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> 
%result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + 
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + ret <4 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + 
store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + ret <4 x float> %result +} + + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, 
i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <4 x float> %result +} + +define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+                    i32 4, ; cbsz
+                    i32 4, ; blgp
+                    i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
+
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
new file mode 100644
index 0000000000000..1e7153cabceb7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -0,0 +1,692 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn-amd-amdhsa --mcpu=gfx950 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                    i32 0, ; cbsz
+                    i32 0, ; blgp
+                    i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                    i32 0, ; cbsz
+                    i32 0, ; blgp
+                    i32 1, i32 %scale0, i32 1, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float>
@test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 2, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 3, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 3, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 2, i32 %scale0, i32 3, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 3, i32 %scale0, i32 2, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret 
<16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 
%scale1) + ret <16 x float> %result +} + + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 1, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) 
+ ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, 
i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 3, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, 
i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 1, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 3, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 
0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 4, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 
x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + ret <16 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) + store <16 x float> %result, ptr addrspace(1) %ptr, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) 
%ptr) #0 { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) + store <16 x float> %result, ptr addrspace(1) %ptr, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) + store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64 + store volatile <16 x float> %result, ptr addrspace(1) null, align 64 + ret void +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) + ret <16 x float> %result +} + + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 2, ; cbsz + i32 2, ; blgp + i32 0, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, + i32 0, ; cbsz + i32 4, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, + i32 4, ; cbsz + i32 0, ; blgp + i32 0, i32 %scale0, i32 0, i32 %scale1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x 
float> %arg2, i32 %scale0, i32 %scale1) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                    i32 4, ; cbsz
+                    i32 4, ; blgp
+                    i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                    i32 4, ; cbsz
+                    i32 4, ; blgp
+                    i32 0, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
+
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
+attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_memmove-var-size.ll
new file mode 100644
index 0000000000000..abde6601afd75
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_memmove-var-size.ll
@@ -0,0 +1,149 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1030 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1030 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
+entry:
+  tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
+entry:
+  tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define
void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + + +define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + + +define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 
%sz) { +entry: + tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + + +define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { +entry: + tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + + +declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p0.p1.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p0.p3.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p3.p0.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p3.p1.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p3.p3.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p3.p4.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memmove.p3.p5.i64(ptr addrspace(3) nocapture writeonly, 
ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p3.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p4.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-loop.ll
new file mode 100644
index 0000000000000..b9b1e04bbecec
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-loop.ll
@@ -0,0 +1,303 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx908 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx908 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
+  ret void
+}
+
+
+
+define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
+  ret void
+}
+
+
+
+define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) #0 {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32
%inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %init = bitcast i32 %tid to float + %tmp0 = insertelement <32 x float> poison, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 + %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 + %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 + %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 + %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 + %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 + %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 + %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 + %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 + %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 + %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 + %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 + %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 + %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 + %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 + %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 + %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 + %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 + %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 + %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 + %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 + %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 + %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 + %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 + %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 + %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 + %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 + %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 + %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 + %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 + %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float %init) #0 { +entry: + %tmp0 = insertelement <32 x float> poison, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 + %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 + %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 + %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 + %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 + %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 + %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 + %tmp8 = insertelement <32 x float> %tmp7, 
float %init, i32 8 + %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 + %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 + %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 + %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 + %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 + %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 + %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 + %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 + %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 + %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 + %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 + %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 + %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 + %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 + %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 + %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 + %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 + %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 + %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 + %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 + %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 + %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 + %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, float %x) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %init = bitcast i32 %tid to float + %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1 + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %arg) #0 { +entry: + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + 
store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + + +define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { +entry: + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) + %init = extractelement <32 x float> %mai.0, i32 0 + %tmp0 = insertelement <32 x float> poison, float %init, i32 0 + %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 + %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 + %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 + %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 + %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 + %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 + %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 + %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 + %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 + %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 + %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 + %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 + %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 + %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 + %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 + %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 + %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 + %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 + %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 + %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 + %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 + %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 + %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 + %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 + %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 + %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 + %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 + %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 + %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 + %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 + %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 + + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + + +define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) #0 { +entry: + br label %for.cond.preheader + +for.cond.preheader: + %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ] + %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ] + br label %inner.for.cond.preheader + +inner.for.cond.preheader: + %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ] + %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 
2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %inner.exit, label %inner.for.cond.preheader + +inner.exit: + %inc.0 = add nuw nsw i32 %c.0, 1 + %cc.0 = icmp eq i32 %inc.0, 16 + br i1 %cc.0, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-no-register-aliasing.ll new file mode 100644 index 0000000000000..94cc599c02aab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-no-register-aliasing.ll @@ -0,0 +1,42 @@ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx908 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx908 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) + +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <32 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.3, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <16 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <16 x float> %mai.2, <16 x float> %mai.1, <16 x i32> + %mai.3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %tmp.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.3, ptr addrspace(1) %arg + ret void +} + + +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <4 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <4 x float> %mai.1, <4 x float> %mai.2, <4 x i32> + %mai.3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %tmp.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.3, ptr addrspace(1) %arg + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-vgpr-cd-select.ll 
b/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-vgpr-cd-select.ll new file mode 100644 index 0000000000000..a474c38610032 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle_mfma-vgpr-cd-select.ll @@ -0,0 +1,122 @@ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx90a -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx90a -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32) +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) +declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) + +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <32 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <16 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <4 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <16 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <4 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <32 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <32 x 
float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> poison, <4 x half> poison, <32 x float> %in.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <16 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> poison, <4 x half> poison, <16 x float> %in.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <4 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> poison, <4 x half> poison, <4 x float> %in.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <16 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> poison, <4 x half> poison, <16 x float> %in.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <4 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> poison, <4 x half> poison, <4 x float> %in.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <32 x i32>, ptr addrspace(1) %arg + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 1, <32 x i32> %in.1, i32 0, i32 0, i32 0) + store <32 x i32> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <16 x i32>, ptr addrspace(1) %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0) + store <16 x i32> %mai.1, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { +bb: + %in.1 = load <4 x i32>, ptr addrspace(1) %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0) + store <4 x i32> %mai.1, ptr addrspace(1) %arg + ret void +} + +attributes #0 = { "amdgpu-agpr-alloc"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_saddsat.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_saddsat.ll new file mode 100644 index 0000000000000..4658fbe9db63c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle_saddsat.ll @@ -0,0 +1,61 @@ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx900 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx900 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1010 -mattr=-real-true16 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1010 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +;RUN: llc 
--amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1100 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { + %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) + ret i8 %result +} + +define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { + %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %result +} + +define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { + %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %result +} + +define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { + %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) + ret <2 x i16> %result +} + +define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { + %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) + ret <3 x i16> %result +} + +define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { + %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %cast = bitcast <4 x i16> %result to <2 x float> + ret <2 x float> %cast +} + +define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { + %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %result +} + +define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { + %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %result +} + +declare i8 @llvm.sadd.sat.i8(i8, i8) #0 +declare i16 @llvm.sadd.sat.i16(i16, i16) #0 +declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0 +declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0 +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0 +declare i32 @llvm.sadd.sat.i32(i32, i32) #0 +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0 +declare i64 @llvm.sadd.sat.i64(i64, i64) #0 diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_shufflevector.v2p3.v8p3.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_shufflevector.v2p3.v8p3.ll new file mode 100644 index 0000000000000..d014c1193590d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no_straddle_shufflevector.v2p3.v8p3.ll @@ -0,0 +1,4435 @@ +; Test --amdgpu-prevent-half-cache-line-straddling with MetaInstructions. 
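+; Note (assumed intent): the RUN lines below build each function with -mattr=dumpcode, disassemble the
+; resulting object with llvm-objdump, and feed the listing to has_cache_straddle.py, which is expected
+; to report a failure if any instruction encoding straddles a half cache-line boundary.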
+; Based on shufflevector.v2p3.v8p3.ll + +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn-amd-amdhsa --mcpu=gfx900 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn-amd-amdhsa --mcpu=gfx90a -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn-amd-amdhsa --mcpu=gfx942 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +define void @v_shuffle_v2p3_v8p3__u_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_u(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_11(ptr addrspace(1) 
inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_15(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_0(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr 
addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v2p3_v8p3__10_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_1(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf 
= shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_2(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 
x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 
8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_3(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def 
$0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_4(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr 
addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_5(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_6(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x 
ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_7(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_8(ptr addrspace(1) inreg %ptr) 
{ + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_8(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_9(ptr 
addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_9(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x 
ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) { + %vec0 = 
call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_10(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x 
i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_11(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> 
asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr 
addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_12(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + 
%vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_13(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_14(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_14(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_14(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_14(ptr addrspace(1) inreg %ptr) { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__6_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__8_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__9_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__10_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__11_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__12_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__13_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__14_14(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__u_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__0_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__1_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__2_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__3_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__4_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__5_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__6_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__7_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__8_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__9_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__10_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__11_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__12_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__13_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v2p3_v8p3__14_15(ptr addrspace(1) inreg %ptr) {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @s_shuffle_v2p3_v8p3__u_u() {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> poison
+  call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2p3_v8p3__0_u() {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32>
+  call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2p3_v8p3__1_u() {
+  %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x
ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_u() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_0() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_1() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_2() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def 
$0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_3() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v8p3__3_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v8p3__14_4() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_5() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_6() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> 
%vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_7() { 
+ %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_7() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + 
%shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_8() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_9() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_10() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_11() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> 
%vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr 
addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_12() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, 
<8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_13() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 
x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_14() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_15() { + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() 
+  %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32>
+  call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_sub.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_sub.ll
new file mode 100644
index 0000000000000..ba2000295e201
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_sub.ll
@@ -0,0 +1,131 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16,dumpcode -verify-machineinstrs --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1200 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
+
+define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+  %result = sub i32 %a, %b
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
+  %result = sub i32 1234, %a
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
+  %result = sub i32 %a, %b
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %a = load i32, ptr addrspace(1) %in
+  %result = sub i32 123, %a
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <2 x i32>, ptr addrspace(1) %in
+  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
+  %result = sub <2 x i32> %a, %b
+  store <2 x i32> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x i32>, ptr addrspace(1) %in
+  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
+  %result = sub <4 x i32> %a, %b
+  store <4 x i32> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1
+  %a = load volatile i16, ptr addrspace(1) %gep
+  %b = load volatile i16, ptr addrspace(1) %b_ptr
+  %result = sub i16 %a, %b
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1
+  %a = load <2 x i16>, ptr addrspace(1) %gep
+  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
+  %result = sub <2 x i16> %a, %b
+  store <2 x i16> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1
+  %a = load <4 x i16>, ptr addrspace(1) %gep
+  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
+  %result = sub <4 x i16> %a, %b
+  store <4 x i16> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
+  %result = sub i64 %a, %b
+  store i64 %result, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
+  %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
+  %a = load i64, ptr addrspace(1) %a_ptr
+  %b = load i64, ptr addrspace(1) %b_ptr
+  %result = sub i64 %a, %b
+  store i64 %result, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
+  %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
+  %a = load <2 x i64>, ptr addrspace(1) %a_ptr
+  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
+  %result = sub <2 x i64> %a, %b
+  store <2 x i64> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
+  %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid
+  %a = load <4 x i64>, ptr addrspace(1) %a_ptr
+  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
+  %result = sub <4 x i64> %a, %b
+  store <4 x i64> %result, ptr addrspace(1) %out
+  ret void
+}
+
+
+define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) {
+  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
+  %sub = sub i32 %v, %s
+  store i32 %sub, ptr addrspace(3) poison
+  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_wave32.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_wave32.ll
new file mode 100644
index 0000000000000..59ed7c970b127
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_wave32.ll
@@ -0,0 +1,783 @@
+; Test --amdgpu-prevent-half-cache-line-straddling with MetaInstructions.
+; Based on wave32.ll.
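[Editorial note, not part of the patch: both new tests pipe the llvm-objdump disassembly into %p/has_cache_straddle.py, whose implementation is not shown in this hunk (the RUN lines for this second test follow below). As a rough illustration of what such a checker has to do, here is a hypothetical sketch only; it assumes each decoded AMDGPU instruction line carries a trailing comment of the form "// <hex address>: <one or more 8-hex-digit encoding words>", and it reports any instruction whose bytes cross a 32-byte (half cache-line) boundary.]

```python
# Hypothetical sketch only -- not the has_cache_straddle.py shipped with this patch.
# Assumes the llvm-objdump output format described above.
import re
import sys

HALF_CACHE_LINE = 32  # bytes

# Trailing encoding comment: "// <hex address>: <8-hex-digit words>..."
ENCODING_RE = re.compile(r"//\s*([0-9A-Fa-f]+):\s*((?:[0-9A-Fa-f]{8}\s*)+)$")

def find_straddlers(path):
    straddlers = []
    with open(path) as disassembly:
        for line in disassembly:
            match = ENCODING_RE.search(line)
            if not match:
                continue  # skip labels, blank lines, section headers
            address = int(match.group(1), 16)
            size = 4 * len(match.group(2).split())  # each encoding word is 4 bytes
            # The instruction straddles a half cache-line if its first and last
            # byte fall into different 32-byte chunks.
            if address // HALF_CACHE_LINE != (address + size - 1) // HALF_CACHE_LINE:
                straddlers.append(line.rstrip())
    return straddlers

if __name__ == "__main__":
    bad = find_straddlers(sys.argv[1])
    for line in bad:
        print("straddles a half cache-line:", line)
    sys.exit(1 if bad else 0)
```

[The real script may be structured differently; the RUN lines only require that it exit non-zero when a straddling instruction is present in the dumped disassembly.]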
+ +;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mattr=+wavefrontsize32,-wavefrontsize64 -mtriple=amdgcn -mcpu=gfx1010 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1010 -d - > %t.dis +;RUN: %python %p/has_cache_straddle.py %t.dis + +define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid + %load = load i32, ptr addrspace(1) %gep, align 4 + %cmp = icmp sgt i32 %load, 0 + %sel = select i1 %cmp, i32 1, i32 2 + store i32 %sel, ptr addrspace(1) %gep, align 4 + ret void +} + +define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid + %load = load float, ptr addrspace(1) %gep, align 4 + %cmp = fcmp ugt float %load, 0.0 + %sel = select i1 %cmp, float 1.0, float 2.0 + store float %sel, ptr addrspace(1) %gep, align 4 + ret void +} + +define amdgpu_ps void @test_vopc_vcmp(float %x) { + %cmp = fcmp oge float %x, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp) + ret void +} + +define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid + %load = load <2 x half>, ptr addrspace(1) %gep, align 4 + %elt = extractelement <2 x half> %load, i32 1 + %cmp = fcmp ugt half %elt, 0.0 + %sel = select i1 %cmp, <2 x half> , <2 x half> %load + store <2 x half> %sel, ptr addrspace(1) %gep, align 4 + ret void +} + +define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) + %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 { + %cmp = fcmp oeq half %x, 0x7FF0000000000000 + %sel = select i1 %cmp, half 1.0, half %x + store half %sel, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid + %load = load float, ptr addrspace(1) %gep, align 4 + %cmp = fcmp ugt float %load, 0.0 + %cmp2 = fcmp ult float %load, 1.0 + %and = and i1 %cmp, %cmp2 + %sel = select i1 %and, float 1.0, float 2.0 + store float %sel, ptr addrspace(1) %gep, align 4 + ret void +} + +define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid + %load = load i32, ptr addrspace(1) %gep, align 4 + %cmp = icmp sgt i32 %load, 0 + %cmp2 = icmp slt i32 %load, 1 + %xor = xor i1 %cmp, %cmp2 + %sel = select i1 %xor, i32 1, i32 2 + store i32 %sel, ptr addrspace(1) %gep, align 4 + ret void +} + +define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid + %load = load i32, ptr addrspace(1) %gep, align 4 + %cmp = icmp ugt i32 %load, 3 + %cmp2 = icmp ult i32 %load, 2 + %or = or i1 %cmp, %cmp2 + %sel = select i1 %or, i32 1, i32 2 + store i32 %sel, ptr addrspace(1) %gep, align 4 + ret void +} + +define amdgpu_kernel void @test_mask_if(ptr 
addrspace(1) %arg) #0 { + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp ugt i32 %lid, 10 + br i1 %cmp, label %if, label %endif + +if: + store i32 0, ptr addrspace(1) %arg, align 4 + br label %endif + +endif: + ret void +} + +define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb2 + +bb1: + ret void + +bb2: + %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ] + %tmp4 = icmp slt i32 %tmp3, %tmp + br i1 %tmp4, label %bb5, label %bb11 + +bb5: + %tmp6 = sext i32 %tmp3 to i64 + %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6 + %tmp8 = load i32, ptr addrspace(1) %tmp7, align 4 + %tmp9 = icmp sgt i32 %tmp8, 10 + br i1 %tmp9, label %bb10, label %bb11 + +bb10: + store i32 %tmp, ptr addrspace(1) %tmp7, align 4 + br label %bb13 + +bb11: + %tmp12 = sdiv i32 %tmp3, 2 + br label %bb13 + +bb13: + %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ] + %tmp15 = add nsw i32 %tmp14, 1 + %tmp16 = icmp slt i32 %tmp14, 255 + br i1 %tmp16, label %bb2, label %bb1 +} + + + +define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = icmp eq i32 %tmp, 0 + br i1 %tmp1, label %.loopexit, label %.preheader + +.preheader: + br label %bb2 + +bb2: + %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ] + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4 + %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4 + %tmp7 = icmp sgt i32 %tmp6, 10 + br i1 %tmp7, label %bb8, label %.loopexit + +bb8: + store i32 %tmp, ptr addrspace(1) %tmp5, align 4 + %tmp9 = add nuw nsw i32 %tmp3, 1 + %tmp10 = icmp ult i32 %tmp9, 256 + %tmp11 = icmp ult i32 %tmp9, %tmp + %tmp12 = and i1 %tmp10, %tmp11 + br i1 %tmp12, label %bb2, label %.loopexit + +.loopexit: + ret void +} + +define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp + %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8 + %tmp5 = add nsw i64 %tmp4, %arg1 + store i64 %tmp5, ptr addrspace(1) %tmp3, align 8 + ret void +} + +define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp + %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8 + %tmp5 = sub nsw i64 %tmp4, %arg1 + store i64 %tmp5, ptr addrspace(1) %tmp3, align 8 + ret void +} + +define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp + %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8 + %tmp5 = sub nsw i64 %arg1, %tmp4 + store i64 %tmp5, ptr addrspace(1) %tmp3, align 8 + ret void +} + +define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { +bb: + %tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1 + %tmp1 = load i64, ptr addrspace(1) %tmp, align 8 + %tmp2 = load i64, ptr addrspace(1) %arg, align 8 + %tmp3 = udiv i64 %tmp1, %tmp2 + %tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 2 + store i64 %tmp3, ptr addrspace(1) %tmp4, align 8 + ret void +} + +define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() 
nounwind readnone + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile double, ptr addrspace(1) %gep.0, align 8 + %b = load volatile double, ptr addrspace(1) %gep.1, align 8 + + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, ptr addrspace(1) %out, align 8 + ret void +} + +define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { + %sext0 = sext i32 %arg0 to i64 + %sext1 = sext i32 %arg1 to i64 + %mul = mul i64 %sext0, %sext1 + %mad = add i64 %mul, %arg2 + ret i64 %mad +} + +define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { + %sext0 = zext i32 %arg0 to i64 + %sext1 = zext i32 %arg1 to i64 + %mul = mul i64 %sext0, %sext1 + %mad = add i64 %mul, %arg2 + ret i64 %mad +} + +define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone + store float %result, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind { + %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone + store double %result, ptr addrspace(1) %out, align 8 + ret void +} + + + +define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2 + %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1 + %gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2 + + %a = load float, ptr addrspace(1) %gep.a + %b = load float, ptr addrspace(1) %gep.b + %c = load float, ptr addrspace(1) %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + br i1 %cmp0, label %bb, label %exit + +bb: + %val = load volatile i32, ptr addrspace(1) %dummy + %cmp1 = icmp ne i32 %val, 0 + br label %exit + +exit: + %cond = phi i1 [false, %entry], [%cmp1, %bb] + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone + store float %result, ptr addrspace(1) %gep.out, align 4 + ret void +} + + +define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv float %a, %b + store float %fdiv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_br_cc_f16( + ptr addrspace(1) %r, + ptr addrspace(1) %a, + ptr addrspace(1) %b) { +entry: + %a.val = load half, ptr addrspace(1) %a + %b.val = 
load half, ptr addrspace(1) %b + %fcmp = fcmp olt half %a.val, %b.val + br i1 %fcmp, label %one, label %two + +one: + store half %a.val, ptr addrspace(1) %r + ret void + +two: + store half %b.val, ptr addrspace(1) %r + ret void +} + +define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { + %cmp0 = icmp ne i1 %val, 0 + br i1 %cmp0, label %store, label %end + +store: + store i32 222, ptr addrspace(1) %out + ret void + +end: + ret void +} + +define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 { +bb0: + %tmp = icmp sgt i32 %arg1, 4 + %undef = call i1 @llvm.amdgcn.class.f32(float poison, i32 0) + %tmp4 = select i1 %undef, float %arg, float 1.000000e+00 + %tmp5 = fcmp ogt float %arg2, 0.000000e+00 + %tmp6 = fcmp olt float %arg2, 1.000000e+00 + %tmp7 = fcmp olt float %arg, %tmp4 + %tmp8 = and i1 %tmp5, %tmp6 + %tmp9 = and i1 %tmp8, %tmp7 + br i1 %tmp9, label %bb1, label %bb2 + +bb1: + store volatile i32 0, ptr addrspace(1) poison + br label %bb2 + +bb2: + ret void +} + +define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %tmp = sub i32 %id, %arg + br label %bb1 + +bb1: ; preds = %Flow, %bb + %lsr.iv = phi i32 [ poison, %bb ], [ %tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +bb4: ; preds = %bb1 + %load = load volatile i32, ptr addrspace(1) poison, align 4 + %cmp1 = icmp sge i32 %tmp, %load + br label %Flow + +Flow: ; preds = %bb4, %bb1 + %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ poison, %bb1 ] + %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] + br i1 %tmp3, label %bb1, label %bb9 + +bb9: ; preds = %Flow + store volatile i32 7, ptr addrspace(3) poison + ret void +} + +define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) %out) #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %index = add i32 %id, -512 + %value = extractelement <4 x i32> , i32 %index + store i32 %value, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 { + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) + store i32 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) + store i64 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_kill_i1_terminator_float() #0 { + call void @llvm.amdgcn.kill(i1 false) + ret void +} + +define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 { + %c1 = icmp slt i32 %a, %b + %c2 = icmp slt i32 %c, %d + %x = or i1 %c1, %c2 + call void @llvm.amdgcn.kill(i1 %x) + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false) + ret void +} + +define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { +entry: + br label %loop + +loop: + %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ] + %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] + %cc = fcmp ogt float %ctr.iv, 7.0 + br i1 %cc, label %break, label %body + +body: + %c.iv0 = extractelement <4 x float> %c.iv, i32 0 + %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, 
float %c.iv0, <8 x i32> poison, <4 x i32> poison, i1 0, i32 0, i32 0) + %ctr.next = fadd float %ctr.iv, 2.0 + br label %loop + +break: + ret <4 x float> %c.iv +} + +define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) { +main_body: + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_wwm2(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + +define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) { +main_body: + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + + +define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 { +main_body: + %inst23 = extractelement <2 x float> %pos, i32 0 + %inst24 = extractelement <2 x float> %pos, i32 1 + %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) + %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) + %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) + %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) + ret <4 x float> %tex +} + +define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = bitcast float %out to i32 + %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) + %out.2 = bitcast i32 %out.1 to float + ret float %out.2 +} + +define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { + %temp = call float @llvm.fabs.f32(float %a) + %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) + store i64 %result, ptr addrspace(1) %out + ret 
void +} + +define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { + %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { + %temp = call float @llvm.fabs.f32(float %a) + %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { + %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wqm_vote(float %a) { + %c1 = fcmp une float %a, 0.0 + %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) + call void @llvm.amdgcn.kill(i1 %c2) + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false) + ret void +} + +define amdgpu_kernel void @test_branch_true() #2 { +entry: + br i1 true, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + br i1 poison, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define amdgpu_ps float @test_ps_live() #0 { + %live = call i1 @llvm.amdgcn.ps.live() + %live.32 = zext i1 %live to i32 + %r = bitcast i32 %live.32 to float + ret float %r +} + +define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %v = load double, ptr addrspace(1) %in + %cc = fcmp oeq double %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd double %v, %v + br label %endif + +endif: + %r = phi double [ %v, %entry ], [ %u, %if ] + store double %r, ptr addrspace(1) %out + ret void +} + +define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e, + float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 { +main_body: + %s = fadd float %a, %b + %s.1 = fadd float %s, %c + %s.2 = fadd float %s.1, %d + %s.3 = fadd float %s.2, %e + %s.4 = fadd float %s.3, %f + %s.5 = fadd float %s.4, %g + %s.6 = fadd float %s.5, %h + %s.7 = fadd float %s.6, %i + %s.8 = fadd float %s.7, %j + %s.9 = fadd float %s.8, %k + %s.10 = fadd float %s.9, %l + ret float %s.10 +} + +define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e, + float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 { +main_body: + %s = fadd float %a, %b + %s.1 = fadd float %s, %c + %s.2 = fadd float %s.1, %d + %s.3 = fadd float %s.2, %e + %s.4 = fadd float %s.3, %f + %s.5 = fadd float %s.4, %g + %s.6 = fadd float %s.5, %h + %s.7 = fadd float %s.6, %i + %s.8 = fadd float %s.7, %j + %s.9 = fadd float %s.8, %k + %s.10 = fadd float %s.9, %l + ret float %s.10 +} + +define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { +entry: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %mul4 = mul nsw i32 %s, %n + %cmp = icmp slt i32 0, %mul4 + br label %if.end + +if.end: ; preds = %entry + %rem = urem i32 %id, %s + %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32) + %shr = lshr i64 %icmp, 1 + %notmask = shl nsw i64 -1, 0 + %and = and i64 %notmask, %shr + %or = or i64 %and, -9223372036854775808 + %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) + %cast = trunc i64 %cttz to i32 + %cmp3 = icmp ugt i32 10, %cast + %cmp6 = icmp ne 
i32 %rem, 0 + %brmerge = or i1 %cmp6, %cmp3 + br i1 %brmerge, label %if.end2, label %if.then + +if.then: ; preds = %if.end + unreachable + +if.end2: ; preds = %if.end + ret void +} + +define amdgpu_kernel void @fcmp64(float %n, float %s) { +entry: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.f = uitofp i32 %id to float + %mul4 = fmul float %s, %n + %cmp = fcmp ult float 0.0, %mul4 + br label %if.end + +if.end: ; preds = %entry + %rem.f = frem float %id.f, %s + %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1) + %shr = lshr i64 %fcmp, 1 + %notmask = shl nsw i64 -1, 0 + %and = and i64 %notmask, %shr + %or = or i64 %and, -9223372036854775808 + %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) + %cast = trunc i64 %cttz to i32 + %cmp3 = icmp ugt i32 10, %cast + %cmp6 = fcmp one float %rem.f, 0.0 + %brmerge = or i1 %cmp6, %cmp3 + br i1 %brmerge, label %if.end2, label %if.then + +if.then: ; preds = %if.end + unreachable + +if.end2: ; preds = %if.end + ret void +} + +define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { +entry: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %mul4 = mul nsw i32 %s, %n + %cmp = icmp slt i32 0, %mul4 + br label %if.end + +if.end: ; preds = %entry + %rem = urem i32 %id, %s + %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32) + %shr = lshr i32 %icmp, 1 + %notmask = shl nsw i32 -1, 0 + %and = and i32 %notmask, %shr + %or = or i32 %and, 2147483648 + %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) + %cmp3 = icmp ugt i32 10, %cttz + %cmp6 = icmp ne i32 %rem, 0 + %brmerge = or i1 %cmp6, %cmp3 + br i1 %brmerge, label %if.end2, label %if.then + +if.then: ; preds = %if.end + unreachable + +if.end2: ; preds = %if.end + ret void +} + +define amdgpu_kernel void @fcmp32(float %n, float %s) { +entry: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.f = uitofp i32 %id to float + %mul4 = fmul float %s, %n + %cmp = fcmp ult float 0.0, %mul4 + br label %if.end + +if.end: ; preds = %entry + %rem.f = frem float %id.f, %s + %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1) + %shr = lshr i32 %fcmp, 1 + %notmask = shl nsw i32 -1, 0 + %and = and i32 %notmask, %shr + %or = or i32 %and, 2147483648 + %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) + %cmp3 = icmp ugt i32 10, %cttz + %cmp6 = fcmp one float %rem.f, 0.0 + %brmerge = or i1 %cmp6, %cmp3 + br i1 %brmerge, label %if.end2, label %if.then + +if.then: ; preds = %if.end + unreachable + +if.end2: ; preds = %if.end + ret void +} + +declare void @external_void_func_void() #1 + +define void @callee_no_stack_with_call() #1 { + call void @external_void_func_void() + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() +declare float @llvm.fabs.f32(float) +declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) +declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) +declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) +declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) +declare i1 @llvm.amdgcn.class.f32(float, i32) +declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) +declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare float @llvm.amdgcn.strict.wwm.f32(float) +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 
@llvm.amdgcn.strict.wwm.i64(i64)
+declare float @llvm.amdgcn.wwm.f32(float)
+declare i32 @llvm.amdgcn.wqm.i32(i32)
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32)
+declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32)
+declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32)
+declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32)
+declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32)
+declare void @llvm.amdgcn.kill(i1)
+declare i1 @llvm.amdgcn.wqm.vote(i1)
+declare i1 @llvm.amdgcn.ps.live()
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone optnone noinline }
+attributes #3 = { "target-features"="+wavefrontsize32" }
+attributes #4 = { "target-features"="+wavefrontsize64" }
+attributes #5 = { inaccessiblememonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_wqm.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_wqm.ll
new file mode 100644
index 0000000000000..3430146b23ec8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_wqm.ll
@@ -0,0 +1,1033 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx900 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx900 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx908 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx908 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
+main_body:
+  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %tex
+}
+
+define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
+main_body:
+  %inst23 = extractelement <2 x float> %pos, i32 0
+  %inst24 = extractelement <2 x float> %pos, i32 1
+  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
+  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
+  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
+  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  ret <4 x float> %tex
+}
+
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
+main_body:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
+
+  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32>
poison, i32 %tex.2, i32 0, i32 0, i32 0) + + ret <4 x float> %tex +} + +define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex.1 = bitcast <4 x float> %tex to <4 x i32> + %tex.2 = extractelement <4 x i32> %tex.1, i32 0 + + call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) poison, i32 %tex.2, i32 0, i32 0, i32 0) + + ret <4 x float> %tex +} + +define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 { +main_body: + %inst23 = extractelement <2 x float> %pos, i32 0 + %inst24 = extractelement <2 x float> %pos, i32 1 + %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) + %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) + %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) + %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex.0 = extractelement <4 x float> %tex, i32 0 + %tex.1 = extractelement <4 x float> %tex, i32 1 + %tex.2 = extractelement <4 x float> %tex, i32 2 + %tex.3 = extractelement <4 x float> %tex, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true) + ret void +} + +define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) { +main_body: + %c.1 = mul i32 %c, %d + + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> poison, <4 x i32> poison, i32 %c.1, i32 0, i32 0, i32 0) + %c.1.bc = bitcast i32 %c.1 to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + ret <4 x float> %dtex +} + +define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) { +main_body: + %c.1 = mul i32 %c, %d + + call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> poison, ptr addrspace(8) poison, i32 %c.1, i32 0, i32 0, i32 0) + %c.1.bc = bitcast i32 %c.1 to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + ret <4 x float> %dtex +} + +define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 
= call float @llvm.amdgcn.wqm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = bitcast float %out to i32 + %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) + %out.2 = bitcast i32 %out.1 to float + ret float %out.2 +} + +define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = bitcast float %out to i32 + %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) + %out.2 = bitcast i32 %out.1 to float + ret float %out.2 +} + + +define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %src0.0 = bitcast float %src0 to i32 + %src1.0 = bitcast float %src1 to i32 + %out = add i32 %src0.0, %src1.0 + %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + ret float %out.1 +} + +define amdgpu_ps float @test_wwm3(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + +define amdgpu_ps float @test_wwm4(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float 
%src, %src + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %src1, %src1 + %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) + %out = fadd float %temp.0, %temp.0 + %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_wwm6_then() { +main_body: + %src0 = load volatile float, ptr addrspace(1) poison + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src1 = load volatile float, ptr addrspace(1) poison + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +define amdgpu_ps float @test_wwm6_loop() { +main_body: + %src0 = load volatile float, ptr addrspace(1) poison + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + br label %loop + +loop: + %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ] + %src1 = load volatile float, ptr addrspace(1) poison + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %counter.1 = sub i32 %counter, 1 + %cc = icmp ne i32 %counter.1, 0 + br i1 %cc, label %loop, label %endloop + +endloop: + ret float %out.0 +} + +define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { +main_body: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %src.0 = bitcast float %src to i32 + %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) + %out = add i32 %src.1, %src.1 + %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %src0.0 = bitcast float %src0 to i32 + %src1.0 = bitcast float %src1 to i32 + %out = add i32 %src0.0, %src1.0 + %out.0 = call i32 
@llvm.amdgcn.strict.wqm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + ret float %out.1 +} + +define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + +define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %src1, %src1 + %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) + %out = fadd float %temp.0, %temp.0 + %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_strict_wqm6_then() { +main_body: + %src0 = load volatile float, ptr addrspace(1) poison + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src1 = load volatile float, ptr addrspace(1) poison + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +define amdgpu_ps float @test_strict_wqm6_loop() { +main_body: + %src0 = load volatile float, ptr addrspace(1) poison + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + br label %loop + +loop: + %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ] + %src1 = load volatile float, ptr addrspace(1) poison + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + %counter.1 = sub i32 %counter, 1 + %cc = icmp ne i32 %counter.1, 0 + br i1 %cc, label %loop, label %endloop + +endloop: + ret float %out.0 +} + +define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %src1.0 = bitcast 
float %src1 to i32 + %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 poison) + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src0.0 = bitcast float %src0 to i32 + %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0) + %out = add i32 %src0.1, %src1.1 + %out.0 = bitcast i32 %out to float + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ELSE + +IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %data.if = extractelement <4 x float> %dtex, i32 0 + br label %END + +ELSE: + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) poison, i32 %c, i32 0, i32 0, i32 0) + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %ELSE, label %IF + +IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %data.if = extractelement <4 x float> %dtex, i32 0 + br label %END + +ELSE: + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) poison, i32 %c, i32 0, i32 0, i32 0) + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +main_body: + %idx.1 = extractelement <3 x i32> %idx, i32 0 + %data.1 = extractelement <2 x float> %data, i32 0 + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) poison, i32 %idx.1, i32 0, i32 0, i32 0) + + ; The load that determines the branch (and should therefore be WQM) is + ; surrounded by stores that require disabled WQM. 
+ %idx.2 = extractelement <3 x i32> %idx, i32 1 + %z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx.2, i32 0, i32 0, i32 0) + + %idx.3 = extractelement <3 x i32> %idx, i32 2 + %data.3 = extractelement <2 x float> %data, i32 1 + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) poison, i32 %idx.3, i32 0, i32 0, i32 0) + + %cc = fcmp ogt float %z, 0.0 + br i1 %cc, label %IF, label %ELSE + +IF: + %coord.IF = mul i32 %coord, 3 + br label %END + +ELSE: + %coord.ELSE = mul i32 %coord, 4 + br label %END + +END: + %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] + %coord.END.bc = bitcast i32 %coord.END to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + ret <4 x float> %tex +} + +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %dtex.1 = extractelement <4 x float> %dtex, i32 0 + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + + %cc = fcmp ogt float %dtex.1, 0.0 + br i1 %cc, label %IF, label %ELSE + +IF: + %tex.IF = fmul float %dtex.1, 3.0 + br label %END + +ELSE: + %tex.ELSE = fmul float %dtex.1, 4.0 + br label %END + +END: + %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ] + ret float %tex.END +} + +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { +main_body: + %cond = icmp eq i32 %y, 0 + br i1 %cond, label %IF, label %END + +IF: + %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) poison, i32 1, i32 0, i32 0, i32 0) + br label %END + +END: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + ret <4 x float> %dtex +} + +define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %idx.0 = extractelement <2 x i32> %idx, i32 0 + %data.0 = extractelement <2 x float> %data, i32 0 + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) poison, i32 %idx.0, i32 0, i32 0, i32 0) + + %z.cmp = fcmp olt float %z, 0.0 + call void @llvm.amdgcn.kill(i1 %z.cmp) + + %idx.1 = extractelement <2 x i32> %idx, i32 1 + %data.1 = extractelement <2 x float> %data, i32 1 + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) poison, i32 %idx.1, i32 
0, i32 0, i32 0) + %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex2.0 = extractelement <4 x float> %tex2, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %out = fadd <4 x float> %tex, %dtex + + ret <4 x float> %out +} + +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) poison, i32 0, i32 0, i32 0) + + %z.cmp = fcmp olt float %z, 0.0 + call void @llvm.amdgcn.kill(i1 %z.cmp) + + ret <4 x float> %dtex +} + +define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 { +main_body: + %s = fadd float %a, %b + ret float %s +} + +define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { +entry: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 poison, <8 x i32> poison, i32 0, i32 0) + br label %loop + +loop: + %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ] + %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] + %cc = fcmp ogt float %ctr.iv, 7.0 + br i1 %cc, label %break, label %body + +body: + %c.iv0 = extractelement <4 x float> %c.iv, i32 0 + %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) #0 + %ctr.next = fadd float %ctr.iv, 2.0 + br label %loop + +break: + ret <4 x float> %c.iv +} + +define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { +entry: + %array = alloca [32 x i32], align 4, addrspace(5) + + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) poison, i32 0, i32 0, i32 0) + + store volatile i32 %a, ptr addrspace(5) %array, align 4 + + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) poison, i32 1, i32 0, i32 0, i32 0) + + %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx + %c = load i32, ptr addrspace(5) %c.gep, align 4 + %c.bc = bitcast i32 %c to float + %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) #0 + call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) poison, i32 0, i32 0, i32 0) + + ret void +} + +define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) #0 + ret <4 x float> %dtex +} + +define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { +entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float poison, <8 x i32> poison, <4 x 
i32> poison, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) #0 + %cc = icmp sgt i32 %c, 0 + br i1 %cc, label %if, label %else + +if: + store volatile <4 x float> %dtex, ptr addrspace(1) poison + unreachable + +else: + ret <4 x float> %dtex +} + +define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { +main_body: + %cc = icmp sgt i32 %sel, 0 + br i1 %cc, label %if, label %else + +if: + %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) #0 + br label %end + +else: + %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) #0 + br label %end + +end: + %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + ret <4 x float> %r +} + +define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ENDIF + +IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %dataf = extractelement <4 x float> %dtex, i32 0 + %data1 = fptosi float %dataf to i32 + %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) + %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) + %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3) + %data4f = sitofp i32 %data4 to float + br label %ENDIF + +ENDIF: + %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ] + ret float %r +} + +define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %src0.0 = bitcast float %src0 to i32 + %src1.0 = bitcast float %src1 to i32 + %out = add i32 %src0.0, %src1.0 + %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + ret float %out.1 +} + +define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr 
addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + +define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) poison, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %src1, %src1 + %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) + %out = fadd float %temp.0, %temp.0 + %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) + ret float %out.0 +} + +define amdgpu_ps float @test_strict_wwm6_then() { +main_body: + %src0 = load volatile float, ptr addrspace(1) poison + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %src1 = load volatile float, ptr addrspace(1) poison + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +define amdgpu_ps float @test_strict_wwm6_loop() { +main_body: + %src0 = load volatile float, ptr addrspace(1) poison + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + br label %loop + +loop: + %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ] + %src1 = load volatile float, ptr addrspace(1) poison + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + %counter.1 = sub i32 %counter, 1 + %cc = icmp ne i32 %counter.1, 0 + br i1 %cc, label %loop, label %endloop + +endloop: + ret float %out.0 +} + +define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { +main_body: + %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + %src.0 = bitcast float %src to i32 + %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) + %out = add i32 %src.1, %src.1 + %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) poison, i32 %idx, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float 
%data) { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ENDIF + +IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %dataf = extractelement <4 x float> %dtex, i32 0 + %data1 = fptosi float %dataf to i32 + %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) + %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) + %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3) + %data4f = sitofp i32 %data4 to float + br label %ENDIF + +ENDIF: + %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ] + ret float %r +} + +define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ENDIF + +IF: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %dataf = extractelement <4 x float> %dtex, i32 0 + %data1 = fptosi float %dataf to i32 + %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079) + %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2) + %data3f = sitofp i32 %data3 to float + br label %ENDIF + +ENDIF: + %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ] + ret float %r +} + +define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data, i32 %wqm_data) { +main_body: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %cmp = icmp eq i32 %z, 0 + call void @llvm.amdgcn.kill(i1 %cmp) + %dataf = extractelement <4 x float> %dtex, i32 0 + %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %wqm_data, i32 2079) + %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2) + %data3f = sitofp i32 %data3 to float + %result.f = fadd float %dataf, %data3f + %result.i = bitcast float %result.f to i32 + %result.wqm = call i32 @llvm.amdgcn.wqm.i32(i32 %result.i) + %result = bitcast i32 %result.wqm to float + ret float %result +} + +define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) { +main_body: + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %reload, %reload + %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) + %temp3 = fadd float %temp2, %temp2 + %reload_wwm = call float 
@llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0) + %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm) + %temp5 = fadd float %temp3, %temp4 + %res.int = ptrtoint ptr addrspace(8) %res to i128 + %res.vec = bitcast i128 %res.int to <4 x i32> + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + ret float %out +} + +define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) { +main_body: + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %reload, %reload + %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) + %temp3 = fadd float %temp2, %temp2 + %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) + %temp5 = fadd float %temp3, %temp4 + %res.int = ptrtoint ptr addrspace(8) %res to i128 + %res.vec = bitcast i128 %res.int to <4 x i32> + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + ret float %out +} + +define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) { +main_body: + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %reload, %reload + %res.int = ptrtoint ptr addrspace(8) %res to i128 + %res.vec = bitcast i128 %res.int to <4 x i32> + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) + %temp2 = fadd float %tex, %tex + %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) + %temp4 = fadd float %temp2, %temp3 + %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) + ret float %out +} + +define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) { +main_body: + %1 = ptrtoint ptr addrspace(6) %0 to i32 + %2 = insertelement <4 x i32> , i32 %1, i32 0 + %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 
x i32> %2, i32 0, i32 0) #3 + %4 = fcmp nsz arcp ugt float %3, 0.000000e+00 + call void @llvm.amdgcn.kill(i1 %4) #1 + ret void +} + +define amdgpu_gs void @wqm_init_exec() { +bb: + call void @llvm.amdgcn.init.exec(i64 -1) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0) + %i = call i32 @llvm.amdgcn.wqm.i32(i32 0) + store i32 %i, i32 addrspace(3)* null, align 4 + ret void +} + +define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) { + call void @llvm.amdgcn.init.exec(i64 0) + switch i32 %arg, label %bb1 [ + i32 0, label %bb3 + i32 1, label %bb2 + ] +bb1: + ret void +bb2: + ret void +bb3: + ret void +} + +define amdgpu_gs void @wqm_init_exec_wwm() { + call void @llvm.amdgcn.init.exec(i64 0) + %i = call i64 @llvm.amdgcn.ballot.i64(i1 true) + %i1 = call i32 @llvm.amdgcn.wwm.i32(i32 0) + %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 0 + %i3 = bitcast <2 x i32> %i2 to i64 + %i4 = icmp ne i64 %i, 0 + %i5 = icmp ne i64 %i3, 0 + %i6 = xor i1 %i4, %i5 + %i7 = uitofp i1 %i6 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %i7, float 0.0, float 0.0, float 0.0, i1 false, i1 false) + ret void +} + +define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) { +main_body: + %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4 + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 16 + br i1 %cc, label %endif, label %if + +if: + %idx1 = extractelement <4 x i32> %idx0, i64 0 + %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1) + %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %idx2, i32 0) + + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex1, <4 x i32> poison, i32 %idx3, i32 0, i32 0, i32 0) + br label %endif + +endif: + %d = extractelement <4 x float> %tex1, i64 0 + %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %r0 = extractelement <4 x float> %tex1, i64 1 + %r1 = extractelement <4 x float> %tex2, i64 2 + %r2 = fadd float %r0, %r1 + %out = call float @llvm.amdgcn.wqm.f32(float %r2) + + ret float %out +} + +define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) { +main_body: + %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4 + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %idx1 = extractelement <4 x i32> %idx0, i64 0 + %d = extractelement <4 x float> %tex1, i64 0 + + %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + + %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1) + %idx3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %idx2, i32 0) + + %r0 = extractelement <4 x float> %tex1, i64 1 + %r1 = extractelement <4 x float> %tex2, i64 2 + %r2 = fadd float %r0, %r1 + %out = fadd float %r2, %idx3 + + ret float %out +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, 
float, float, float, float, i1, i1) #1
+declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
+
+declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
+declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
+declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
+
+declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
+declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
+declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare void @llvm.amdgcn.kill(i1) #1
+declare float @llvm.amdgcn.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.wqm.i32(i32) #3
+declare float @llvm.amdgcn.strict.wwm.f32(float) #3
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
+declare float @llvm.amdgcn.wwm.f32(float) #3
+declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare float @llvm.amdgcn.strict.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
+declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
+
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind readnone }
+attributes #4 = { nounwind readnone convergent }
+attributes #5 = { "amdgpu-ps-wqm-outputs" }
+attributes #6 = { nounwind "InitialPSInputAddr"="2" }
+attributes #7 = { nounwind readnone willreturn }
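
Note on the RUN lines above: both new tests pipe the llvm-objdump disassembly through a has_cache_straddle.py helper that is not included in this excerpt. Purely as an illustration of what such a checker could look like (the script name comes from the RUN lines, but the parsing regex, function names, and the assumed "// <address>: <encoding words>" comment format of the AMDGPU disassembly are assumptions, not the script shipped with this patch), a minimal Python sketch:

import re
import sys

HALF_CACHE_LINE = 32  # bytes; must match the boundary the AsmPrinter pads to

# Assumed llvm-objdump AMDGPU encoding comment, e.g.
#   s_mov_b32 s0, s1    // 000000000100: BE800001
ENCODING_RE = re.compile(r"//\s*([0-9A-Fa-f]+):((?:\s+[0-9A-Fa-f]{8})+)\s*$")

def find_straddles(path):
    """Return disassembly lines whose encoded bytes cross a 32-byte boundary."""
    straddles = []
    with open(path) as dis:
        for line in dis:
            m = ENCODING_RE.search(line)
            if not m:
                continue
            addr = int(m.group(1), 16)
            size = 4 * len(m.group(2).split())  # each encoding word is 4 bytes
            if addr // HALF_CACHE_LINE != (addr + size - 1) // HALF_CACHE_LINE:
                straddles.append(line.rstrip())
    return straddles

if __name__ == "__main__":
    bad = find_straddles(sys.argv[1])
    for line in bad:
        print("straddles a half cache-line:", line)
    sys.exit(1 if bad else 0)

A checker of this shape exits non-zero on any straddling instruction, so the lit RUN lines fail whenever the padding emitted by the AsmPrinter misses an instruction.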