diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 6f5d1e024b91d..7b7dbf7043099 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -667,6 +667,9 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b32, "ii*1Iii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b64, "V2iV2i*1Iii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b128, "V4iV4i*1Iii", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 433d76b2812db..4c1953e4b8e34 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -675,6 +675,27 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); return Builder.CreateCall(F, {Addr, Val}); } + case AMDGPU::BI__builtin_amdgcn_cluster_load_b32: + case AMDGPU::BI__builtin_amdgcn_cluster_load_b64: + case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_cluster_load_b32: + IID = Intrinsic::amdgcn_cluster_load_b32; + break; + case 
AMDGPU::BI__builtin_amdgcn_cluster_load_b64: + IID = Intrinsic::amdgcn_cluster_load_b64; + break; + case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: + IID = Intrinsic::amdgcn_cluster_load_b128; + break; + } + SmallVector<Value *, 3> Args; + for (int i = 0, e = E->getNumArgs(); i != e; ++i) + Args.push_back(EmitScalarExpr(E->getArg(i))); + llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())}); + return Builder.CreateCall(F, {Args}); + } case AMDGPU::BI__builtin_amdgcn_load_to_lds: { // Should this have asan instrumentation? return emitBuiltinWithOneOverloadedType<5>(*this, E, diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cluster-load.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cluster-load.cl new file mode 100644 index 0000000000000..4c6e8badf1bce --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cluster-load.cl @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cluster.load.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 10, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret i32 [[TMP0]] +// +int test_amdgcn_cluster_load_b32(global int* inptr, int mask) +{ + return __builtin_amdgcn_cluster_load_b32(inptr, 10, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 22, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i
test_amdgcn_cluster_load_b64(global v2i* inptr, int mask) +{ + return __builtin_amdgcn_cluster_load_b64(inptr, 22, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 27, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]] +// +v4i test_amdgcn_cluster_load_b128(global v4i* inptr, int mask) +{ + return __builtin_amdgcn_cluster_load_b128(inptr, 27, mask); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 4a28f9acdecf7..b5a6a2597b971 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -103,6 +103,13 @@ void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global *b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}} } +void test_amdgcn_cluster_load(global int* addr32, global v2i* addr64, global v4i* addr128, global int* b32out, global v2i* b64out, global v4i* b128out, int cpol, int mask) +{ + *b32out = __builtin_amdgcn_cluster_load_b32(addr32, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b32' must be a constant integer}} + *b64out = __builtin_amdgcn_cluster_load_b64(addr64, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b64' must be a constant integer}} + *b128out = __builtin_amdgcn_cluster_load_b128(addr128, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b128' must be a constant integer}} +} + void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8, local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int 
mask) { diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a4d4adae580d0..3c5ac99512a64 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -4113,6 +4113,23 @@ def int_amdgcn_tensor_load_to_lds_d2 : def int_amdgcn_tensor_store_from_lds_d2 : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2; +class AMDGPUClusterLoad<LLVMType ptr_ty>: + Intrinsic< + [llvm_any_ty], + [ptr_ty, + llvm_i32_ty, // gfx12+ cachepolicy: + // bits [0-2] = th + // bits [3-4] = scope + llvm_i32_ty], // workgroup broadcast mask (in M0) + [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, IntrWillReturn, IntrConvergent, + NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand] + >; + +def int_amdgcn_cluster_load_b32 : AMDGPUClusterLoad<global_ptr_ty>; +def int_amdgcn_cluster_load_b64 : AMDGPUClusterLoad<global_ptr_ty>; +def int_amdgcn_cluster_load_b128 : AMDGPUClusterLoad<global_ptr_ty>; + class AMDGPULoadMonitor<LLVMType ptr_ty>: Intrinsic< [llvm_any_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 394a143dd3086..bc88404442c3f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -128,6 +128,9 @@ def gi_global_saddr : def gi_global_saddr_cpol : GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">, GIComplexPatternEquiv<GlobalSAddrCPol>; +def gi_global_saddr_cpol_m0 : + GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">, + GIComplexPatternEquiv<GlobalSAddrCPolM0>; def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b80e43b27129b..2734bc27ede3d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &Offset, +
SDValue &CPol) const { + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is second from last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 16388e750026c..e79585844a01c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -171,6 +171,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fac365d015d95..e8482a9b936b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5708,6 +5708,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const { return selectGlobalSAddr(Root, PassedCPol); } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is second from last operand of the intrinsic. 
+ auto PassedCPol = + I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 4db46064999c0..194dd6e4099a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -256,6 +256,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectGlobalSAddrCPol(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectGlobalSAddrCPolM0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddrNoIOffset(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 237929699dd9d..08a9ed2714ec0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3338,6 +3338,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 8); // M0 return; + case Intrinsic::amdgcn_cluster_load_b32: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b128: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(B, MI, 4); // M0 + return; + } case Intrinsic::amdgcn_s_sleep_var: assert(OpdMapper.getVRegs(1).empty()); constrainOpWithReadfirstlane(B, MI, 1); @@ -5466,6 +5473,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case 
Intrinsic::amdgcn_cluster_load_b32: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b128: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned M0Bank = + getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } case Intrinsic::amdgcn_global_store_async_from_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b32: case Intrinsic::amdgcn_global_store_async_from_lds_b64: diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 80f0d504ea30c..19f95c5ac4c37 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -15,6 +15,7 @@ let WantsRoot = true in { def GlobalSAddr : ComplexPattern; def GlobalSAddrGLC : ComplexPattern; def GlobalSAddrCPol : ComplexPattern; + def GlobalSAddrCPolM0 : ComplexPattern; def ScratchSAddr : ComplexPattern; def ScratchSVAddr : ComplexPattern; } @@ -1248,6 +1249,14 @@ defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b6 defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>; } // End SubtargetPredicate = isGFX125xOnly +let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in { +let Uses = [M0, EXEC] in { // Use M0 for broadcast workgroup mask. 
+defm CLUSTER_LOAD_B32 : FLAT_Global_Load_Pseudo <"cluster_load_b32", VGPR_32>; +defm CLUSTER_LOAD_B64 : FLAT_Global_Load_Pseudo <"cluster_load_b64", VReg_64>; +defm CLUSTER_LOAD_B128 : FLAT_Global_Load_Pseudo <"cluster_load_b128", VReg_128>; +} // End Uses = [M0, EXEC] +} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 + let SubtargetPredicate = isGFX12Plus in { let Uses = [EXEC, M0] in { defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; @@ -1394,6 +1403,16 @@ class FlatLoadSaddrPat (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSignedPat_M0 : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)), + (inst $vaddr, $offset, $cpol) +>; + +class GlobalLoadSaddrPat_M0 : GCNPat < + (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)), + (inst $saddr, $voffset, $offset, $cpol) +>; + class FlatLoadSignedPat_CPOL : GCNPat < (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))), (inst $vaddr, $offset, $cpol) @@ -1619,6 +1638,16 @@ multiclass GlobalFLATLoadPats { + def : FlatLoadSignedPat_M0 { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_M0(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats_CPOL { def : FlatLoadSignedPat_CPOL { let AddedComplexity = 10; @@ -2176,6 +2205,10 @@ let OtherPredicates = [isGFX125xOnly] in { } // End SubtargetPredicate = isGFX125xOnly let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalFLATLoadPats_M0 ; + defm : GlobalFLATLoadPats_M0 ; + defm : GlobalFLATLoadPats_M0 ; + defm : GlobalLoadLDSPats ; defm : GlobalLoadLDSPats ; defm : GlobalLoadLDSPats ; @@ -3470,6 +3503,10 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; +defm 
CLUSTER_LOAD_B32 : VFLAT_Real_AllAddr_gfx1250<0x067>; +defm CLUSTER_LOAD_B64 : VFLAT_Real_AllAddr_gfx1250<0x068>; +defm CLUSTER_LOAD_B128 : VFLAT_Real_AllAddr_gfx1250<0x069>; + defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a861d9a96c9e3..dad5b292893fd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1506,6 +1506,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_load_monitor_b32: case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_cluster_load_b32: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1636,6 +1639,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Value *Ptr = nullptr; switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_cond_sub_u32: + case Intrinsic::amdgcn_cluster_load_b128: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_load_tr8_b64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll new file mode 100644 index 0000000000000..7746dc60ddfc3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; 
RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask) +declare <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask) +declare <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask) + +define amdgpu_ps void @cluster_load_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) { +; GFX1250-LABEL: cluster_load_b32_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: s_mov_b32 m0, s0 +; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b32_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: cluster_load_b32_vaddr_imm_mask: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_mov_b32 m0, 7 +; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 10, i32 7) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) { +; GFX1250-LABEL: cluster_load_b32_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b32 
v2, v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 22, i32 %mask) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) { +; GFX1250-LABEL: cluster_load_monitor_b32_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom + %val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) { +; GFX1250-LABEL: cluster_load_b64_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: s_mov_b32 m0, s0 +; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b64_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: cluster_load_b64_vaddr_imm_mask: +; GFX1250: ; %bb.0: 
; %entry +; GFX1250-NEXT: s_mov_b32 m0, 0x10007 +; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 10, i32 65543) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) { +; GFX1250-LABEL: cluster_load_b64_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 22, i32 %mask) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) { +; GFX1250-LABEL: cluster_load_monitor_b64_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i64, ptr addrspace(1) %addr, i64 %idxprom + %val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void 
@cluster_load_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) { +; GFX1250-LABEL: cluster_load_b128_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: s_mov_b32 m0, s0 +; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask) + store <4 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b128_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: cluster_load_b128_vaddr_imm_mask: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_mov_b32 m0, 15 +; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 10, i32 15) + store <4 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) { +; GFX1250-LABEL: cluster_load_b128_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask) + store <4 x 
i32> %val, ptr addrspace(1) %use + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250-GISEL: {{.*}} +; GFX1250-SDAG: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s index 48ddfec2253ce..8323b6c9a483f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s @@ -3096,6 +3096,87 @@ flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset // GFX1250: flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU +cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b32 v1, v[2:3], off offset:64 +// GFX1250: cluster_load_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b32 v1, v[2:3], off offset:-64 +// GFX1250: cluster_load_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b32 v1, v2, s[0:1] offset:64 +// GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:64 ; encoding: 
[0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b32 v1, v2, s[0:1] offset:-64 +// GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b64 v[0:1], v[2:3], off offset:64 +// GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b64 v[0:1], v[2:3], off offset:-64 +// GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b64 v[0:1], v2, s[0:1] offset:64 +// GFX1250: 
cluster_load_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b64 v[0:1], v2, s[0:1] offset:-64 +// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b128 v[0:3], v[4:5], off offset:64 +// GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b128 v[0:3], v[4:5], off offset:-64 +// GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported 
on this GPU + +cluster_load_b128 v[0:3], v4, s[0:1] offset:64 +// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 +// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + + flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 // GFX1250: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt index 8149deaec376f..f8dd65ae69da7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt @@ -3183,6 +3183,66 @@ # GFX1250: global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00] 0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00 +# GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00] +0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00 + +# GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff] +0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff + +# GFX1250: cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: cluster_load_b128 
v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00] +0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00 + +# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff] +0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff + +# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00] +0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: cluster_load_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00] 
+0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00] +0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00 + # GFX1250: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00] 0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00