@@ -2929,6 +2929,208 @@ def NVVM_Tcgen05CpOp : NVVM_Op<"tcgen05.cp"> {
29292929 }];
29302930}
29312931
2932+ //===----------------------------------------------------------------------===//
2933+ // NVVM tcgen05 LdSt Shape Attr
2934+ //===----------------------------------------------------------------------===//
2935+
2936+ def Tcgen05LdStShape16x64b: I32EnumAttrCase<"SHAPE_16X64B", 0, "shape_16x64b">;
2937+ def Tcgen05LdStShape16x128b: I32EnumAttrCase<"SHAPE_16X128B", 1, "shape_16x128b">;
2938+ def Tcgen05LdStShape16x256b: I32EnumAttrCase<"SHAPE_16X256B", 2, "shape_16x256b">;
2939+ def Tcgen05LdStShape32x32b: I32EnumAttrCase<"SHAPE_32X32B", 3, "shape_32x32b">;
2940+ def Tcgen05LdStShape16x32bx2: I32EnumAttrCase<"SHAPE_16X32BX2", 4, "shape_16x32bx2">;
2941+
2942+ def Tcgen05LdStShape: I32EnumAttr<
2943+ "Tcgen05LdStShape",
2944+ "",
2945+ [Tcgen05LdStShape16x64b, Tcgen05LdStShape16x128b, Tcgen05LdStShape16x256b,
2946+ Tcgen05LdStShape32x32b, Tcgen05LdStShape16x32bx2]
2947+ > {
2948+ let cppNamespace = "::mlir::NVVM";
2949+ let genSpecializedAttr = 0;
2950+ }
2951+
2952+ def Tcgen05LdStShapeAttr: EnumAttr<NVVM_Dialect, Tcgen05LdStShape, "tcgen05_ldst_shape"> {
2953+ let assemblyFormat = "`<` $value `>`";
2954+ }
2955+
2956+ //===----------------------------------------------------------------------===//
2957+ // NVVM tcgen05.ld Op
2958+ //===----------------------------------------------------------------------===//
2959+
2960+ def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld"> {
2961+ let summary = "tensor memory load instructions";
2962+ let arguments = (ins
2963+ // Attributes
2964+ UnitAttr:$pack,
2965+ Tcgen05LdStShapeAttr:$shape,
2966+ // Arguments
2967+ LLVM_PointerTensor:$tmemAddr,
2968+ Optional<I64>:$offset
2969+ );
2970+
2971+ let results = (outs AnyTypeOf<[I32, VectorOfLengthAndType<
2972+ [2, 4, 8, 16, 32, 64, 128], [I32]>]>:$res);
2973+
2974+ let assemblyFormat = [{
2975+ $tmemAddr (`,` $offset^)? (`pack` $pack^)? attr-dict `:` type($res)
2976+ }];
2977+
2978+ let description = [{
2979+ Instruction `tcgen05.ld` asynchronously loads data from the Tensor Memory at
2980+ the location specified by the 32-bit address operand `tmemAddr` into the
2981+ destination register `res`, collectively across all threads of the warps.
2982+
2983+ The `shape` and the `num` attribute together determines the total
2984+ dimension of the data which is loaded from the Tensor Memory. The `shape`
2985+ attribute indicates the base dimension of data to be accessed as described
2986+ in the Data Movement Shape. The `num` attribute indicates the repeat
2987+ factor on the base dimension resulting in the total dimension of the data
2988+ that is accessed.
2989+
2990+ The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
2991+ `16x32b`. The base address of the first access is specified by `tmemAddr`
2992+ and the base address of the second access is specified by
2993+ `tmemAddr + offset`, where `offset` is an immediate argument.
2994+
2995+ The unit attribute `pack` can be used to pack two 16-bit
2996+ elements from adjacent columns into a single 32-bit element during the load.
2997+
2998+ The following table describes the size of the vector for various combinations
2999+ of `num` and `shape` attributes
3000+ |=====================================================================|
3001+ | num/shape | 16x32bx2/16x64b/32x32b | 16x128b | 16x256b |
3002+ |=====================================================================|
3003+ | x1 | 1 | 2 | 4 |
3004+ | x2 | 2 | 4 | 8 |
3005+ | x4 | 4 | 8 | 16 |
3006+ | x8 | 8 | 16 | 32 |
3007+ | x16 | 16 | 32 | 64 |
3008+ | x32 | 32 | 64 | 128 |
3009+ | x64 | 64 | 128 | NA |
3010+ | x128 | 128 | NA | NA |
3011+ |=====================================================================|
3012+
3013+ Example:
3014+ ```mlir
3015+ nvvm.tcgen05.ld %tmemAddr, %offset pack {
3016+ shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
3017+ } : <2xi32>
3018+ ```
3019+
3020+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st)
3021+ }];
3022+
3023+ let hasVerifier = 1;
3024+
3025+ string llvmBuilder = [{
3026+ llvm::LLVMContext &Context = moduleTranslation.getLLVMContext();
3027+ auto Pack = llvm::ConstantInt::get(Context, llvm::APInt(1, $pack));
3028+
3029+ unsigned num = $_resultType->isVectorTy()
3030+ ? llvm::cast<llvm::VectorType>($_resultType)
3031+ ->getElementCount()
3032+ .getFixedValue()
3033+ : 1;
3034+
3035+ auto ID = getTcgen05LdIntrinsicID($shape, num);
3036+ if (ID == llvm::Intrinsic::not_intrinsic)
3037+ llvm::report_fatal_error("unknow intrinsic signature for tcgen05.ld");
3038+
3039+ if ($offset)
3040+ $res = createIntrinsicCall(builder, ID, {$tmemAddr, $offset, Pack});
3041+ else
3042+ $res = createIntrinsicCall(builder, ID, {$tmemAddr, Pack});
3043+ }];
3044+ }
3045+
3046+ //===----------------------------------------------------------------------===//
3047+ // NVVM tcgen05.st Op
3048+ //===----------------------------------------------------------------------===//
3049+
3050+ def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st"> {
3051+ let summary = "tensor memory store instructions";
3052+ let arguments = (ins
3053+ // Attributes
3054+ UnitAttr:$unpack,
3055+ Tcgen05LdStShapeAttr:$shape,
3056+ // Arguments
3057+ LLVM_PointerTensor:$tmemAddr,
3058+ AnyTypeOf<[I32, VectorOfLengthAndType<
3059+ [2, 4, 8, 16, 32, 64, 128], [I32]>]>:$val,
3060+ Optional<I64>:$offset
3061+ );
3062+
3063+ let assemblyFormat = [{
3064+ $tmemAddr `,` $val (`,` $offset^)? (`unpack` $unpack^)? attr-dict `:` type($val)
3065+ }];
3066+
3067+ let description = [{
3068+ Instruction `tcgen05.st` asynchronously stores data from the source register `r`
3069+ into the Tensor Memory at the location specified by the 32-bit address operand
3070+ `tmemAddr`, collectively across all threads of the warps.
3071+
3072+ The `shape` and the `num` attribute together determines the total dimension of
3073+ the data which is stored to the Tensor Memory. The `shape` indicates the base
3074+ dimension of data to be accessed. The `num` attribute indicates the repeat
3075+ factor on the base dimension resulting in the total dimension of the data that
3076+ is accessed.
3077+
3078+ The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
3079+ `16x32b`. The base address of the first access is specified by `tmemAddr`
3080+ and the base address of the second access is specified by
3081+ `tmemAddr + offset`, where `offset` is an immediate argument.
3082+
3083+ The unit attribute `unpack` can be used to unpack a 32-bit element
3084+ in the register into two 16-bit elements and store them in adjacent columns.
3085+
3086+ The following table describes the size of the vector for various combinations
3087+ of `num` and `shape` attributes
3088+ |=====================================================================|
3089+ | num/shape | 16x32bx2/16x64b/32x32b | 16x128b | 16x256b |
3090+ |=====================================================================|
3091+ | x1 | 1 | 2 | 4 |
3092+ | x2 | 2 | 4 | 8 |
3093+ | x4 | 4 | 8 | 16 |
3094+ | x8 | 8 | 16 | 32 |
3095+ | x16 | 16 | 32 | 64 |
3096+ | x32 | 32 | 64 | 128 |
3097+ | x64 | 64 | 128 | NA |
3098+ | x128 | 128 | NA | NA |
3099+ |=====================================================================|
3100+
3101+ Example:
3102+ ```mlir
3103+ nvvm.tcgen05.st %tmemAddr, %val, %offset unpack {
3104+ shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
3105+ } : <2xi32>
3106+ ```
3107+
3108+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st)
3109+ }];
3110+
3111+ string llvmBuilder = [{
3112+ llvm::LLVMContext &Context = moduleTranslation.getLLVMContext();
3113+ auto Unpack = llvm::ConstantInt::get(Context, llvm::APInt(1, $unpack));
3114+
3115+ auto valTy = $val->getType();
3116+ uint32_t num = valTy->isVectorTy() ? llvm::cast<llvm::VectorType>(valTy)
3117+ ->getElementCount()
3118+ .getFixedValue()
3119+ : 1;
3120+
3121+ auto ID = getTcgen05StIntrinsicID($shape, num);
3122+ if (ID == llvm::Intrinsic::not_intrinsic)
3123+ llvm::report_fatal_error("unknow intrinsic signature for tcgen05.st");
3124+
3125+ if ($offset)
3126+ createIntrinsicCall(builder, ID, {$tmemAddr, $offset, $val, Unpack});
3127+ else
3128+ createIntrinsicCall(builder, ID, {$tmemAddr, $val, Unpack});
3129+ }];
3130+
3131+ let hasVerifier = 1;
3132+ }
3133+
29323134//===----------------------------------------------------------------------===//
29333135// NVVM target attribute.
29343136//===----------------------------------------------------------------------===//
0 commit comments