@@ -2599,51 +2599,63 @@ def NVVM_CpAsyncBulkSharedCTAToSharedClusterOp :
25992599}
26002600
26012601def NVVM_CpAsyncBulkSharedCTAToGlobalOp :
2602- NVVM_Op<"cp.async.bulk.global.shared.cta"> {
2602+ NVVM_Op<"cp.async.bulk.global.shared.cta", [AttrSizedOperandSegments] > {
26032603 let summary = "Async bulk copy from Shared CTA memory to Global memory";
26042604 let description = [{
26052605 Initiates an asynchronous copy operation from Shared CTA memory to
2606- global memory.
2606+ global memory. The 32-bit operand `size` specifies the amount of
2607+ memory to be copied, in terms of number of bytes. `size` must be a
2608+ multiple of 16. The `l2CacheHint` operand is optional, and it is used
2609+ to specify cache eviction policy that may be used during the memory
2610+ access. The `byteMask` operand is optional. The i-th bit in the 16-bit
2611+ wide `byteMask` specifies whether the i-th byte of each 16-byte wide
2612+ chunk of source data is copied to the destination. If the bit is set,
2613+ the byte is copied.
2614+
2615+ Example:
2616+ ```mlir
2617+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size
2618+ : !llvm.ptr<1>, !llvm.ptr<3>
2619+
2620+ // with l2_cache_hint
2621+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch
2622+ : !llvm.ptr<1>, !llvm.ptr<3>
2623+
2624+ // with byte_mask
2625+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size byte_mask = %mask
2626+ : !llvm.ptr<1>, !llvm.ptr<3>
2627+
2628+ // with both l2_cache_hint and byte_mask
2629+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch byte_mask = %mask
2630+ : !llvm.ptr<1>, !llvm.ptr<3>
2631+ ```
26072632
2608- The `l2CacheHint` operand is optional, and it is used to specify cache
2609- eviction policy that may be used during the memory access.
2610-
26112633 [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
26122634 }];
26132635
26142636 let arguments = (ins
26152637 LLVM_PointerGlobal:$dstMem,
26162638 LLVM_PointerShared:$srcMem,
26172639 I32:$size,
2618- Optional<I64>:$l2CacheHint);
2640+ Optional<I64>:$l2CacheHint,
2641+ Optional<I16>:$byteMask);
26192642
26202643 let assemblyFormat = [{
26212644 $dstMem `,` $srcMem `,` $size
26222645 (`l2_cache_hint` `=` $l2CacheHint^ )?
2623- attr-dict `:` type($dstMem) `,` type($srcMem)
2646+ (`byte_mask` `=` $byteMask^ )?
2647+ attr-dict `:` type($dstMem) `,` type($srcMem)
26242648 }];
26252649
2650+ let extraClassDeclaration = [{
2651+ static mlir::NVVM::IDArgPair
2652+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
2653+ llvm::IRBuilderBase& builder);
2654+ }];
26262655 string llvmBuilder = [{
2627- // Arguments to the intrinsic:
2628- // dst, src, size, cache_hint,
2629- // Flag for cache_hint
2630- //
2631- llvm::SmallVector<llvm::Value *> translatedOperands;
2632- translatedOperands.push_back($dstMem);
2633- translatedOperands.push_back($srcMem);
2634- translatedOperands.push_back($size);
2635-
2636- // Cachehint, if available
2637- llvm::LLVMContext &ctx = moduleTranslation.getLLVMContext();
2638- auto *i64Unused = llvm::ConstantInt::get(llvm::Type::getInt64Ty(ctx), 0);
2639- bool isCacheHint = op.getL2CacheHint() ? true : false;
2640- translatedOperands.push_back(isCacheHint ? $l2CacheHint : i64Unused);
2641-
2642- // Flag argument for cachehint
2643- translatedOperands.push_back(builder.getInt1(isCacheHint));
2644-
2645- createIntrinsicCall(builder,
2646- llvm::Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global, translatedOperands);
2656+ auto [id, args] = NVVM::CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
2657+ *op, moduleTranslation, builder);
2658+ createIntrinsicCall(builder, id, args);
26472659 }];
26482660}
26492661
0 commit comments