@@ -544,52 +544,50 @@ multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
544544 [(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>,
545545 Requires<[hasPTX<86>, hasSM<100>]>;
546546}
547- defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>;
548- defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>;
547+ defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<has_ch = 0>;
548+ defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<has_ch = 1>;
549549
550- multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
551- def NAME: NVPTXInst<(outs),
552- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size),
553- !strconcat(CpAsyncBulkStr<0, 0>.G2S, " [$dst], [$src], $size, [$mbar];"), []>,
554- Requires<[hasPTX<80>, hasSM<90>]>;
555- def NAME # _MC: NVPTXInst<(outs),
556- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc),
557- !strconcat(CpAsyncBulkStr<1, 0>.G2S, " [$dst], [$src], $size, [$mbar], $mc;"), []>,
558- Requires<[hasPTX<80>, hasSM<90>]>;
559- def NAME # _CH: NVPTXInst<(outs),
560- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
561- !strconcat(CpAsyncBulkStr<0, 1>.G2S, " [$dst], [$src], $size, [$mbar], $ch;"), []>,
562- Requires<[hasPTX<80>, hasSM<90>]>;
563- def NAME # _MC_CH: NVPTXInst<(outs),
564- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc, Int64Regs:$ch),
565- !strconcat(CpAsyncBulkStr<1, 1>.G2S, " [$dst], [$src], $size, [$mbar], $mc, $ch;"), []>,
566- Requires<[hasPTX<80>, hasSM<90>]>;
550+ multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
551+ defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster;
552+
553+ def NAME : NVPTXInst<(outs),
554+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
555+ Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
556+ !if(has_ch,
557+ CpAsyncBulkStr<0, 1>.G2S # " [$dst], [$src], $size, [$mbar], $ch;",
558+ CpAsyncBulkStr<0, 0>.G2S # " [$dst], [$src], $size, [$mbar];"),
559+ [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>,
560+ Requires<[hasPTX<80>, hasSM<90>]>;
561+
562+ def NAME # _MC : NVPTXInst<(outs),
563+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
564+ Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
565+ !if(has_ch,
566+ CpAsyncBulkStr<1, 1>.G2S # " [$dst], [$src], $size, [$mbar], $mask, $ch;",
567+ CpAsyncBulkStr<1, 0>.G2S # " [$dst], [$src], $size, [$mbar], $mask;"),
568+ [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, -1, !if(has_ch, -1, 0))]>,
569+ Requires<[hasPTX<80>, hasSM<90>]>;
567570}
568- defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S<Int64Regs >;
569- defm CP_ASYNC_BULK_G2S_SHARED32 : CP_ASYNC_BULK_G2S<Int32Regs >;
571+ defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S_INTR<has_ch = 0 >;
572+ defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR<has_ch = 1 >;
570573
571- multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
572- def NAME: NVPTXInst<(outs),
573- (ins rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size),
574- !strconcat(CpAsyncBulkStr<0, 0>.C2C, " [$dst], [$src], $size, [$mbar];"),
575- [(int_nvvm_cp_async_bulk_shared_cta_to_cluster rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size)]>,
576- Requires<[hasPTX<80>, hasSM<90>]>;
574+ def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
575+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size),
576+ CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];",
577+ [(int_nvvm_cp_async_bulk_shared_cta_to_cluster addr:$dst, addr:$mbar, addr:$src, i32:$size)]>,
578+ Requires<[hasPTX<80>, hasSM<90>]>;
579+
580+ multiclass CP_ASYNC_BULK_PREFETCH_INTR<bit has_ch> {
581+ def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
582+ !if(has_ch,
583+ "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;",
584+ "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"),
585+ [(int_nvvm_cp_async_bulk_prefetch_L2 addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
586+ Requires<[hasPTX<80>, hasSM<90>]>;
577587}
578- defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs >;
579- defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs >;
588+ defm CP_ASYNC_BULK_PREFETCH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 0 >;
589+ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1 >;
580590
581- //------------------------------
582- // Bulk Copy Prefetch Functions
583- //------------------------------
584- def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
585- (ins Int64Regs:$src, Int32Regs:$size),
586- "cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
587- Requires<[hasPTX<80>, hasSM<90>]>;
588-
589- def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
590- (ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
591- "cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
592- Requires<[hasPTX<80>, hasSM<90>]>;
593591//-------------------------------------
594592// TMA Async Bulk Tensor Copy Functions
595593//-------------------------------------
0 commit comments