@@ -3463,10 +3463,36 @@ let hasSideEffects = false in {
34633463 "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
34643464 []>;
34653465
3466+ // PTX 7.1 lets us avoid a temp register by using _ as a "sink" operand for
3467+ // whichever half (high or low) of the unpacked value is unused.
3468+ def I32toI16H_ptx71 : NVPTXInst<(outs Int16Regs:$high), // 32-bit source -> 16-bit $high result
3469+ (ins Int32Regs:$s),
3470+ "mov.b32 \t{{_, $high}}, $s;", // first unpack slot goes to the '_' sink, second to $high
3471+ []>, Requires<[hasPTX<71>]>; // empty pattern list; only available under hasPTX<71>
3472+ def I32toI16L_ptx71 : NVPTXInst<(outs Int16Regs:$low), // 32-bit source -> 16-bit $low result
3473+ (ins Int32Regs:$s),
3474+ "mov.b32 \t{{$low, _}}, $s;", // first unpack slot goes to $low, second to the '_' sink
3475+ []>, Requires<[hasPTX<71>]>; // empty pattern list; only available under hasPTX<71>
3476+ def I64toI32H_ptx71 : NVPTXInst<(outs Int32Regs:$high), // 64-bit source -> 32-bit $high result
3477+ (ins Int64Regs:$s),
3478+ "mov.b64 \t{{_, $high}}, $s;", // first unpack slot goes to the '_' sink, second to $high
3479+ []>, Requires<[hasPTX<71>]>; // empty pattern list; only available under hasPTX<71>
3480+ def I64toI32L_ptx71 : NVPTXInst<(outs Int32Regs:$low), // 64-bit source -> 32-bit $low result
3481+ (ins Int64Regs:$s),
3482+ "mov.b64 \t{{$low, _}}, $s;", // first unpack slot goes to $low, second to the '_' sink
3483+ []>, Requires<[hasPTX<71>]>; // empty pattern list; only available under hasPTX<71>
34663484}
34673485
3468- // Using partial vectorized move produces better SASS code for extraction of
3469- // upper/lower parts of an integer.
3486+ def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), // logical shift right by 16, then truncate to i16
3487+ (I32toI16H_ptx71 Int32Regs:$s)>, Requires<[hasPTX<71>]>;
3488+ def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), // arithmetic shift variant selects the same instruction
3489+ (I32toI16H_ptx71 Int32Regs:$s)>, Requires<[hasPTX<71>]>;
3490+ def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), // 64-bit analogue: shift right by 32, truncate to i32
3491+ (I64toI32H_ptx71 Int64Regs:$s)>, Requires<[hasPTX<71>]>;
3492+ def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), // arithmetic shift variant selects the same instruction
3493+ (I64toI32H_ptx71 Int64Regs:$s)>, Requires<[hasPTX<71>]>;
3494+
3495+ // Fall back to the non-_ptx71 instructions if we don't have PTX 7.1.
34703496def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
34713497 (I32toI16H Int32Regs:$s)>;
34723498def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
@@ -3480,10 +3506,15 @@ def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
34803506 (CVT_INREG_s32_s16 Int32Regs:$src)>;
34813507
34823508foreach vt = [v2f16, v2bf16, v2i16] in { // emit the same extractelt patterns for each listed vector type
3483- def : Pat<(extractelt (vt Int32Regs:$src), 0),
3484- (I32toI16L Int32Regs:$src)>;
3485- def : Pat<(extractelt (vt Int32Regs:$src), 1),
3486- (I32toI16H Int32Regs:$src)>;
3509+ def : Pat<(extractelt (vt Int32Regs:$src), 0), // element 0 -> low half, PTX 7.1 form
3510+ (I32toI16L_ptx71 Int32Regs:$src)>, Requires<[hasPTX<71>]>;
3511+ def : Pat<(extractelt (vt Int32Regs:$src), 1), // element 1 -> high half, PTX 7.1 form
3512+ (I32toI16H_ptx71 Int32Regs:$src)>, Requires<[hasPTX<71>]>;
3513+
3514+ def : Pat<(extractelt (vt Int32Regs:$src), 0), // same source pattern without a Requires clause (fallback)
3515+ (I32toI16L Int32Regs:$src)>;
3516+ def : Pat<(extractelt (vt Int32Regs:$src), 1), // same source pattern without a Requires clause (fallback)
3517+ (I32toI16H Int32Regs:$src)>;
34873518}
34883519def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))), // v2f16 built from two f16 scalars
34893520 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; // selected as V2I16toI32 with $a first, $b second
@@ -3866,6 +3897,14 @@ def : Pat <
38663897 (v2i16 (bswap v2i16:$a)),
38673898 (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
38683899
3900+ def : Pat < // i64 bswap lowering used when hasPTX<71> holds
3901+ (i64 (bswap i64:$a)),
3902+ (V2I32toI64 // recombine the two permuted 32-bit values into an i64
3903+ (INT_NVVM_PRMT (I64toI32H_ptx71 Int64Regs:$a), (i32 0), (i32 0x0123)), // first operand comes from the HIGH half of $a
3904+ (INT_NVVM_PRMT (I64toI32L_ptx71 Int64Regs:$a), (i32 0), (i32 0x0123)))>, // second operand comes from the LOW half; NOTE(review): selector 0x0123 presumably reverses the bytes of each half - confirm against the PTX prmt spec
3905+ Requires<[hasPTX<71>]>;
3906+
3907+ // Fall back to the bswap lowering without the _ptx71 instructions if we don't have PTX 7.1.
38693908def : Pat <
38703909 (i64 (bswap i64:$a)),
38713910 (V2I32toI64
0 commit comments