Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 32 additions & 44 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1516,20 +1516,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN


// Byte extraction via shift/trunc/sext
def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
(CVT_s8_s32 $s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>;

def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>;
def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>;

def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
(CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8),
(BFE_S32rii $s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
(CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;

def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
(CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8),
(BFE_S64rii $s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
(CVT_s8_s64 $s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
(CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;

//-----------------------------------
// Comparison instructions (setp, set)
Expand Down Expand Up @@ -1713,45 +1712,34 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>;
//-----------------------------------
// Comparison and Selection
//-----------------------------------
// TODO: These patterns seem very specific and brittle. We should try to find
// a more general solution.

def cond_signed : PatLeaf<(cond), [{
return isSignedIntSetCC(N->get());
}]>;

def cond_not_signed : PatLeaf<(cond), [{
return !isSignedIntSetCC(N->get());
}]>;
// A 16-bit signed comparison of sign-extended byte extracts can be converted
// to 32-bit comparison if we change the PRMT to sign-extend the extracted
// bytes.
def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
(i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
cond_signed:$cc),
(SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
(PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
(cond2cc $cc))>;

// A 16-bit comparison of truncated byte extracts can be converted to a 32-bit
// comparison because we know that the truncate is just truncating off zeros
// and that the most-significant byte is also zeros so the meaning of signed and
// unsigned comparisons will not be changed.
def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
cond:$cc),
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
(cond2cc $cc))>;

// comparisons of i8 extracted with PRMT as i32
// It's faster to do comparison directly on i32 extracted by PRMT,
// instead of the long conversion and sign extending.
def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)),
(i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)),
cond_signed:$cc),
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
(cond2cc $cc))>;

def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
(i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
cond_signed:$cc),
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
(cond2cc $cc))>;

def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
cond_signed:$cc),
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
(cond2cc $cc))>;

def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
cond_not_signed:$cc),
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
(cond2cc $cc))>;

def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
Expand Down
168 changes: 100 additions & 68 deletions llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -343,61 +343,77 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
; O0-LABEL: test_smax(
; O0: {
; O0-NEXT: .reg .pred %p<5>;
; O0-NEXT: .reg .b32 %r<18>;
; O0-NEXT: .reg .b32 %r<26>;
; O0-EMPTY:
; O0-NEXT: // %bb.0:
; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1];
; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0];
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O0-NEXT: setp.gt.s32 %p1, %r4, %r3;
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O0-NEXT: setp.gt.s32 %p2, %r6, %r5;
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O0-NEXT: setp.gt.s32 %p3, %r8, %r7;
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O0-NEXT: setp.gt.s32 %p4, %r10, %r9;
; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
; O0-NEXT: st.param.b32 [func_retval0], %r17;
; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
; O0-NEXT: st.param.b32 [func_retval0], %r25;
; O0-NEXT: ret;
;
; O3-LABEL: test_smax(
; O3: {
; O3-NEXT: .reg .pred %p<5>;
; O3-NEXT: .reg .b32 %r<18>;
; O3-NEXT: .reg .b32 %r<26>;
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b32 %r1, [test_smax_param_0];
; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1];
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O3-NEXT: setp.gt.s32 %p1, %r4, %r3;
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O3-NEXT: setp.gt.s32 %p2, %r6, %r5;
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O3-NEXT: setp.gt.s32 %p3, %r8, %r7;
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O3-NEXT: setp.gt.s32 %p4, %r10, %r9;
; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
; O3-NEXT: st.param.b32 [func_retval0], %r17;
; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
; O3-NEXT: st.param.b32 [func_retval0], %r25;
; O3-NEXT: ret;
%cmp = icmp sgt <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
Expand Down Expand Up @@ -473,61 +489,77 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
; O0-LABEL: test_smin(
; O0: {
; O0-NEXT: .reg .pred %p<5>;
; O0-NEXT: .reg .b32 %r<18>;
; O0-NEXT: .reg .b32 %r<26>;
; O0-EMPTY:
; O0-NEXT: // %bb.0:
; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1];
; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0];
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O0-NEXT: setp.le.s32 %p1, %r4, %r3;
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O0-NEXT: setp.le.s32 %p2, %r6, %r5;
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O0-NEXT: setp.le.s32 %p3, %r8, %r7;
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O0-NEXT: setp.le.s32 %p4, %r10, %r9;
; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
; O0-NEXT: st.param.b32 [func_retval0], %r17;
; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
; O0-NEXT: st.param.b32 [func_retval0], %r25;
; O0-NEXT: ret;
;
; O3-LABEL: test_smin(
; O3: {
; O3-NEXT: .reg .pred %p<5>;
; O3-NEXT: .reg .b32 %r<18>;
; O3-NEXT: .reg .b32 %r<26>;
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0];
; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1];
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O3-NEXT: setp.le.s32 %p1, %r4, %r3;
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O3-NEXT: setp.le.s32 %p2, %r6, %r5;
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O3-NEXT: setp.le.s32 %p3, %r8, %r7;
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O3-NEXT: setp.le.s32 %p4, %r10, %r9;
; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
; O3-NEXT: st.param.b32 [func_retval0], %r17;
; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
; O3-NEXT: st.param.b32 [func_retval0], %r25;
; O3-NEXT: ret;
%cmp = icmp sle <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
Expand Down