Skip to content

Commit 9bb4ef8

Browse files
committed
[NVPTX] Fixup ISel patterns for setcc of i8 extract
1 parent 59c3fe6 commit 9bb4ef8

File tree

2 files changed

+132
-112
lines changed

2 files changed

+132
-112
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 32 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,20 +1516,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN
15161516

15171517

15181518
// Byte extraction via shift/trunc/sext
1519-
def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
1520-
(CVT_s8_s32 $s, CvtNONE)>;
1521-
def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
1519+
def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>;
1520+
def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>;
1521+
1522+
def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>;
1523+
def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>;
1524+
1525+
def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
15221526
(CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
1523-
def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8),
1524-
(BFE_S32rii $s, imm:$o, 8)>;
1527+
def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
1528+
(CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
1529+
15251530
def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
15261531
(CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
1527-
def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8),
1528-
(BFE_S64rii $s, imm:$o, 8)>;
1529-
def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
1530-
(CVT_s8_s64 $s, CvtNONE)>;
1531-
def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
1532-
(CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
15331532

15341533
//-----------------------------------
15351534
// Comparison instructions (setp, set)
@@ -1713,45 +1712,34 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>;
17131712
//-----------------------------------
17141713
// Comparison and Selection
17151714
//-----------------------------------
1715+
// TODO: These patterns seem very specific and brittle. We should try to find
1716+
// a more general solution.
17161717

17171718
def cond_signed : PatLeaf<(cond), [{
17181719
return isSignedIntSetCC(N->get());
17191720
}]>;
17201721

1721-
def cond_not_signed : PatLeaf<(cond), [{
1722-
return !isSignedIntSetCC(N->get());
1723-
}]>;
1722+
// A 16-bit signed comparison of sign-extended byte extracts can be converted
1723+
// to 32-bit comparison if we change the PRMT to sign-extend the extracted
1724+
// bytes.
1725+
def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
1726+
(i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
1727+
cond_signed:$cc),
1728+
(SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
1729+
(PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
1730+
(cond2cc $cc))>;
1731+
1732+
// A 16-bit comparison of truncated byte extracts can be converted to 32-bit
1733+
// comparison because we know that the truncate is just truncating off zeros
1734+
// and that the most-significant byte is also zero so the meaning of signed and
1735+
// unsigned comparisons will not be changed.
1736+
def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
1737+
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
1738+
cond:$cc),
1739+
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1740+
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1741+
(cond2cc $cc))>;
17241742

1725-
// comparisons of i8 extracted with PRMT as i32
1726-
// It's faster to do comparison directly on i32 extracted by PRMT,
1727-
// instead of the long conversion and sign extending.
1728-
def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)),
1729-
(i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)),
1730-
cond_signed:$cc),
1731-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1732-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1733-
(cond2cc $cc))>;
1734-
1735-
def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
1736-
(i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
1737-
cond_signed:$cc),
1738-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1739-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1740-
(cond2cc $cc))>;
1741-
1742-
def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
1743-
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
1744-
cond_signed:$cc),
1745-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1746-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1747-
(cond2cc $cc))>;
1748-
1749-
def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
1750-
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
1751-
cond_not_signed:$cc),
1752-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1753-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1754-
(cond2cc $cc))>;
17551743

17561744
def SDTDeclareArrayParam :
17571745
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;

llvm/test/CodeGen/NVPTX/i8x4-instructions.ll

Lines changed: 100 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -343,61 +343,77 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
343343
; O0-LABEL: test_smax(
344344
; O0: {
345345
; O0-NEXT: .reg .pred %p<5>;
346-
; O0-NEXT: .reg .b32 %r<18>;
346+
; O0-NEXT: .reg .b32 %r<26>;
347347
; O0-EMPTY:
348348
; O0-NEXT: // %bb.0:
349349
; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1];
350350
; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0];
351-
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
352-
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
351+
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
352+
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
353353
; O0-NEXT: setp.gt.s32 %p1, %r4, %r3;
354-
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
355-
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
354+
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
355+
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
356356
; O0-NEXT: setp.gt.s32 %p2, %r6, %r5;
357-
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
358-
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
357+
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
358+
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
359359
; O0-NEXT: setp.gt.s32 %p3, %r8, %r7;
360-
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
361-
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
360+
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
361+
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
362362
; O0-NEXT: setp.gt.s32 %p4, %r10, %r9;
363-
; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
364-
; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
365-
; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
366-
; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
367-
; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
368-
; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
369-
; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
370-
; O0-NEXT: st.param.b32 [func_retval0], %r17;
363+
; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
364+
; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
365+
; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
366+
; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
367+
; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
368+
; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
369+
; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
370+
; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
371+
; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
372+
; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
373+
; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
374+
; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
375+
; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
376+
; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
377+
; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
378+
; O0-NEXT: st.param.b32 [func_retval0], %r25;
371379
; O0-NEXT: ret;
372380
;
373381
; O3-LABEL: test_smax(
374382
; O3: {
375383
; O3-NEXT: .reg .pred %p<5>;
376-
; O3-NEXT: .reg .b32 %r<18>;
384+
; O3-NEXT: .reg .b32 %r<26>;
377385
; O3-EMPTY:
378386
; O3-NEXT: // %bb.0:
379387
; O3-NEXT: ld.param.b32 %r1, [test_smax_param_0];
380388
; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1];
381-
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
382-
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
389+
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
390+
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
383391
; O3-NEXT: setp.gt.s32 %p1, %r4, %r3;
384-
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
385-
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
392+
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
393+
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
386394
; O3-NEXT: setp.gt.s32 %p2, %r6, %r5;
387-
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
388-
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
395+
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
396+
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
389397
; O3-NEXT: setp.gt.s32 %p3, %r8, %r7;
390-
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
391-
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
398+
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
399+
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
392400
; O3-NEXT: setp.gt.s32 %p4, %r10, %r9;
393-
; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
394-
; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
395-
; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
396-
; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
397-
; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
398-
; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
399-
; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
400-
; O3-NEXT: st.param.b32 [func_retval0], %r17;
401+
; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
402+
; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
403+
; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
404+
; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
405+
; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
406+
; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
407+
; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
408+
; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
409+
; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
410+
; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
411+
; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
412+
; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
413+
; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
414+
; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
415+
; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
416+
; O3-NEXT: st.param.b32 [func_retval0], %r25;
401417
; O3-NEXT: ret;
402418
%cmp = icmp sgt <4 x i8> %a, %b
403419
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -473,61 +489,77 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
473489
; O0-LABEL: test_smin(
474490
; O0: {
475491
; O0-NEXT: .reg .pred %p<5>;
476-
; O0-NEXT: .reg .b32 %r<18>;
492+
; O0-NEXT: .reg .b32 %r<26>;
477493
; O0-EMPTY:
478494
; O0-NEXT: // %bb.0:
479495
; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1];
480496
; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0];
481-
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
482-
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
497+
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
498+
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
483499
; O0-NEXT: setp.le.s32 %p1, %r4, %r3;
484-
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
485-
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
500+
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
501+
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
486502
; O0-NEXT: setp.le.s32 %p2, %r6, %r5;
487-
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
488-
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
503+
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
504+
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
489505
; O0-NEXT: setp.le.s32 %p3, %r8, %r7;
490-
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
491-
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
506+
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
507+
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
492508
; O0-NEXT: setp.le.s32 %p4, %r10, %r9;
493-
; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
494-
; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
495-
; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
496-
; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
497-
; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
498-
; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
499-
; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
500-
; O0-NEXT: st.param.b32 [func_retval0], %r17;
509+
; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
510+
; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
511+
; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
512+
; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
513+
; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
514+
; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
515+
; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
516+
; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
517+
; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
518+
; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
519+
; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
520+
; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
521+
; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
522+
; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
523+
; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
524+
; O0-NEXT: st.param.b32 [func_retval0], %r25;
501525
; O0-NEXT: ret;
502526
;
503527
; O3-LABEL: test_smin(
504528
; O3: {
505529
; O3-NEXT: .reg .pred %p<5>;
506-
; O3-NEXT: .reg .b32 %r<18>;
530+
; O3-NEXT: .reg .b32 %r<26>;
507531
; O3-EMPTY:
508532
; O3-NEXT: // %bb.0:
509533
; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0];
510534
; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1];
511-
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
512-
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
535+
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
536+
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
513537
; O3-NEXT: setp.le.s32 %p1, %r4, %r3;
514-
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
515-
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
538+
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
539+
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
516540
; O3-NEXT: setp.le.s32 %p2, %r6, %r5;
517-
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
518-
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
541+
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
542+
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
519543
; O3-NEXT: setp.le.s32 %p3, %r8, %r7;
520-
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
521-
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
544+
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
545+
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
522546
; O3-NEXT: setp.le.s32 %p4, %r10, %r9;
523-
; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
524-
; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
525-
; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
526-
; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
527-
; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
528-
; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
529-
; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
530-
; O3-NEXT: st.param.b32 [func_retval0], %r17;
547+
; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
548+
; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
549+
; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
550+
; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
551+
; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
552+
; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
553+
; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
554+
; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
555+
; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
556+
; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
557+
; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
558+
; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
559+
; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
560+
; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
561+
; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
562+
; O3-NEXT: st.param.b32 [func_retval0], %r25;
531563
; O3-NEXT: ret;
532564
%cmp = icmp sle <4 x i8> %a, %b
533565
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b

0 commit comments

Comments
 (0)