Skip to content

Commit 8d30c3c

Browse files
AlexMacleankrishna2803
authored and committed
[NVPTX] Fixup ISel patterns for setcc of i8 extract (llvm#151204)
Fix a correctness bug in ISel lowering patterns for setcc of v4i8 extraction. Refactor and clean up these patterns somewhat in general to try to make them a bit more comprehensible.
1 parent 14fb1d4 commit 8d30c3c

File tree

2 files changed

+132
-112
lines changed

2 files changed

+132
-112
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 32 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1460,20 +1460,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN
14601460

14611461

14621462
// Byte extraction via shift/trunc/sext
1463-
def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
1464-
(CVT_s8_s32 $s, CvtNONE)>;
1465-
def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
1463+
def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>;
1464+
def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>;
1465+
1466+
def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>;
1467+
def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>;
1468+
1469+
def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
14661470
(CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
1467-
def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8),
1468-
(BFE_S32rii $s, imm:$o, 8)>;
1471+
def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
1472+
(CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
1473+
14691474
def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
14701475
(CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
1471-
def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8),
1472-
(BFE_S64rii $s, imm:$o, 8)>;
1473-
def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
1474-
(CVT_s8_s64 $s, CvtNONE)>;
1475-
def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
1476-
(CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
14771476

14781477
//-----------------------------------
14791478
// Comparison instructions (setp, set)
@@ -1657,45 +1656,34 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>;
16571656
//-----------------------------------
16581657
// Comparison and Selection
16591658
//-----------------------------------
1659+
// TODO: These patterns seem very specific and brittle. We should try to find
1660+
// a more general solution.
16601661

16611662
def cond_signed : PatLeaf<(cond), [{
16621663
return isSignedIntSetCC(N->get());
16631664
}]>;
16641665

1665-
def cond_not_signed : PatLeaf<(cond), [{
1666-
return !isSignedIntSetCC(N->get());
1667-
}]>;
1666+
// A 16-bit signed comparison of sign-extended byte extracts can be converted
1667+
// to 32-bit comparison if we change the PRMT to sign-extend the extracted
1668+
// bytes.
1669+
def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
1670+
(i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
1671+
cond_signed:$cc),
1672+
(SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
1673+
(PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
1674+
(cond2cc $cc))>;
1675+
1676+
// A 16-bit comparison of truncated byte extracts can be converted to 32-bit
1677+
// comparison because we know that the truncate is just truncating off zeros
1678+
// and that the most-significant byte is also zeros so the meaning of signed and
1679+
// unsigned comparisons will not be changed.
1680+
def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
1681+
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
1682+
cond:$cc),
1683+
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1684+
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1685+
(cond2cc $cc))>;
16681686

1669-
// comparisons of i8 extracted with PRMT as i32
1670-
// It's faster to do comparison directly on i32 extracted by PRMT,
1671-
// instead of the long conversion and sign extending.
1672-
def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)),
1673-
(i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)),
1674-
cond_signed:$cc),
1675-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1676-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1677-
(cond2cc $cc))>;
1678-
1679-
def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
1680-
(i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
1681-
cond_signed:$cc),
1682-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1683-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1684-
(cond2cc $cc))>;
1685-
1686-
def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
1687-
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
1688-
cond_signed:$cc),
1689-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1690-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1691-
(cond2cc $cc))>;
1692-
1693-
def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
1694-
(i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
1695-
cond_not_signed:$cc),
1696-
(SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
1697-
(PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
1698-
(cond2cc $cc))>;
16991687

17001688
def SDTDeclareArrayParam :
17011689
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;

llvm/test/CodeGen/NVPTX/i8x4-instructions.ll

Lines changed: 100 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -343,61 +343,77 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
343343
; O0-LABEL: test_smax(
344344
; O0: {
345345
; O0-NEXT: .reg .pred %p<5>;
346-
; O0-NEXT: .reg .b32 %r<18>;
346+
; O0-NEXT: .reg .b32 %r<26>;
347347
; O0-EMPTY:
348348
; O0-NEXT: // %bb.0:
349349
; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1];
350350
; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0];
351-
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
352-
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
351+
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
352+
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
353353
; O0-NEXT: setp.gt.s32 %p1, %r4, %r3;
354-
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
355-
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
354+
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
355+
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
356356
; O0-NEXT: setp.gt.s32 %p2, %r6, %r5;
357-
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
358-
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
357+
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
358+
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
359359
; O0-NEXT: setp.gt.s32 %p3, %r8, %r7;
360-
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
361-
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
360+
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
361+
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
362362
; O0-NEXT: setp.gt.s32 %p4, %r10, %r9;
363-
; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
364-
; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
365-
; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
366-
; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
367-
; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
368-
; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
369-
; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
370-
; O0-NEXT: st.param.b32 [func_retval0], %r17;
363+
; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
364+
; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
365+
; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
366+
; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
367+
; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
368+
; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
369+
; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
370+
; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
371+
; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
372+
; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
373+
; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
374+
; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
375+
; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
376+
; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
377+
; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
378+
; O0-NEXT: st.param.b32 [func_retval0], %r25;
371379
; O0-NEXT: ret;
372380
;
373381
; O3-LABEL: test_smax(
374382
; O3: {
375383
; O3-NEXT: .reg .pred %p<5>;
376-
; O3-NEXT: .reg .b32 %r<18>;
384+
; O3-NEXT: .reg .b32 %r<26>;
377385
; O3-EMPTY:
378386
; O3-NEXT: // %bb.0:
379387
; O3-NEXT: ld.param.b32 %r1, [test_smax_param_0];
380388
; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1];
381-
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
382-
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
389+
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
390+
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
383391
; O3-NEXT: setp.gt.s32 %p1, %r4, %r3;
384-
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
385-
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
392+
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
393+
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
386394
; O3-NEXT: setp.gt.s32 %p2, %r6, %r5;
387-
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
388-
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
395+
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
396+
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
389397
; O3-NEXT: setp.gt.s32 %p3, %r8, %r7;
390-
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
391-
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
398+
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
399+
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
392400
; O3-NEXT: setp.gt.s32 %p4, %r10, %r9;
393-
; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
394-
; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
395-
; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
396-
; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
397-
; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
398-
; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
399-
; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
400-
; O3-NEXT: st.param.b32 [func_retval0], %r17;
401+
; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
402+
; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
403+
; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
404+
; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
405+
; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
406+
; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
407+
; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
408+
; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
409+
; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
410+
; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
411+
; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
412+
; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
413+
; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
414+
; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
415+
; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
416+
; O3-NEXT: st.param.b32 [func_retval0], %r25;
401417
; O3-NEXT: ret;
402418
%cmp = icmp sgt <4 x i8> %a, %b
403419
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -473,61 +489,77 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
473489
; O0-LABEL: test_smin(
474490
; O0: {
475491
; O0-NEXT: .reg .pred %p<5>;
476-
; O0-NEXT: .reg .b32 %r<18>;
492+
; O0-NEXT: .reg .b32 %r<26>;
477493
; O0-EMPTY:
478494
; O0-NEXT: // %bb.0:
479495
; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1];
480496
; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0];
481-
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
482-
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
497+
; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
498+
; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
483499
; O0-NEXT: setp.le.s32 %p1, %r4, %r3;
484-
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
485-
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
500+
; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
501+
; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
486502
; O0-NEXT: setp.le.s32 %p2, %r6, %r5;
487-
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
488-
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
503+
; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
504+
; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
489505
; O0-NEXT: setp.le.s32 %p3, %r8, %r7;
490-
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
491-
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
506+
; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
507+
; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
492508
; O0-NEXT: setp.le.s32 %p4, %r10, %r9;
493-
; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
494-
; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
495-
; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
496-
; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
497-
; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
498-
; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
499-
; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
500-
; O0-NEXT: st.param.b32 [func_retval0], %r17;
509+
; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
510+
; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
511+
; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
512+
; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
513+
; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
514+
; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
515+
; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
516+
; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
517+
; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
518+
; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
519+
; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
520+
; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
521+
; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
522+
; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
523+
; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
524+
; O0-NEXT: st.param.b32 [func_retval0], %r25;
501525
; O0-NEXT: ret;
502526
;
503527
; O3-LABEL: test_smin(
504528
; O3: {
505529
; O3-NEXT: .reg .pred %p<5>;
506-
; O3-NEXT: .reg .b32 %r<18>;
530+
; O3-NEXT: .reg .b32 %r<26>;
507531
; O3-EMPTY:
508532
; O3-NEXT: // %bb.0:
509533
; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0];
510534
; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1];
511-
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
512-
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
535+
; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
536+
; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
513537
; O3-NEXT: setp.le.s32 %p1, %r4, %r3;
514-
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
515-
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
538+
; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
539+
; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
516540
; O3-NEXT: setp.le.s32 %p2, %r6, %r5;
517-
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
518-
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
541+
; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
542+
; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
519543
; O3-NEXT: setp.le.s32 %p3, %r8, %r7;
520-
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
521-
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
544+
; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
545+
; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
522546
; O3-NEXT: setp.le.s32 %p4, %r10, %r9;
523-
; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
524-
; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
525-
; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
526-
; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
527-
; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
528-
; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
529-
; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
530-
; O3-NEXT: st.param.b32 [func_retval0], %r17;
547+
; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
548+
; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
549+
; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
550+
; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
551+
; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
552+
; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
553+
; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
554+
; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
555+
; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
556+
; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
557+
; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
558+
; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
559+
; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
560+
; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
561+
; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
562+
; O3-NEXT: st.param.b32 [func_retval0], %r25;
531563
; O3-NEXT: ret;
532564
%cmp = icmp sle <4 x i8> %a, %b
533565
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b

0 commit comments

Comments
 (0)