@@ -455,42 +455,64 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
455455; PTX-NEXT: .local .align 8 .b8 __local_depot9[8];
456456; PTX-NEXT: .reg .b64 %SP;
457457; PTX-NEXT: .reg .b64 %SPL;
458- ; PTX-NEXT: .reg .b64 %rd<30>;
458+ ; PTX-NEXT: .reg .b32 %r<3>;
459+ ; PTX-NEXT: .reg .b64 %rd<47>;
459460; PTX-EMPTY:
460461; PTX-NEXT: // %bb.0: // %entry
461462; PTX-NEXT: mov.b64 %SPL, __local_depot9;
462463; PTX-NEXT: cvta.local.u64 %SP, %SPL;
463464; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0];
464- ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
465- ; PTX-NEXT: ld.param.b32 %rd3, [memcpy_to_param_param_1+4];
466- ; PTX-NEXT: shl.b64 %rd4, %rd3, 32;
467- ; PTX-NEXT: ld.param.b32 %rd5, [memcpy_to_param_param_1];
468- ; PTX-NEXT: or.b64 %rd6, %rd4, %rd5;
469- ; PTX-NEXT: st.b64 [%SP], %rd6;
470- ; PTX-NEXT: ld.volatile.global.b8 %rd7, [%rd2];
471- ; PTX-NEXT: ld.volatile.global.b8 %rd8, [%rd2+1];
472- ; PTX-NEXT: shl.b64 %rd9, %rd8, 8;
473- ; PTX-NEXT: or.b64 %rd10, %rd9, %rd7;
474- ; PTX-NEXT: ld.volatile.global.b8 %rd11, [%rd2+2];
475- ; PTX-NEXT: shl.b64 %rd12, %rd11, 16;
476- ; PTX-NEXT: ld.volatile.global.b8 %rd13, [%rd2+3];
477- ; PTX-NEXT: shl.b64 %rd14, %rd13, 24;
478- ; PTX-NEXT: or.b64 %rd15, %rd14, %rd12;
479- ; PTX-NEXT: or.b64 %rd16, %rd15, %rd10;
480- ; PTX-NEXT: ld.volatile.global.b8 %rd17, [%rd2+4];
481- ; PTX-NEXT: ld.volatile.global.b8 %rd18, [%rd2+5];
482- ; PTX-NEXT: shl.b64 %rd19, %rd18, 8;
483- ; PTX-NEXT: or.b64 %rd20, %rd19, %rd17;
484- ; PTX-NEXT: ld.volatile.global.b8 %rd21, [%rd2+6];
485- ; PTX-NEXT: shl.b64 %rd22, %rd21, 16;
486- ; PTX-NEXT: ld.volatile.global.b8 %rd23, [%rd2+7];
487- ; PTX-NEXT: shl.b64 %rd24, %rd23, 24;
488- ; PTX-NEXT: or.b64 %rd25, %rd24, %rd22;
489- ; PTX-NEXT: or.b64 %rd26, %rd25, %rd20;
490- ; PTX-NEXT: shl.b64 %rd27, %rd26, 32;
491- ; PTX-NEXT: or.b64 %rd28, %rd27, %rd16;
492- ; PTX-NEXT: add.u64 %rd29, %SPL, 0;
493- ; PTX-NEXT: st.local.b64 [%rd29], %rd28;
465+ ; PTX-NEXT: add.u64 %rd2, %SPL, 0;
466+ ; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4];
467+ ; PTX-NEXT: st.local.b32 [%rd2+4], %r1;
468+ ; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1];
469+ ; PTX-NEXT: st.local.b32 [%rd2], %r2;
470+ ; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1];
471+ ; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1];
472+ ; PTX-NEXT: shl.b64 %rd5, %rd4, 8;
473+ ; PTX-NEXT: or.b64 %rd6, %rd5, %rd3;
474+ ; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2];
475+ ; PTX-NEXT: shl.b64 %rd8, %rd7, 16;
476+ ; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3];
477+ ; PTX-NEXT: shl.b64 %rd10, %rd9, 24;
478+ ; PTX-NEXT: or.b64 %rd11, %rd10, %rd8;
479+ ; PTX-NEXT: or.b64 %rd12, %rd11, %rd6;
480+ ; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4];
481+ ; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5];
482+ ; PTX-NEXT: shl.b64 %rd15, %rd14, 8;
483+ ; PTX-NEXT: or.b64 %rd16, %rd15, %rd13;
484+ ; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6];
485+ ; PTX-NEXT: shl.b64 %rd18, %rd17, 16;
486+ ; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7];
487+ ; PTX-NEXT: shl.b64 %rd20, %rd19, 24;
488+ ; PTX-NEXT: or.b64 %rd21, %rd20, %rd18;
489+ ; PTX-NEXT: or.b64 %rd22, %rd21, %rd16;
490+ ; PTX-NEXT: shl.b64 %rd23, %rd22, 32;
491+ ; PTX-NEXT: or.b64 %rd24, %rd23, %rd12;
492+ ; PTX-NEXT: st.volatile.b64 [%SP], %rd24;
493+ ; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8];
494+ ; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9];
495+ ; PTX-NEXT: shl.b64 %rd27, %rd26, 8;
496+ ; PTX-NEXT: or.b64 %rd28, %rd27, %rd25;
497+ ; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10];
498+ ; PTX-NEXT: shl.b64 %rd30, %rd29, 16;
499+ ; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11];
500+ ; PTX-NEXT: shl.b64 %rd32, %rd31, 24;
501+ ; PTX-NEXT: or.b64 %rd33, %rd32, %rd30;
502+ ; PTX-NEXT: or.b64 %rd34, %rd33, %rd28;
503+ ; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12];
504+ ; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13];
505+ ; PTX-NEXT: shl.b64 %rd37, %rd36, 8;
506+ ; PTX-NEXT: or.b64 %rd38, %rd37, %rd35;
507+ ; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14];
508+ ; PTX-NEXT: shl.b64 %rd40, %rd39, 16;
509+ ; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15];
510+ ; PTX-NEXT: shl.b64 %rd42, %rd41, 24;
511+ ; PTX-NEXT: or.b64 %rd43, %rd42, %rd40;
512+ ; PTX-NEXT: or.b64 %rd44, %rd43, %rd38;
513+ ; PTX-NEXT: shl.b64 %rd45, %rd44, 32;
514+ ; PTX-NEXT: or.b64 %rd46, %rd45, %rd34;
515+ ; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46;
494516; PTX-NEXT: ret;
495517entry:
496518 tail call void @llvm.memcpy.p0.p0.i64 (ptr %s , ptr %in , i64 16 , i1 true )
@@ -562,25 +584,44 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
562584; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
563585; COPY-NEXT: ret void
564586;
565- ; PTX-LABEL: test_select(
566- ; PTX: {
567- ; PTX-NEXT: .reg .pred %p<2>;
568- ; PTX-NEXT: .reg .b16 %rs<3>;
569- ; PTX-NEXT: .reg .b32 %r<2>;
570- ; PTX-NEXT: .reg .b64 %rd<6>;
571- ; PTX-EMPTY:
572- ; PTX-NEXT: // %bb.0: // %bb
573- ; PTX-NEXT: ld.param.b8 %rs1, [test_select_param_3];
574- ; PTX-NEXT: and.b16 %rs2, %rs1, 1;
575- ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
576- ; PTX-NEXT: mov.b64 %rd1, test_select_param_0;
577- ; PTX-NEXT: ld.param.b64 %rd2, [test_select_param_2];
578- ; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
579- ; PTX-NEXT: mov.b64 %rd4, test_select_param_1;
580- ; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
581- ; PTX-NEXT: ld.param.b32 %r1, [%rd5];
582- ; PTX-NEXT: st.global.b32 [%rd3], %r1;
583- ; PTX-NEXT: ret;
587+ ; PTX_60-LABEL: test_select(
588+ ; PTX_60: {
589+ ; PTX_60-NEXT: .reg .pred %p<2>;
590+ ; PTX_60-NEXT: .reg .b16 %rs<3>;
591+ ; PTX_60-NEXT: .reg .b32 %r<4>;
592+ ; PTX_60-NEXT: .reg .b64 %rd<3>;
593+ ; PTX_60-EMPTY:
594+ ; PTX_60-NEXT: // %bb.0: // %bb
595+ ; PTX_60-NEXT: ld.param.b8 %rs1, [test_select_param_3];
596+ ; PTX_60-NEXT: and.b16 %rs2, %rs1, 1;
597+ ; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0;
598+ ; PTX_60-NEXT: ld.param.b64 %rd1, [test_select_param_2];
599+ ; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1;
600+ ; PTX_60-NEXT: ld.param.b32 %r1, [test_select_param_1];
601+ ; PTX_60-NEXT: ld.param.b32 %r2, [test_select_param_0];
602+ ; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1;
603+ ; PTX_60-NEXT: st.global.b32 [%rd2], %r3;
604+ ; PTX_60-NEXT: ret;
605+ ;
606+ ; PTX_70-LABEL: test_select(
607+ ; PTX_70: {
608+ ; PTX_70-NEXT: .reg .pred %p<2>;
609+ ; PTX_70-NEXT: .reg .b16 %rs<3>;
610+ ; PTX_70-NEXT: .reg .b32 %r<2>;
611+ ; PTX_70-NEXT: .reg .b64 %rd<6>;
612+ ; PTX_70-EMPTY:
613+ ; PTX_70-NEXT: // %bb.0: // %bb
614+ ; PTX_70-NEXT: ld.param.b8 %rs1, [test_select_param_3];
615+ ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
616+ ; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0;
617+ ; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0;
618+ ; PTX_70-NEXT: ld.param.b64 %rd2, [test_select_param_2];
619+ ; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2;
620+ ; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1;
621+ ; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
622+ ; PTX_70-NEXT: ld.param.b32 %r1, [%rd5];
623+ ; PTX_70-NEXT: st.global.b32 [%rd3], %r1;
624+ ; PTX_70-NEXT: ret;
584625bb:
585626 %ptrnew = select i1 %cond , ptr %input1 , ptr %input2
586627 %valloaded = load i32 , ptr %ptrnew , align 4
0 commit comments