@@ -455,64 +455,42 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
455455; PTX-NEXT: .local .align 8 .b8 __local_depot9[8];
456456; PTX-NEXT: .reg .b64 %SP;
457457; PTX-NEXT: .reg .b64 %SPL;
458- ; PTX-NEXT: .reg .b32 %r<3>;
459- ; PTX-NEXT: .reg .b64 %rd<47>;
458+ ; PTX-NEXT: .reg .b64 %rd<30>;
460459; PTX-EMPTY:
461460; PTX-NEXT: // %bb.0: // %entry
462461; PTX-NEXT: mov.b64 %SPL, __local_depot9;
463462; PTX-NEXT: cvta.local.u64 %SP, %SPL;
464463; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0];
465- ; PTX-NEXT: add.u64 %rd2, %SPL, 0;
466- ; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4];
467- ; PTX-NEXT: st.local.b32 [%rd2+4], %r1;
468- ; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1];
469- ; PTX-NEXT: st.local.b32 [%rd2], %r2;
470- ; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1];
471- ; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1];
472- ; PTX-NEXT: shl.b64 %rd5, %rd4, 8;
473- ; PTX-NEXT: or.b64 %rd6, %rd5, %rd3;
474- ; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2];
475- ; PTX-NEXT: shl.b64 %rd8, %rd7, 16;
476- ; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3];
477- ; PTX-NEXT: shl.b64 %rd10, %rd9, 24;
478- ; PTX-NEXT: or.b64 %rd11, %rd10, %rd8;
479- ; PTX-NEXT: or.b64 %rd12, %rd11, %rd6;
480- ; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4];
481- ; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5];
482- ; PTX-NEXT: shl.b64 %rd15, %rd14, 8;
483- ; PTX-NEXT: or.b64 %rd16, %rd15, %rd13;
484- ; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6];
485- ; PTX-NEXT: shl.b64 %rd18, %rd17, 16;
486- ; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7];
487- ; PTX-NEXT: shl.b64 %rd20, %rd19, 24;
488- ; PTX-NEXT: or.b64 %rd21, %rd20, %rd18;
489- ; PTX-NEXT: or.b64 %rd22, %rd21, %rd16;
490- ; PTX-NEXT: shl.b64 %rd23, %rd22, 32;
491- ; PTX-NEXT: or.b64 %rd24, %rd23, %rd12;
492- ; PTX-NEXT: st.volatile.b64 [%SP], %rd24;
493- ; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8];
494- ; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9];
495- ; PTX-NEXT: shl.b64 %rd27, %rd26, 8;
496- ; PTX-NEXT: or.b64 %rd28, %rd27, %rd25;
497- ; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10];
498- ; PTX-NEXT: shl.b64 %rd30, %rd29, 16;
499- ; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11];
500- ; PTX-NEXT: shl.b64 %rd32, %rd31, 24;
501- ; PTX-NEXT: or.b64 %rd33, %rd32, %rd30;
502- ; PTX-NEXT: or.b64 %rd34, %rd33, %rd28;
503- ; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12];
504- ; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13];
505- ; PTX-NEXT: shl.b64 %rd37, %rd36, 8;
506- ; PTX-NEXT: or.b64 %rd38, %rd37, %rd35;
507- ; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14];
508- ; PTX-NEXT: shl.b64 %rd40, %rd39, 16;
509- ; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15];
510- ; PTX-NEXT: shl.b64 %rd42, %rd41, 24;
511- ; PTX-NEXT: or.b64 %rd43, %rd42, %rd40;
512- ; PTX-NEXT: or.b64 %rd44, %rd43, %rd38;
513- ; PTX-NEXT: shl.b64 %rd45, %rd44, 32;
514- ; PTX-NEXT: or.b64 %rd46, %rd45, %rd34;
515- ; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46;
464+ ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
465+ ; PTX-NEXT: ld.param.b32 %rd3, [memcpy_to_param_param_1+4];
466+ ; PTX-NEXT: shl.b64 %rd4, %rd3, 32;
467+ ; PTX-NEXT: ld.param.b32 %rd5, [memcpy_to_param_param_1];
468+ ; PTX-NEXT: or.b64 %rd6, %rd4, %rd5;
469+ ; PTX-NEXT: st.b64 [%SP], %rd6;
470+ ; PTX-NEXT: ld.volatile.global.b8 %rd7, [%rd2];
471+ ; PTX-NEXT: ld.volatile.global.b8 %rd8, [%rd2+1];
472+ ; PTX-NEXT: shl.b64 %rd9, %rd8, 8;
473+ ; PTX-NEXT: or.b64 %rd10, %rd9, %rd7;
474+ ; PTX-NEXT: ld.volatile.global.b8 %rd11, [%rd2+2];
475+ ; PTX-NEXT: shl.b64 %rd12, %rd11, 16;
476+ ; PTX-NEXT: ld.volatile.global.b8 %rd13, [%rd2+3];
477+ ; PTX-NEXT: shl.b64 %rd14, %rd13, 24;
478+ ; PTX-NEXT: or.b64 %rd15, %rd14, %rd12;
479+ ; PTX-NEXT: or.b64 %rd16, %rd15, %rd10;
480+ ; PTX-NEXT: ld.volatile.global.b8 %rd17, [%rd2+4];
481+ ; PTX-NEXT: ld.volatile.global.b8 %rd18, [%rd2+5];
482+ ; PTX-NEXT: shl.b64 %rd19, %rd18, 8;
483+ ; PTX-NEXT: or.b64 %rd20, %rd19, %rd17;
484+ ; PTX-NEXT: ld.volatile.global.b8 %rd21, [%rd2+6];
485+ ; PTX-NEXT: shl.b64 %rd22, %rd21, 16;
486+ ; PTX-NEXT: ld.volatile.global.b8 %rd23, [%rd2+7];
487+ ; PTX-NEXT: shl.b64 %rd24, %rd23, 24;
488+ ; PTX-NEXT: or.b64 %rd25, %rd24, %rd22;
489+ ; PTX-NEXT: or.b64 %rd26, %rd25, %rd20;
490+ ; PTX-NEXT: shl.b64 %rd27, %rd26, 32;
491+ ; PTX-NEXT: or.b64 %rd28, %rd27, %rd16;
492+ ; PTX-NEXT: add.u64 %rd29, %SPL, 0;
493+ ; PTX-NEXT: st.local.b64 [%rd29], %rd28;
516494; PTX-NEXT: ret;
517495entry:
518496 tail call void @llvm.memcpy.p0.p0.i64 (ptr %s , ptr %in , i64 16 , i1 true )
0 commit comments