@@ -353,6 +353,76 @@ define void @global_2xf32_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) {
353353 ret void
354354}
355355
356+ ; Tests the lowering of a case taken from LoadStoreVectorizer/NVPTX/4x2xhalf.ll,
357+ ; where two <3 x half> accesses are chained together and extended to <8 x half>.
358+ define void @halfx3_extend_chain (ptr align 16 captures(none) %rd0 ) {
359+ ; CHECK-LABEL: halfx3_extend_chain(
360+ ; CHECK: {
361+ ; CHECK-NEXT: .reg .b16 %rs<13>;
362+ ; CHECK-NEXT: .reg .b32 %r<20>;
363+ ; CHECK-NEXT: .reg .b64 %rd<2>;
364+ ; CHECK-EMPTY:
365+ ; CHECK-NEXT: // %bb.0:
366+ ; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_extend_chain_param_0];
367+ ; CHECK-NEXT: .pragma "used_bytes_mask 0xfff";
368+ ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
369+ ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
370+ ; CHECK-NEXT: mov.b32 {_, %rs3}, %r2;
371+ ; CHECK-NEXT: mov.b32 %r5, {%rs3, %rs1};
372+ ; CHECK-NEXT: mov.b32 %r6, {%rs2, %rs4};
373+ ; CHECK-NEXT: mov.b32 %r7, 0;
374+ ; CHECK-NEXT: max.f16x2 %r8, %r2, %r7;
375+ ; CHECK-NEXT: max.f16x2 %r9, %r1, %r7;
376+ ; CHECK-NEXT: st.b32 [%rd1], %r9;
377+ ; CHECK-NEXT: mov.b32 {%rs5, _}, %r8;
378+ ; CHECK-NEXT: st.b16 [%rd1+4], %rs5;
379+ ; CHECK-NEXT: max.f16x2 %r10, %r6, %r7;
380+ ; CHECK-NEXT: max.f16x2 %r11, %r5, %r7;
381+ ; CHECK-NEXT: st.b32 [%rd1+6], %r11;
382+ ; CHECK-NEXT: mov.b32 {%rs6, _}, %r10;
383+ ; CHECK-NEXT: st.b16 [%rd1+10], %rs6;
384+ ; CHECK-NEXT: ld.b16 %rs7, [%rd1+16];
385+ ; CHECK-NEXT: mov.b32 %r12, {%rs7, %rs8};
386+ ; CHECK-NEXT: ld.b32 %r13, [%rd1+12];
387+ ; CHECK-NEXT: max.f16x2 %r14, %r12, %r7;
388+ ; CHECK-NEXT: max.f16x2 %r15, %r13, %r7;
389+ ; CHECK-NEXT: st.b32 [%rd1+12], %r15;
390+ ; CHECK-NEXT: mov.b32 {%rs9, _}, %r14;
391+ ; CHECK-NEXT: st.b16 [%rd1+16], %rs9;
392+ ; CHECK-NEXT: ld.b16 %rs10, [%rd1+22];
393+ ; CHECK-NEXT: mov.b32 %r16, {%rs10, %rs11};
394+ ; CHECK-NEXT: ld.b32 %r17, [%rd1+18];
395+ ; CHECK-NEXT: max.f16x2 %r18, %r16, %r7;
396+ ; CHECK-NEXT: max.f16x2 %r19, %r17, %r7;
397+ ; CHECK-NEXT: st.b32 [%rd1+18], %r19;
398+ ; CHECK-NEXT: mov.b32 {%rs12, _}, %r18;
399+ ; CHECK-NEXT: st.b16 [%rd1+22], %rs12;
400+ ; CHECK-NEXT: ret;
; Masked <8 x half> load from %rd0 with lanes 0-5 enabled and lanes 6-7 masked
; off; the passthrough operand is poison.
401+ %1 = call <8 x half > @llvm.masked.load.v8f16.p0 (ptr align 16 %rd0 , <8 x i1 > <i1 true , i1 true , i1 true , i1 true , i1 true , i1 true , i1 false , i1 false >, <8 x half > poison)
; Split the six enabled lanes into two <3 x half> values (lanes 0-2 and 3-5).
402+ %load13 = shufflevector <8 x half > %1 , <8 x half > poison, <3 x i32 > <i32 0 , i32 1 , i32 2 >
403+ %load24 = shufflevector <8 x half > %1 , <8 x half > poison, <3 x i32 > <i32 3 , i32 4 , i32 5 >
; Lanes 6 and 7 are the masked-off lanes; neither extracted value is used
; below. NOTE(review): presumably kept to mirror the vectorizer's "extended"
; output shape -- confirm against 4x2xhalf.ll.
404+ %Extend5 = extractelement <8 x half > %1 , i32 6
405+ %Extend26 = extractelement <8 x half > %1 , i32 7
; First chunk: per-lane select of x when x > 0 (ordered compare), else 0.
; The result is stored back at %rd0.
406+ %p1 = fcmp ogt <3 x half > %load13 , zeroinitializer
407+ %s1 = select <3 x i1 > %p1 , <3 x half > %load13 , <3 x half > zeroinitializer
408+ store <3 x half > %s1 , ptr %rd0 , align 16
; Second chunk: same compare/select, stored 3 halfs past %rd0.
409+ %in2 = getelementptr half , ptr %rd0 , i64 3
410+ %p2 = fcmp ogt <3 x half > %load24 , zeroinitializer
411+ %s2 = select <3 x i1 > %p2 , <3 x half > %load24 , <3 x half > zeroinitializer
412+ store <3 x half > %s2 , ptr %in2 , align 4
; Third chunk (6 halfs past %rd0): a plain <3 x half> load rather than part of
; the masked load, followed by the same compare/select and an in-place store.
413+ %in3 = getelementptr half , ptr %rd0 , i64 6
414+ %load3 = load <3 x half >, ptr %in3 , align 4
415+ %p3 = fcmp ogt <3 x half > %load3 , zeroinitializer
416+ %s3 = select <3 x i1 > %p3 , <3 x half > %load3 , <3 x half > zeroinitializer
417+ store <3 x half > %s3 , ptr %in3 , align 4
; Fourth chunk (9 halfs past %rd0): same load/compare/select/store pattern as
; the third chunk.
418+ %in4 = getelementptr half , ptr %rd0 , i64 9
419+ %load4 = load <3 x half >, ptr %in4 , align 4
420+ %p4 = fcmp ogt <3 x half > %load4 , zeroinitializer
421+ %s4 = select <3 x i1 > %p4 , <3 x half > %load4 , <3 x half > zeroinitializer
422+ store <3 x half > %s4 , ptr %in4 , align 4
423+ ret void
424+ }
425+
356426declare <8 x i32 > @llvm.masked.load.v8i32.p1 (ptr addrspace (1 ), <8 x i1 >, <8 x i32 >)
357427declare void @llvm.masked.store.v8i32.p1 (<8 x i32 >, ptr addrspace (1 ), <8 x i1 >)
358428declare <16 x i16 > @llvm.masked.load.v16i16.p1 (ptr addrspace (1 ), <16 x i1 >, <16 x i16 >)
@@ -363,4 +433,5 @@ declare <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1), <4 x i1>, <4 x i8>)
363433declare void @llvm.masked.store.v4i8.p1 (<4 x i8 >, ptr addrspace (1 ), <4 x i1 >)
364434declare <2 x float > @llvm.masked.load.v2f32.p1 (ptr addrspace (1 ), <2 x i1 >, <2 x float >)
365435declare void @llvm.masked.store.v2f32.p1 (<2 x float >, ptr addrspace (1 ), <2 x i1 >)
; Masked load of <8 x half> through a default-address-space (p0) pointer;
; called by @halfx3_extend_chain.
436+ declare <8 x half > @llvm.masked.load.v8f16.p0 (ptr captures(none), <8 x i1 >, <8 x half >)
366437!0 = !{}
0 commit comments