@@ -1341,3 +1341,72 @@ entry:
   %avg = ashr <vscale x 2 x i64> %add, splat (i64 1)
   ret <vscale x 2 x i64> %avg
 }
+
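+; Unsigned flooring average: (a + b) >> 1 computed without widening, via the
+; identity (a & b) + ((a ^ b) >> 1). Plain SVE keeps the EOR/AND/LSR/ADD
+; expansion; SVE2 folds the whole pattern into a single UHADD.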
+define void @zext_mload_avgflooru(ptr %p1, ptr %p2, <vscale x 8 x i1> %mask) {
+; SVE-LABEL: zext_mload_avgflooru:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE-NEXT:    eor z2.d, z0.d, z1.d
+; SVE-NEXT:    and z0.d, z0.d, z1.d
+; SVE-NEXT:    lsr z1.h, z2.h, #1
+; SVE-NEXT:    add z0.h, z0.h, z1.h
+; SVE-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: zext_mload_avgflooru:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE2-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE2-NEXT:    ptrue p1.h
+; SVE2-NEXT:    uhadd z0.h, p1/m, z0.h, z1.h
+; SVE2-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE2-NEXT:    ret
+  %ld1 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p1, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %ld2 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p2, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %and = and <vscale x 8 x i8> %ld1, %ld2
+  %xor = xor <vscale x 8 x i8> %ld1, %ld2
+  %shift = lshr <vscale x 8 x i8> %xor, splat(i8 1)
+  %avg = add <vscale x 8 x i8> %and, %shift
+  %avgext = zext <vscale x 8 x i8> %avg to <vscale x 8 x i16>
+  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %avgext, ptr %p1, i32 16, <vscale x 8 x i1> %mask)
+  ret void
+}
+
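+; Unsigned ceiling (rounding) average: (a + b + 1) >> 1, written with an
+; explicit zext/trunc. The backend narrows it via the identity
+; (a | b) - ((a ^ b) >> 1); SVE2 folds the pattern into a single URHADD.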
+define void @zext_mload_avgceilu(ptr %p1, ptr %p2, <vscale x 8 x i1> %mask) {
+; SVE-LABEL: zext_mload_avgceilu:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE-NEXT:    eor z2.d, z0.d, z1.d
+; SVE-NEXT:    orr z0.d, z0.d, z1.d
+; SVE-NEXT:    lsr z1.h, z2.h, #1
+; SVE-NEXT:    sub z0.h, z0.h, z1.h
+; SVE-NEXT:    st1b { z0.h }, p0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: zext_mload_avgceilu:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE2-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE2-NEXT:    ptrue p1.h
+; SVE2-NEXT:    urhadd z0.h, p1/m, z0.h, z1.h
+; SVE2-NEXT:    st1b { z0.h }, p0, [x0]
+; SVE2-NEXT:    ret
+  %ld1 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p1, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %ld2 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p2, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %zext1 = zext <vscale x 8 x i8> %ld1 to <vscale x 8 x i16>
+  %zext2 = zext <vscale x 8 x i8> %ld2 to <vscale x 8 x i16>
+  %add1 = add nuw nsw <vscale x 8 x i16> %zext1, splat(i16 1)
+  %add2 = add nuw nsw <vscale x 8 x i16> %add1, %zext2
+  %shift = lshr <vscale x 8 x i16> %add2, splat(i16 1)
+  %trunc = trunc <vscale x 8 x i16> %shift to <vscale x 8 x i8>
+  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, ptr %p1, i32 16, <vscale x 8 x i1> %mask)
+  ret void
+}