@@ -1364,13 +1364,11 @@ define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; ZIP-LABEL: unzip2a_dual_v4i64:
 ; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v11, v9, v10
+; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT: vslideup.vi v9, v11, 2
+; ZIP-NEXT: vmv.v.v v8, v9
 ; ZIP-NEXT: ret
 entry:
   %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1502,16 +1500,11 @@ define <16 x i64> @unzip2a_dual_v16i64(<16 x i64> %a, <16 x i64> %b) {
 ; ZIP-LABEL: unzip2a_dual_v16i64:
 ; ZIP: # %bb.0: # %entry
 ; ZIP-NEXT: vsetivli zero, 8, e64, m2, ta, ma
-; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v10
-; ZIP-NEXT: vsetivli zero, 16, e16, m1, ta, ma
-; ZIP-NEXT: vid.v v8
-; ZIP-NEXT: li a0, -256
-; ZIP-NEXT: vadd.vv v8, v8, v8
-; ZIP-NEXT: vmv.s.x v0, a0
-; ZIP-NEXT: vadd.vi v8, v8, -16
-; ZIP-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; ZIP-NEXT: vrgatherei16.vv v16, v12, v8, v0.t
-; ZIP-NEXT: vmv.v.v v8, v16
+; ZIP-NEXT: ri.vunzip2a.vv v16, v12, v14
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
+; ZIP-NEXT: vslideup.vi v12, v16, 8
+; ZIP-NEXT: vmv.v.v v8, v12
 ; ZIP-NEXT: ret
 entry:
   %c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1557,13 +1550,9 @@ define <4 x i64> @unzip2a_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_ra
 ;
 ; ZIP-LABEL: unzip2a_dual_v4i64_exact:
 ; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
 ; ZIP-NEXT: ret
 entry:
   %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1609,13 +1598,10 @@ define <4 x i64> @unzip2a_dual_v4i64_exact_nf2(<4 x i64> %a, <4 x i64> %b) vscal
 ;
 ; ZIP-LABEL: unzip2a_dual_v4i64_exact_nf2:
 ; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v9, 4
+; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v9
 ; ZIP-NEXT: ret
 entry:
   %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1740,39 +1726,111 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal
 ;
 ; ZIP-LABEL: unzip2a_dual_v16i64_exact:
 ; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vslideup.vi v18, v15, 2
-; ZIP-NEXT: vmv.v.i v16, 8
-; ZIP-NEXT: vmv.v.i v17, 12
-; ZIP-NEXT: vslideup.vi v20, v13, 2
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v18, v15, 1, v0.t
-; ZIP-NEXT: ri.vunzip2a.vv v15, v14, v19
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v15, v15, v18, v0
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v20, v13, 1, v0.t
-; ZIP-NEXT: ri.vunzip2a.vv v14, v12, v13
-; ZIP-NEXT: vslideup.vi v12, v11, 2
-; ZIP-NEXT: vslideup.vi v18, v9, 2
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v14, v14, v20, v0
-; ZIP-NEXT: li a0, -256
-; ZIP-NEXT: ri.vunzip2a.vv v20, v10, v13
-; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v19
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v12, v11, 1, v0.t
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v13, v20, v12, v0
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v18, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v12, v10, v18, v0
-; ZIP-NEXT: vmv.s.x v0, a0
 ; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
-; ZIP-NEXT: vmerge.vvm v8, v12, v12, v0
+; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v12
+; ZIP-NEXT: vmv.v.v v8, v16
 ; ZIP-NEXT: ret
 entry:
   %c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   ret <16 x i64> %c
 }
+
+define <4 x i64> @unzip2b_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; V-LABEL: unzip2b_dual_v4i64:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu
+; V-NEXT: vmv.v.i v0, 2
+; V-NEXT: vslidedown.vi v10, v8, 1
+; V-NEXT: vslidedown.vi v10, v8, 2, v0.t
+; V-NEXT: vmv.v.i v0, 4
+; V-NEXT: vmv1r.v v8, v9
+; V-NEXT: vslideup.vi v8, v9, 1, v0.t
+; V-NEXT: vmv.v.i v0, 12
+; V-NEXT: vmerge.vvm v8, v10, v8, v0
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: unzip2b_dual_v4i64:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: ld a3, 8(a2)
+; ZVE32F-NEXT: ld a2, 24(a2)
+; ZVE32F-NEXT: ld a4, 8(a1)
+; ZVE32F-NEXT: ld a1, 24(a1)
+; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; ZVE32F-NEXT: vmv.v.i v0, 15
+; ZVE32F-NEXT: srli a5, a2, 32
+; ZVE32F-NEXT: srli a6, a3, 32
+; ZVE32F-NEXT: srli a7, a1, 32
+; ZVE32F-NEXT: srli t0, a4, 32
+; ZVE32F-NEXT: vmv.v.x v8, a4
+; ZVE32F-NEXT: vmv.v.x v9, a3
+; ZVE32F-NEXT: vslide1down.vx v8, v8, t0
+; ZVE32F-NEXT: vslide1down.vx v9, v9, a6
+; ZVE32F-NEXT: vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT: vslide1down.vx v9, v9, a2
+; ZVE32F-NEXT: vslide1down.vx v8, v8, a7
+; ZVE32F-NEXT: vslide1down.vx v9, v9, a5
+; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; ZVE32F-NEXT: vse32.v v9, (a0)
+; ZVE32F-NEXT: ret
+;
+; ZIP-LABEL: unzip2b_dual_v4i64:
+; ZIP: # %bb.0: # %entry
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2b.vv v11, v9, v10
+; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v10
+; ZIP-NEXT: vslideup.vi v9, v11, 2
+; ZIP-NEXT: vmv.v.v v8, v9
+; ZIP-NEXT: ret
+entry:
+  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i64> %c
+}
+
+define <4 x i64> @unzip2b_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_range(4,4) {
+; V-LABEL: unzip2b_dual_v4i64_exact:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu
+; V-NEXT: vmv.v.i v0, 2
+; V-NEXT: vslidedown.vi v10, v8, 1
+; V-NEXT: vslidedown.vi v10, v8, 2, v0.t
+; V-NEXT: vmv.v.i v0, 4
+; V-NEXT: vmv1r.v v8, v9
+; V-NEXT: vslideup.vi v8, v9, 1, v0.t
+; V-NEXT: vmv.v.i v0, 12
+; V-NEXT: vmerge.vvm v8, v10, v8, v0
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: unzip2b_dual_v4i64_exact:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: ld a3, 8(a2)
+; ZVE32F-NEXT: ld a2, 24(a2)
+; ZVE32F-NEXT: ld a4, 8(a1)
+; ZVE32F-NEXT: ld a1, 24(a1)
+; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; ZVE32F-NEXT: vmv.v.i v0, 15
+; ZVE32F-NEXT: srli a5, a2, 32
+; ZVE32F-NEXT: srli a6, a3, 32
+; ZVE32F-NEXT: srli a7, a1, 32
+; ZVE32F-NEXT: srli t0, a4, 32
+; ZVE32F-NEXT: vmv.v.x v8, a4
+; ZVE32F-NEXT: vmv.v.x v9, a3
+; ZVE32F-NEXT: vslide1down.vx v8, v8, t0
+; ZVE32F-NEXT: vslide1down.vx v9, v9, a6
+; ZVE32F-NEXT: vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT: vslide1down.vx v9, v9, a2
+; ZVE32F-NEXT: vslide1down.vx v8, v8, a7
+; ZVE32F-NEXT: vslide1down.vx v9, v9, a5
+; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; ZVE32F-NEXT: vs1r.v v9, (a0)
+; ZVE32F-NEXT: ret
+;
+; ZIP-LABEL: unzip2b_dual_v4i64_exact:
+; ZIP: # %bb.0: # %entry
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2b.vv v10, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
+entry:
+  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i64> %c
+}