@@ -1384,11 +1384,10 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
13841384 ret <4 x float > %r
13851385}
13861386
1387- define <4 x half > @test_fmaximum_v4f16 (<4 x half > %x , <4 x half > %y ) {
1387+ define <4 x half > @test_fmaximum_v4f16 (<4 x half > %x , <4 x half > %y ) nounwind {
13881388; SSE2-LABEL: test_fmaximum_v4f16:
13891389; SSE2: # %bb.0:
13901390; SSE2-NEXT: subq $104, %rsp
1391- ; SSE2-NEXT: .cfi_def_cfa_offset 112
13921391; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13931392; SSE2-NEXT: psrld $16, %xmm0
13941393; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1524,13 +1523,11 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
15241523; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
15251524; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
15261525; SSE2-NEXT: addq $104, %rsp
1527- ; SSE2-NEXT: .cfi_def_cfa_offset 8
15281526; SSE2-NEXT: retq
15291527;
15301528; AVX1-LABEL: test_fmaximum_v4f16:
15311529; AVX1: # %bb.0:
15321530; AVX1-NEXT: subq $120, %rsp
1533- ; AVX1-NEXT: .cfi_def_cfa_offset 128
15341531; AVX1-NEXT: vmovaps %xmm0, %xmm2
15351532; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
15361533; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1636,37 +1633,179 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
16361633; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
16371634; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
16381635; AVX1-NEXT: addq $120, %rsp
1639- ; AVX1-NEXT: .cfi_def_cfa_offset 8
16401636; AVX1-NEXT: retq
16411637;
16421638; AVX512-LABEL: test_fmaximum_v4f16:
16431639; AVX512: # %bb.0:
1644- ; AVX512-NEXT: vcvtph2ps %xmm0, %ymm2
1645- ; AVX512-NEXT: vcvtph2ps %xmm1, %ymm3
1646- ; AVX512-NEXT: vcmpltps %ymm2, %ymm3, %ymm4
1647- ; AVX512-NEXT: vpmovdw %zmm4, %ymm4
1648- ; AVX512-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm4
1649- ; AVX512-NEXT: vcmpunordps %ymm3, %ymm2, %ymm2
1650- ; AVX512-NEXT: vpmovdw %zmm2, %ymm2
1651- ; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
1652- ; AVX512-NEXT: vpblendvb %xmm2, %xmm3, %xmm4, %xmm2
1653- ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
1654- ; AVX512-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4
1655- ; AVX512-NEXT: vpblendvb %xmm4, %xmm0, %xmm2, %xmm0
1656- ; AVX512-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm3
1657- ; AVX512-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
1658- ; AVX512-NEXT: vcvtph2ps %xmm2, %ymm1
1659- ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
1660- ; AVX512-NEXT: vcmpeqps %ymm3, %ymm1, %ymm1
1661- ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
1662- ; AVX512-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
1663- ; AVX512-NEXT: vzeroupper
1640+ ; AVX512-NEXT: pushq %rbp
1641+ ; AVX512-NEXT: pushq %r15
1642+ ; AVX512-NEXT: pushq %r14
1643+ ; AVX512-NEXT: pushq %r13
1644+ ; AVX512-NEXT: pushq %r12
1645+ ; AVX512-NEXT: pushq %rbx
1646+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
1647+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1648+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
1649+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1650+ ; AVX512-NEXT: xorl %eax, %eax
1651+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
1652+ ; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF
1653+ ; AVX512-NEXT: movl $0, %edx
1654+ ; AVX512-NEXT: cmovpl %ecx, %edx
1655+ ; AVX512-NEXT: movl $0, %edi
1656+ ; AVX512-NEXT: cmoval %ecx, %edi
1657+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1658+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1659+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1660+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1661+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
1662+ ; AVX512-NEXT: movl $0, %esi
1663+ ; AVX512-NEXT: cmovpl %ecx, %esi
1664+ ; AVX512-NEXT: movl $0, %r9d
1665+ ; AVX512-NEXT: cmoval %ecx, %r9d
1666+ ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
1667+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1668+ ; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
1669+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1670+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
1671+ ; AVX512-NEXT: movl $0, %r8d
1672+ ; AVX512-NEXT: cmovpl %ecx, %r8d
1673+ ; AVX512-NEXT: movl $0, %r11d
1674+ ; AVX512-NEXT: cmoval %ecx, %r11d
1675+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
1676+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1677+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
1678+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1679+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
1680+ ; AVX512-NEXT: movl $0, %r10d
1681+ ; AVX512-NEXT: cmovpl %ecx, %r10d
1682+ ; AVX512-NEXT: movl $0, %ebp
1683+ ; AVX512-NEXT: cmoval %ecx, %ebp
1684+ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1685+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1686+ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
1687+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1688+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
1689+ ; AVX512-NEXT: movl $0, %ebx
1690+ ; AVX512-NEXT: cmovpl %ecx, %ebx
1691+ ; AVX512-NEXT: movl $0, %r14d
1692+ ; AVX512-NEXT: cmoval %ecx, %r14d
1693+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
1694+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1695+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7]
1696+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1697+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
1698+ ; AVX512-NEXT: movl $0, %r15d
1699+ ; AVX512-NEXT: cmovpl %ecx, %r15d
1700+ ; AVX512-NEXT: movl $0, %r12d
1701+ ; AVX512-NEXT: cmoval %ecx, %r12d
1702+ ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2
1703+ ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3
1704+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
1705+ ; AVX512-NEXT: movl $0, %r13d
1706+ ; AVX512-NEXT: cmoval %ecx, %r13d
1707+ ; AVX512-NEXT: vmovd %r13d, %xmm2
1708+ ; AVX512-NEXT: vpinsrw $1, %r12d, %xmm2, %xmm2
1709+ ; AVX512-NEXT: vpinsrw $2, %r14d, %xmm2, %xmm2
1710+ ; AVX512-NEXT: vpinsrw $3, %ebp, %xmm2, %xmm2
1711+ ; AVX512-NEXT: vpinsrw $4, %r11d, %xmm2, %xmm2
1712+ ; AVX512-NEXT: vpinsrw $5, %r9d, %xmm2, %xmm2
1713+ ; AVX512-NEXT: vpinsrw $6, %edi, %xmm2, %xmm2
1714+ ; AVX512-NEXT: movl $0, %edi
1715+ ; AVX512-NEXT: cmovpl %ecx, %edi
1716+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1717+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1718+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1719+ ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1720+ ; AVX512-NEXT: vucomiss %xmm3, %xmm4
1721+ ; AVX512-NEXT: movl $0, %r9d
1722+ ; AVX512-NEXT: cmoval %ecx, %r9d
1723+ ; AVX512-NEXT: vpinsrw $7, %r9d, %xmm2, %xmm2
1724+ ; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm2
1725+ ; AVX512-NEXT: vmovd %edi, %xmm3
1726+ ; AVX512-NEXT: vpinsrw $1, %r15d, %xmm3, %xmm3
1727+ ; AVX512-NEXT: vpinsrw $2, %ebx, %xmm3, %xmm3
1728+ ; AVX512-NEXT: vpinsrw $3, %r10d, %xmm3, %xmm3
1729+ ; AVX512-NEXT: vpinsrw $4, %r8d, %xmm3, %xmm3
1730+ ; AVX512-NEXT: vpinsrw $5, %esi, %xmm3, %xmm3
1731+ ; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3
1732+ ; AVX512-NEXT: movl $0, %edx
1733+ ; AVX512-NEXT: cmovpl %ecx, %edx
1734+ ; AVX512-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3
1735+ ; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
1736+ ; AVX512-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
1737+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,1,1,4,5,6,7]
1738+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1739+ ; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
1740+ ; AVX512-NEXT: vucomiss %xmm4, %xmm3
1741+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
1742+ ; AVX512-NEXT: cmovnel %eax, %edx
1743+ ; AVX512-NEXT: cmovpl %eax, %edx
1744+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm3
1745+ ; AVX512-NEXT: vucomiss %xmm4, %xmm3
1746+ ; AVX512-NEXT: movl $65535, %esi # imm = 0xFFFF
1747+ ; AVX512-NEXT: cmovnel %eax, %esi
1748+ ; AVX512-NEXT: cmovpl %eax, %esi
1749+ ; AVX512-NEXT: vmovd %esi, %xmm3
1750+ ; AVX512-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3
1751+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
1752+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1753+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
1754+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
1755+ ; AVX512-NEXT: cmovnel %eax, %edx
1756+ ; AVX512-NEXT: cmovpl %eax, %edx
1757+ ; AVX512-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
1758+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
1759+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1760+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
1761+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
1762+ ; AVX512-NEXT: cmovnel %eax, %edx
1763+ ; AVX512-NEXT: cmovpl %eax, %edx
1764+ ; AVX512-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3
1765+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
1766+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1767+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
1768+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
1769+ ; AVX512-NEXT: cmovnel %eax, %edx
1770+ ; AVX512-NEXT: cmovpl %eax, %edx
1771+ ; AVX512-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3
1772+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1773+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1774+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
1775+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
1776+ ; AVX512-NEXT: cmovnel %eax, %edx
1777+ ; AVX512-NEXT: cmovpl %eax, %edx
1778+ ; AVX512-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3
1779+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
1780+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1781+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
1782+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
1783+ ; AVX512-NEXT: cmovnel %eax, %edx
1784+ ; AVX512-NEXT: cmovpl %eax, %edx
1785+ ; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3
1786+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1787+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1788+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
1789+ ; AVX512-NEXT: cmovnel %eax, %ecx
1790+ ; AVX512-NEXT: cmovpl %eax, %ecx
1791+ ; AVX512-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3
1792+ ; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
1793+ ; AVX512-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm5
1794+ ; AVX512-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm0
1795+ ; AVX512-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm4
1796+ ; AVX512-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
1797+ ; AVX512-NEXT: vpblendvb %xmm3, %xmm0, %xmm2, %xmm0
1798+ ; AVX512-NEXT: popq %rbx
1799+ ; AVX512-NEXT: popq %r12
1800+ ; AVX512-NEXT: popq %r13
1801+ ; AVX512-NEXT: popq %r14
1802+ ; AVX512-NEXT: popq %r15
1803+ ; AVX512-NEXT: popq %rbp
16641804; AVX512-NEXT: retq
16651805;
16661806; X86-LABEL: test_fmaximum_v4f16:
16671807; X86: # %bb.0:
16681808; X86-NEXT: subl $164, %esp
1669- ; X86-NEXT: .cfi_def_cfa_offset 168
16701809; X86-NEXT: vmovdqa %xmm0, %xmm2
16711810; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
16721811; X86-NEXT: vpsrlq $48, %xmm0, %xmm0
@@ -1806,7 +1945,6 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
18061945; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
18071946; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
18081947; X86-NEXT: addl $164, %esp
1809- ; X86-NEXT: .cfi_def_cfa_offset 4
18101948; X86-NEXT: retl
18111949 %r = call <4 x half > @llvm.maximum.v4f16 (<4 x half > %x , <4 x half > %y )
18121950 ret <4 x half > %r
0 commit comments