@@ -1877,85 +1877,56 @@ define i32 @blsr_u512(ptr %word) nounwind {
18771877; SSE: # %bb.0:
18781878; SSE-NEXT: pushq %r15
18791879; SSE-NEXT: pushq %r14
1880- ; SSE-NEXT: pushq %r12
18811880; SSE-NEXT: pushq %rbx
1882- ; SSE-NEXT: pushq %rax
1883- ; SSE-NEXT: movq 56(%rdi), %rcx
1884- ; SSE-NEXT: movq 48(%rdi), %rdx
1885- ; SSE-NEXT: movq 40(%rdi), %rsi
1886- ; SSE-NEXT: movq 32(%rdi), %r11
1881+ ; SSE-NEXT: movq 48(%rdi), %r11
1882+ ; SSE-NEXT: movq 40(%rdi), %r9
18871883; SSE-NEXT: movq 24(%rdi), %r8
1888- ; SSE-NEXT: movq 16(%rdi), %r9
1889- ; SSE-NEXT: movq (%rdi), %rax
1890- ; SSE-NEXT: movq 8(%rdi), %r10
1891- ; SSE-NEXT: rep bsfq %rax, %rbx
1892- ; SSE-NEXT: rep bsfq %r10, %r14
1893- ; SSE-NEXT: addq $64, %r14
1894- ; SSE-NEXT: testq %rax, %rax
1895- ; SSE-NEXT: cmovneq %rbx, %r14
1896- ; SSE-NEXT: rep bsfq %r9, %r15
1897- ; SSE-NEXT: rep bsfq %r8, %rbx
1884+ ; SSE-NEXT: movq 16(%rdi), %rdx
1885+ ; SSE-NEXT: movq (%rdi), %rcx
1886+ ; SSE-NEXT: movq 8(%rdi), %rsi
1887+ ; SSE-NEXT: rep bsfq %rcx, %rax
1888+ ; SSE-NEXT: rep bsfq %rsi, %rbx
18981889; SSE-NEXT: addq $64, %rbx
1899- ; SSE-NEXT: testq %r9 , %r9
1900- ; SSE-NEXT: cmovneq %r15 , %rbx
1901- ; SSE-NEXT: subq $-128 , %rbx
1902- ; SSE-NEXT: movq %rax , %r15
1903- ; SSE-NEXT: movq %rax , %r12
1904- ; SSE-NEXT: orq %r10 , %r12
1905- ; SSE-NEXT: cmovneq %r14 , %rbx
1906- ; SSE-NEXT: rep bsfq %r11 , %r12
1907- ; SSE-NEXT: rep bsfq %rsi , %r14
1908- ; SSE-NEXT: addq $64 , %r14
1909- ; SSE-NEXT: testq %r11 , %r11
1910- ; SSE-NEXT: cmovneq %r12 , %r14
1911- ; SSE-NEXT: xorps %xmm0 , %xmm0
1912- ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1913- ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1914- ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1915- ; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
1916- ; SSE-NEXT: rep bsfq %rdx , %r12
1890+ ; SSE-NEXT: testq %rcx , %rcx
1891+ ; SSE-NEXT: cmovneq %rax , %rbx
1892+ ; SSE-NEXT: rep bsfq %rdx , %rax
1893+ ; SSE-NEXT: rep bsfq %r8 , %r10
1894+ ; SSE-NEXT: addq $64 , %r10
1895+ ; SSE-NEXT: testq %rdx , %rdx
1896+ ; SSE-NEXT: cmovneq %rax , %r10
1897+ ; SSE-NEXT: movq 32(%rdi) , %r14
1898+ ; SSE-NEXT: subq $-128 , %r10
1899+ ; SSE-NEXT: movq %rcx , %rax
1900+ ; SSE-NEXT: orq %rsi , %rax
1901+ ; SSE-NEXT: cmovneq %rbx , %r10
1902+ ; SSE-NEXT: rep bsfq %r14 , %rax
1903+ ; SSE-NEXT: rep bsfq %r9, %rbx
1904+ ; SSE-NEXT: addq $64, %rbx
1905+ ; SSE-NEXT: testq %r14, %r14
1906+ ; SSE-NEXT: cmovneq %rax, %rbx
1907+ ; SSE-NEXT: rep bsfq %r11 , %r15
19171908; SSE-NEXT: movl $64, %eax
1918- ; SSE-NEXT: rep bsfq %rcx , %rax
1909+ ; SSE-NEXT: rep bsfq 56(%rdi) , %rax
19191910; SSE-NEXT: addq $64, %rax
1920- ; SSE-NEXT: testq %rdx , %rdx
1921- ; SSE-NEXT: cmovneq %r12 , %rax
1911+ ; SSE-NEXT: testq %r11 , %r11
1912+ ; SSE-NEXT: cmovneq %r15 , %rax
19221913; SSE-NEXT: subq $-128, %rax
1923- ; SSE-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
1924- ; SSE-NEXT: orq %rsi, %r11
1925- ; SSE-NEXT: cmovneq %r14, %rax
1926- ; SSE-NEXT: addq $256, %rax # imm = 0x100
1927- ; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
1928- ; SSE-NEXT: orq %r8, %r10
1929- ; SSE-NEXT: orq %r9, %r15
1930- ; SSE-NEXT: orq %r10, %r15
1914+ ; SSE-NEXT: orq %r9, %r14
19311915; SSE-NEXT: cmovneq %rbx, %rax
1932- ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1933- ; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1934- ; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
1935- ; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
1936- ; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
1937- ; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
1916+ ; SSE-NEXT: addq $256, %rax # imm = 0x100
1917+ ; SSE-NEXT: orq %r8, %rsi
1918+ ; SSE-NEXT: orq %rdx, %rcx
1919+ ; SSE-NEXT: orq %rsi, %rcx
1920+ ; SSE-NEXT: cmovneq %r10, %rax
1921+ ; SSE-NEXT: movl $-2, %edx
1922+ ; SSE-NEXT: movl %eax, %ecx
1923+ ; SSE-NEXT: roll %cl, %edx
19381924; SSE-NEXT: movl %eax, %ecx
1939- ; SSE-NEXT: andl $32, %ecx
1940- ; SSE-NEXT: movl %eax, %edx
1941- ; SSE-NEXT: andl $480, %edx # imm = 0x1E0
1942- ; SSE-NEXT: shrl $3, %edx
1943- ; SSE-NEXT: movl %edx, %esi
1944- ; SSE-NEXT: andl $-8, %esi
1945- ; SSE-NEXT: movq -128(%rsp,%rsi), %r8
1946- ; SSE-NEXT: shrq %cl, %r8
1947- ; SSE-NEXT: movl -120(%rsp,%rsi), %esi
1948- ; SSE-NEXT: addl %esi, %esi
1949- ; SSE-NEXT: notl %ecx
1950- ; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
1951- ; SSE-NEXT: shlq %cl, %rsi
1952- ; SSE-NEXT: orl %r8d, %esi
1953- ; SSE-NEXT: btrl %eax, %esi
1954- ; SSE-NEXT: movl %esi, (%rdi,%rdx)
1925+ ; SSE-NEXT: shrl $3, %ecx
1926+ ; SSE-NEXT: andl $60, %ecx
1927+ ; SSE-NEXT: andl %edx, (%rdi,%rcx)
19551928; SSE-NEXT: # kill: def $eax killed $eax killed $rax
1956- ; SSE-NEXT: addq $8, %rsp
19571929; SSE-NEXT: popq %rbx
1958- ; SSE-NEXT: popq %r12
19591930; SSE-NEXT: popq %r14
19601931; SSE-NEXT: popq %r15
19611932; SSE-NEXT: retq
@@ -1964,133 +1935,86 @@ define i32 @blsr_u512(ptr %word) nounwind {
19641935; AVX2: # %bb.0:
19651936; AVX2-NEXT: pushq %r15
19661937; AVX2-NEXT: pushq %r14
1967- ; AVX2-NEXT: pushq %r13
1968- ; AVX2-NEXT: pushq %r12
19691938; AVX2-NEXT: pushq %rbx
1970- ; AVX2-NEXT: movq 56(%rdi), %rcx
1971- ; AVX2-NEXT: movq 40(%rdi), %rdx
1972- ; AVX2-NEXT: movq 32(%rdi), %r11
1973- ; AVX2-NEXT: movq 24(%rdi), %rsi
1974- ; AVX2-NEXT: movq 16(%rdi), %r8
1975- ; AVX2-NEXT: movq (%rdi), %r9
1976- ; AVX2-NEXT: movq 8(%rdi), %r10
1977- ; AVX2-NEXT: xorl %ebx, %ebx
1978- ; AVX2-NEXT: tzcntq %r9, %rbx
1979- ; AVX2-NEXT: tzcntq %r10, %rax
1980- ; AVX2-NEXT: addq $64, %rax
1981- ; AVX2-NEXT: testq %r9, %r9
1982- ; AVX2-NEXT: cmovneq %rbx, %rax
1983- ; AVX2-NEXT: xorl %r14d, %r14d
1984- ; AVX2-NEXT: tzcntq %r8, %r14
1939+ ; AVX2-NEXT: movq 40(%rdi), %r9
1940+ ; AVX2-NEXT: movq 32(%rdi), %r10
1941+ ; AVX2-NEXT: movq 24(%rdi), %r8
1942+ ; AVX2-NEXT: movq 16(%rdi), %rdx
1943+ ; AVX2-NEXT: movq (%rdi), %rcx
1944+ ; AVX2-NEXT: movq 8(%rdi), %rsi
1945+ ; AVX2-NEXT: tzcntq %rcx, %rax
19851946; AVX2-NEXT: xorl %ebx, %ebx
19861947; AVX2-NEXT: tzcntq %rsi, %rbx
19871948; AVX2-NEXT: addq $64, %rbx
1988- ; AVX2-NEXT: testq %r8, %r8
1989- ; AVX2-NEXT: cmovneq %r14, %rbx
1990- ; AVX2-NEXT: subq $-128, %rbx
1991- ; AVX2-NEXT: movq %r9, %r14
1992- ; AVX2-NEXT: movq %r9, %r15
1993- ; AVX2-NEXT: orq %r10, %r15
1949+ ; AVX2-NEXT: testq %rcx, %rcx
19941950; AVX2-NEXT: cmovneq %rax, %rbx
19951951; AVX2-NEXT: xorl %eax, %eax
1996- ; AVX2-NEXT: tzcntq %r11 , %rax
1997- ; AVX2-NEXT: xorl %r12d , %r12d
1998- ; AVX2-NEXT: tzcntq %rdx , %r12
1999- ; AVX2-NEXT: addq $64 , %r12
2000- ; AVX2-NEXT: testq %r11 , %r11
2001- ; AVX2-NEXT: cmovneq %rax , %r12
2002- ; AVX2-NEXT: movq 48(%rdi) , %r15
2003- ; AVX2-NEXT: xorl %r13d , %r13d
2004- ; AVX2-NEXT: tzcntq %r15 , %r13
1952+ ; AVX2-NEXT: tzcntq %rdx , %rax
1953+ ; AVX2-NEXT: tzcntq %r8 , %r11
1954+ ; AVX2-NEXT: addq $64 , %r11
1955+ ; AVX2-NEXT: testq %rdx , %rdx
1956+ ; AVX2-NEXT: cmovneq %rax , %r11
1957+ ; AVX2-NEXT: subq $-128 , %r11
1958+ ; AVX2-NEXT: movq %rcx , %rax
1959+ ; AVX2-NEXT: orq %rsi , %rax
1960+ ; AVX2-NEXT: cmovneq %rbx , %r11
20051961; AVX2-NEXT: xorl %eax, %eax
2006- ; AVX2-NEXT: tzcntq %rcx, %rax
1962+ ; AVX2-NEXT: tzcntq %r10, %rax
1963+ ; AVX2-NEXT: xorl %ebx, %ebx
1964+ ; AVX2-NEXT: tzcntq %r9, %rbx
1965+ ; AVX2-NEXT: addq $64, %rbx
1966+ ; AVX2-NEXT: testq %r10, %r10
1967+ ; AVX2-NEXT: cmovneq %rax, %rbx
1968+ ; AVX2-NEXT: movq 48(%rdi), %r14
1969+ ; AVX2-NEXT: xorl %r15d, %r15d
1970+ ; AVX2-NEXT: tzcntq %r14, %r15
1971+ ; AVX2-NEXT: xorl %eax, %eax
1972+ ; AVX2-NEXT: tzcntq 56(%rdi), %rax
20071973; AVX2-NEXT: addq $64, %rax
2008- ; AVX2-NEXT: testq %r15 , %r15
2009- ; AVX2-NEXT: cmovneq %r13 , %rax
1974+ ; AVX2-NEXT: testq %r14 , %r14
1975+ ; AVX2-NEXT: cmovneq %r15 , %rax
20101976; AVX2-NEXT: subq $-128, %rax
2011- ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2012- ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
2013- ; AVX2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
2014- ; AVX2-NEXT: orq %rdx, %r11
2015- ; AVX2-NEXT: cmovneq %r12, %rax
2016- ; AVX2-NEXT: addq $256, %rax # imm = 0x100
2017- ; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
2018- ; AVX2-NEXT: orq %rsi, %r10
2019- ; AVX2-NEXT: orq %r8, %r14
2020- ; AVX2-NEXT: orq %r10, %r14
1977+ ; AVX2-NEXT: orq %r9, %r10
20211978; AVX2-NEXT: cmovneq %rbx, %rax
2022- ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
2023- ; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
2024- ; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2025- ; AVX2-NEXT: movq %r15, -{{[0-9]+}}(%rsp)
2026- ; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
2027- ; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
2028- ; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
1979+ ; AVX2-NEXT: addq $256, %rax # imm = 0x100
1980+ ; AVX2-NEXT: orq %r8, %rsi
1981+ ; AVX2-NEXT: orq %rdx, %rcx
1982+ ; AVX2-NEXT: orq %rsi, %rcx
1983+ ; AVX2-NEXT: cmovneq %r11, %rax
1984+ ; AVX2-NEXT: movl $-2, %edx
1985+ ; AVX2-NEXT: movl %eax, %ecx
1986+ ; AVX2-NEXT: roll %cl, %edx
20291987; AVX2-NEXT: movl %eax, %ecx
2030- ; AVX2-NEXT: andl $32, %ecx
2031- ; AVX2-NEXT: movl %eax, %edx
2032- ; AVX2-NEXT: andl $480, %edx # imm = 0x1E0
2033- ; AVX2-NEXT: shrl $3, %edx
2034- ; AVX2-NEXT: movl %edx, %esi
2035- ; AVX2-NEXT: andl $-8, %esi
2036- ; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
2037- ; AVX2-NEXT: notl %ecx
2038- ; AVX2-NEXT: movl -120(%rsp,%rsi), %esi
2039- ; AVX2-NEXT: addl %esi, %esi
2040- ; AVX2-NEXT: shlxq %rcx, %rsi, %rcx
2041- ; AVX2-NEXT: orl %r8d, %ecx
2042- ; AVX2-NEXT: btrl %eax, %ecx
2043- ; AVX2-NEXT: movl %ecx, (%rdi,%rdx)
1988+ ; AVX2-NEXT: shrl $3, %ecx
1989+ ; AVX2-NEXT: andl $60, %ecx
1990+ ; AVX2-NEXT: andl %edx, (%rdi,%rcx)
20441991; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
20451992; AVX2-NEXT: popq %rbx
2046- ; AVX2-NEXT: popq %r12
2047- ; AVX2-NEXT: popq %r13
20481993; AVX2-NEXT: popq %r14
20491994; AVX2-NEXT: popq %r15
2050- ; AVX2-NEXT: vzeroupper
20511995; AVX2-NEXT: retq
20521996;
20531997; AVX512-LABEL: blsr_u512:
20541998; AVX512: # %bb.0:
2055- ; AVX512-NEXT: pushq %rax
2056- ; AVX512-NEXT: vmovups (%rdi), %ymm0
2057- ; AVX512-NEXT: vmovups 32(%rdi), %ymm1
2058- ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm2
2059- ; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = -1
2060- ; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm3
2061- ; AVX512-NEXT: vpandnq %zmm3, %zmm2, %zmm3
2062- ; AVX512-NEXT: vplzcntq %zmm3, %zmm3
2063- ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
2064- ; AVX512-NEXT: vpsubq %zmm3, %zmm4, %zmm3
2065- ; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k1
2066- ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [512,512,512,512,512,512,512,512]
2067- ; AVX512-NEXT: vpcompressq %zmm3, %zmm2 {%k1}
2068- ; AVX512-NEXT: vmovq %xmm2, %rax
2069- ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
2070- ; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
2071- ; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
2072- ; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
2073- ; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1999+ ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
2000+ ; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1
2001+ ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1
2002+ ; AVX512-NEXT: vpandnq %zmm1, %zmm0, %zmm1
2003+ ; AVX512-NEXT: vplzcntq %zmm1, %zmm1
2004+ ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
2005+ ; AVX512-NEXT: vpsubq %zmm1, %zmm2, %zmm1
2006+ ; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1
2007+ ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
2008+ ; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
2009+ ; AVX512-NEXT: vmovq %xmm0, %rax
2010+ ; AVX512-NEXT: movl $-2, %edx
2011+ ; AVX512-NEXT: movl %eax, %ecx
2012+ ; AVX512-NEXT: roll %cl, %edx
20742013; AVX512-NEXT: movl %eax, %ecx
2075- ; AVX512-NEXT: andl $32, %ecx
2076- ; AVX512-NEXT: movl %ecx, %edx
2077- ; AVX512-NEXT: notl %edx
2078- ; AVX512-NEXT: movl %eax, %esi
2079- ; AVX512-NEXT: shrl $3, %esi
2080- ; AVX512-NEXT: movl %esi, %r8d
2081- ; AVX512-NEXT: andl $56, %r8d
2082- ; AVX512-NEXT: movl -120(%rsp,%r8), %r9d
2083- ; AVX512-NEXT: addl %r9d, %r9d
2084- ; AVX512-NEXT: shlxq %rdx, %r9, %rdx
20852014; AVX512-NEXT: shrl $3, %ecx
2086- ; AVX512-NEXT: addq %rsp, %r8
2087- ; AVX512-NEXT: addq $-128, %r8
2088- ; AVX512-NEXT: orl (%rcx,%r8), %edx
2089- ; AVX512-NEXT: btrl %eax, %edx
2090- ; AVX512-NEXT: andl $60, %esi
2091- ; AVX512-NEXT: movl %edx, (%rdi,%rsi)
2015+ ; AVX512-NEXT: andl $60, %ecx
2016+ ; AVX512-NEXT: andl %edx, (%rdi,%rcx)
20922017; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
2093- ; AVX512-NEXT: popq %rcx
20942018; AVX512-NEXT: vzeroupper
20952019; AVX512-NEXT: retq
20962020 %ld = load i512 , ptr %word
0 commit comments