@@ -1913,124 +1913,6 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
19131913; GFX9-NEXT: global_store_short v0, v1, s[0:1]
19141914; GFX9-NEXT: s_endpgm
19151915;
1916- <<<<<<< HEAD
1917- ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1918- ; GFX11-TRUE16: ; %bb.0:
1919- ; GFX11-TRUE16-NEXT: s_clause 0x1
1920- ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1921- ; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
1922- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1923- ; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0x1ff
1924- ; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 8
1925- ; GFX11-TRUE16-NEXT: s_or_b32 s2, s5, s2
1926- ; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xffe
1927- ; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
1928- ; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
1929- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1930- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
1931- ; GFX11-TRUE16-NEXT: s_bfe_u32 s2, s3, 0xb0014
1932- ; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s2
1933- ; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
1934- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1935- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
1936- ; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13
1937- ; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s2, 12
1938- ; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 1
1939- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x1000, v0
1940- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1941- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2
1942- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3
1943- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1944- ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2
1945- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, s3, v0
1946- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
1947- ; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
1948- ; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 31
1949- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
1950- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1951- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
1952- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 7, v1
1953- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1
1954- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
1955- ; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2
1956- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1957- ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
1958- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
1959- ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
1960- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
1961- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7e00
1962- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1963- ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v3 :: v_dual_add_nc_u32 v1, v1, v2
1964- ; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
1965- ; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
1966- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo
1967- ; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
1968- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1969- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
1970- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
1971- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
1972- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
1973- ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
1974- ; GFX11-TRUE16-NEXT: s_endpgm
1975- ;
1976- ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1977- ; GFX11-FAKE16: ; %bb.0:
1978- ; GFX11-FAKE16-NEXT: s_clause 0x1
1979- ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1980- ; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
1981- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1982- ; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0x1ff
1983- ; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 8
1984- ; GFX11-FAKE16-NEXT: s_or_b32 s2, s5, s2
1985- ; GFX11-FAKE16-NEXT: s_and_b32 s5, s6, 0xffe
1986- ; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
1987- ; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
1988- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1989- ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
1990- ; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s3, 0xb0014
1991- ; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s2
1992- ; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
1993- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1994- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
1995- ; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13
1996- ; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s2, 12
1997- ; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 1
1998- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x1000, v0
1999- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2000- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2
2001- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3
2002- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2003- ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2
2004- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, s3, v0
2005- ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
2006- ; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
2007- ; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 31
2008- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v1
2009- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2010- ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2011- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v1
2012- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1
2013- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
2014- ; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2
2015- ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
2016- ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
2017- ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2018- ; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
2019- ; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
2020- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2021- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v3
2022- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2
2023- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2024- ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo
2025- ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
2026- ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo
2027- ; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
2028- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2029- ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0
2030- ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
2031- ; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
2032- ; GFX11-FAKE16-NEXT: s_endpgm
2033- =======
20341916; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
20351917; GFX11: ; %bb.0:
20361918; GFX11-NEXT: s_clause 0x1
@@ -2047,49 +1929,47 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
20471929; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
20481930; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014
20491931; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2
2050- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2051- ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
2052- ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
2053- ; GFX11-NEXT: v_mov_b32_e32 v0, s4
2054- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
2055- ; GFX11-NEXT: v_readfirstlane_b32 s6, v1
2056- ; GFX11-NEXT: v_mov_b32_e32 v1, 0
2057- ; GFX11-NEXT: s_or_b32 s3, s5, s3
2058- ; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
2059- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2060- ; GFX11-NEXT: s_lshr_b32 s7, s5, s6
2061- ; GFX11-NEXT: s_lshl_b32 s6, s7, s6
2062- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2063- ; GFX11-NEXT: s_cmp_lg_u32 s6, s5
2064- ; GFX11-NEXT: s_cselect_b32 s5, 1, 0
20651932; GFX11-NEXT: s_addk_i32 s2, 0xfc10
2066- ; GFX11-NEXT: s_or_b32 s5, s7, s5
2067- ; GFX11-NEXT: s_lshl_b32 s6, s2, 12
2068- ; GFX11-NEXT: s_or_b32 s6, s3, s6
1933+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1934+ ; GFX11-NEXT: v_or_b32_e32 v0, s5, v0
1935+ ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
1936+ ; GFX11-NEXT: s_lshl_b32 s3, s2, 12
20691937; GFX11-NEXT: s_cmp_lt_i32 s2, 1
2070- ; GFX11-NEXT: s_cselect_b32 s5, s5, s6
2071- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2072- ; GFX11-NEXT: s_and_b32 s6, s5, 7
2073- ; GFX11-NEXT: s_cmp_gt_i32 s6, 5
2074- ; GFX11-NEXT: s_cselect_b32 s7, 1, 0
2075- ; GFX11-NEXT: s_cmp_eq_u32 s6, 3
2076- ; GFX11-NEXT: s_cselect_b32 s6, 1, 0
2077- ; GFX11-NEXT: s_lshr_b32 s5, s5, 2
2078- ; GFX11-NEXT: s_or_b32 s6, s6, s7
2079- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2080- ; GFX11-NEXT: s_add_i32 s5, s5, s6
1938+ ; GFX11-NEXT: v_or_b32_e32 v2, 0x1000, v0
1939+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1940+ ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v2
1941+ ; GFX11-NEXT: v_lshlrev_b32_e32 v1, v1, v3
1942+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1943+ ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2
1944+ ; GFX11-NEXT: v_or_b32_e32 v2, s3, v0
1945+ ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
1946+ ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
20811947; GFX11-NEXT: s_cmp_lt_i32 s2, 31
2082- ; GFX11-NEXT: s_movk_i32 s6, 0x7e00
2083- ; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
2084- ; GFX11-NEXT: s_cmp_lg_u32 s3, 0
2085- ; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00
1948+ ; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
1949+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1950+ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
1951+ ; GFX11-NEXT: v_and_b32_e32 v2, 7, v1
1952+ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v1
1953+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1954+ ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2
1955+ ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1956+ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
1957+ ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
1958+ ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
20861959; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
2087- ; GFX11-NEXT: s_cselect_b32 s2, s3, s5
2088- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2089- ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
1960+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1961+ ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
1962+ ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2
1963+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1964+ ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo
1965+ ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
1966+ ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo
1967+ ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
1968+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1969+ ; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0
1970+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
20901971; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
20911972; GFX11-NEXT: s_endpgm
2092- >>>>>>> 41d8a9928050 (16bit sgpr folding)
20931973 %mag.trunc = fptrunc double %mag to half
20941974 %result = call half @llvm.copysign.f16 (half %mag.trunc , half %sign )
20951975 store half %result , ptr addrspace (1 ) %arg_out
0 commit comments