@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
13901390;
13911391; GFX8-LABEL: s_copysign_v8bf16:
13921392; GFX8: ; %bb.0:
1393- ; GFX8-NEXT: s_movk_i32 s8, 0x7fff
1393+ ; GFX8-NEXT: s_mov_b32 s8, 0x7fff7fff
13941394; GFX8-NEXT: v_mov_b32_e32 v0, s3
13951395; GFX8-NEXT: v_mov_b32_e32 v1, s7
1396- ; GFX8-NEXT: s_lshr_b32 s7, s7, 16
1397- ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
13981396; GFX8-NEXT: v_bfi_b32 v0, s8, v0, v1
1399- ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1400- ; GFX8-NEXT: v_mov_b32_e32 v2, s7
1401- ; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
1402- ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1403- ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14041397; GFX8-NEXT: v_mov_b32_e32 v1, s2
14051398; GFX8-NEXT: v_mov_b32_e32 v2, s6
1406- ; GFX8-NEXT: s_lshr_b32 s3, s6, 16
1407- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
14081399; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
1409- ; GFX8-NEXT: v_mov_b32_e32 v2, s2
1410- ; GFX8-NEXT: v_mov_b32_e32 v3, s3
1411- ; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
1412- ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1413- ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14141400; GFX8-NEXT: v_mov_b32_e32 v2, s1
14151401; GFX8-NEXT: v_mov_b32_e32 v3, s5
1416- ; GFX8-NEXT: s_lshr_b32 s2, s5, 16
1417- ; GFX8-NEXT: s_lshr_b32 s1, s1, 16
14181402; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
1419- ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1420- ; GFX8-NEXT: v_mov_b32_e32 v4, s2
1421- ; GFX8-NEXT: v_bfi_b32 v3, s8, v3, v4
1422- ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1423- ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14241403; GFX8-NEXT: v_mov_b32_e32 v3, s0
14251404; GFX8-NEXT: v_mov_b32_e32 v4, s4
1426- ; GFX8-NEXT: s_lshr_b32 s1, s4, 16
1427- ; GFX8-NEXT: s_lshr_b32 s0, s0, 16
14281405; GFX8-NEXT: v_bfi_b32 v3, s8, v3, v4
1429- ; GFX8-NEXT: v_mov_b32_e32 v4, s0
1430- ; GFX8-NEXT: v_mov_b32_e32 v5, s1
1431- ; GFX8-NEXT: v_bfi_b32 v4, s8, v4, v5
1432- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1433- ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14341406; GFX8-NEXT: v_readfirstlane_b32 s0, v3
14351407; GFX8-NEXT: v_readfirstlane_b32 s1, v2
14361408; GFX8-NEXT: v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
14391411;
14401412; GFX9-LABEL: s_copysign_v8bf16:
14411413; GFX9: ; %bb.0:
1442- ; GFX9-NEXT: s_movk_i32 s8, 0x7fff
1414+ ; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff
14431415; GFX9-NEXT: v_mov_b32_e32 v0, s3
14441416; GFX9-NEXT: v_mov_b32_e32 v1, s7
1445- ; GFX9-NEXT: s_lshr_b32 s7, s7, 16
1446- ; GFX9-NEXT: s_lshr_b32 s3, s3, 16
14471417; GFX9-NEXT: v_bfi_b32 v0, s8, v0, v1
1448- ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1449- ; GFX9-NEXT: v_mov_b32_e32 v2, s7
1450- ; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
1451- ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
1452- ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
14531418; GFX9-NEXT: v_mov_b32_e32 v1, s2
14541419; GFX9-NEXT: v_mov_b32_e32 v2, s6
1455- ; GFX9-NEXT: s_lshr_b32 s3, s6, 16
1456- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
14571420; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
1458- ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1459- ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1460- ; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
1461- ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1462- ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
14631421; GFX9-NEXT: v_mov_b32_e32 v2, s1
14641422; GFX9-NEXT: v_mov_b32_e32 v3, s5
1465- ; GFX9-NEXT: s_lshr_b32 s2, s5, 16
1466- ; GFX9-NEXT: s_lshr_b32 s1, s1, 16
14671423; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
1468- ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1469- ; GFX9-NEXT: v_mov_b32_e32 v4, s2
1470- ; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4
1471- ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
1472- ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
14731424; GFX9-NEXT: v_mov_b32_e32 v3, s0
14741425; GFX9-NEXT: v_mov_b32_e32 v4, s4
1475- ; GFX9-NEXT: s_lshr_b32 s1, s4, 16
1476- ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
14771426; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4
1478- ; GFX9-NEXT: v_mov_b32_e32 v4, s0
1479- ; GFX9-NEXT: v_mov_b32_e32 v5, s1
1480- ; GFX9-NEXT: v_bfi_b32 v4, s8, v4, v5
1481- ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
1482- ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
14831427; GFX9-NEXT: v_readfirstlane_b32 s0, v3
14841428; GFX9-NEXT: v_readfirstlane_b32 s1, v2
14851429; GFX9-NEXT: v_readfirstlane_b32 s2, v1
@@ -1488,85 +1432,36 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
14881432;
14891433; GFX10-LABEL: s_copysign_v8bf16:
14901434; GFX10: ; %bb.0:
1491- ; GFX10-NEXT: v_mov_b32_e32 v0, s7
1492- ; GFX10-NEXT: s_lshr_b32 s7, s7, 16
1435+ ; GFX10-NEXT: v_mov_b32_e32 v0, s4
1436+ ; GFX10-NEXT: v_mov_b32_e32 v1, s5
14931437; GFX10-NEXT: v_mov_b32_e32 v2, s6
1494- ; GFX10-NEXT: v_mov_b32_e32 v1, s7
1495- ; GFX10-NEXT: s_lshr_b32 s7, s6, 16
1496- ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0
14971438; GFX10-NEXT: v_mov_b32_e32 v3, s7
1498- ; GFX10-NEXT: s_lshr_b32 s3, s3, 16
1499- ; GFX10-NEXT: v_mov_b32_e32 v4, s5
1500- ; GFX10-NEXT: v_mov_b32_e32 v5, s4
1501- ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s3, v1
1502- ; GFX10-NEXT: s_lshr_b32 s3, s2, 16
1503- ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, s2, v2
1504- ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, s3, v3
1505- ; GFX10-NEXT: s_lshr_b32 s2, s5, 16
1506- ; GFX10-NEXT: s_lshr_b32 s3, s4, 16
1507- ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, s1, v4
1508- ; GFX10-NEXT: v_mov_b32_e32 v6, s2
1509- ; GFX10-NEXT: v_mov_b32_e32 v7, s3
1510- ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, s0, v5
1511- ; GFX10-NEXT: s_lshr_b32 s1, s1, 16
1512- ; GFX10-NEXT: s_lshr_b32 s0, s0, 16
1513- ; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, s1, v6
1514- ; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, s0, v7
1515- ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
1516- ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
1517- ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
1518- ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
1519- ; GFX10-NEXT: v_lshl_or_b32 v5, v7, 16, v5
1520- ; GFX10-NEXT: v_lshl_or_b32 v4, v6, 16, v4
1521- ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
1522- ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1523- ; GFX10-NEXT: v_readfirstlane_b32 s0, v5
1524- ; GFX10-NEXT: v_readfirstlane_b32 s1, v4
1439+ ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
1440+ ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
1441+ ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2
1442+ ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3
1443+ ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1444+ ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
15251445; GFX10-NEXT: v_readfirstlane_b32 s2, v2
1526- ; GFX10-NEXT: v_readfirstlane_b32 s3, v0
1446+ ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
15271447; GFX10-NEXT: ; return to shader part epilog
15281448;
15291449; GFX11-LABEL: s_copysign_v8bf16:
15301450; GFX11: ; %bb.0:
1531- ; GFX11-NEXT: v_mov_b32_e32 v0, s7
1532- ; GFX11-NEXT: s_lshr_b32 s7, s7, 16
1533- ; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s4
1534- ; GFX11-NEXT: v_mov_b32_e32 v1, s7
1535- ; GFX11-NEXT: s_lshr_b32 s7, s6, 16
1536- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1451+ ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
15371452; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
1538- ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0
1539- ; GFX11-NEXT: s_lshr_b32 s3, s3, 16
1540- ; GFX11-NEXT: v_bfi_b32 v4, 0x7fff, s1, v4
1541- ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s3, v1
1542- ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
1543- ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s2, v2
1544- ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s3, v3
1545- ; GFX11-NEXT: s_lshr_b32 s2, s5, 16
1546- ; GFX11-NEXT: s_lshr_b32 s3, s4, 16
1547- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1548- ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
1549- ; GFX11-NEXT: v_bfi_b32 v5, 0x7fff, s0, v5
1550- ; GFX11-NEXT: s_lshr_b32 s1, s1, 16
1551- ; GFX11-NEXT: s_lshr_b32 s0, s0, 16
1552- ; GFX11-NEXT: v_bfi_b32 v6, 0x7fff, s1, v6
1553- ; GFX11-NEXT: v_bfi_b32 v7, 0x7fff, s0, v7
1554- ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
1555- ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
1556- ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
1557- ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1558- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1559- ; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
1560- ; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v4
1561- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1562- ; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2
1563- ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1453+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1454+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
1455+ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
1456+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
1457+ ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2
1458+ ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3
15641459; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1565- ; GFX11-NEXT: v_readfirstlane_b32 s0, v5
1566- ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
1460+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
1461+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
15671462; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15681463; GFX11-NEXT: v_readfirstlane_b32 s2, v2
1569- ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1464+ ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
15701465; GFX11-NEXT: ; return to shader part epilog
15711466 %out = call <8 x bfloat> @llvm.copysign.v8bf16 (<8 x bfloat> %arg_mag , <8 x bfloat> %arg_sign )
15721467 %cast = bitcast <8 x bfloat> %out to <4 x i32 >
@@ -2542,148 +2437,40 @@ define <8 x bfloat> @v_copysign_v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign) {
25422437; GFX8-LABEL: v_copysign_v8bf16:
25432438; GFX8: ; %bb.0:
25442439; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2545- ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
2546- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
2547- ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
2548- ; GFX8-NEXT: v_bfi_b32 v8, s4, v9, v8
2549- ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v7
2550- ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
2551- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v2
2552- ; GFX8-NEXT: v_bfi_b32 v7, s4, v9, v7
2553- ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v6
2554- ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
2555- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2556- ; GFX8-NEXT: v_bfi_b32 v6, s4, v9, v6
2557- ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
2558- ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2559- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2560- ; GFX8-NEXT: v_bfi_b32 v5, s4, v9, v5
2440+ ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
25612441; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4
2562- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
2563- ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2564- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
2565- ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2566- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
2567- ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2568- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
2569- ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2442+ ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
2443+ ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v6
2444+ ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v7
25702445; GFX8-NEXT: s_setpc_b64 s[30:31]
25712446;
25722447; GFX9-LABEL: v_copysign_v8bf16:
25732448; GFX9: ; %bb.0:
25742449; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575- ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
2576- ; GFX9-NEXT: v_bfi_b32 v8, s4, v3, v7
2577- ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2578- ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2579- ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7
2580- ; GFX9-NEXT: v_bfi_b32 v7, s4, v2, v6
2581- ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2582- ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2583- ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6
2584- ; GFX9-NEXT: v_bfi_b32 v6, s4, v1, v5
2585- ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2586- ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2587- ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
2588- ; GFX9-NEXT: v_bfi_b32 v5, s4, v0, v4
2589- ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
2590- ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2450+ ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
25912451; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
2592- ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
2593- ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
2594- ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
2595- ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
2596- ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
2452+ ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
2453+ ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6
2454+ ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7
25972455; GFX9-NEXT: s_setpc_b64 s[30:31]
25982456;
25992457; GFX10-LABEL: v_copysign_v8bf16:
26002458; GFX10: ; %bb.0:
26012459; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2602- ; GFX10-NEXT: v_bfi_b32 v8, 0x7fff, v3, v7
2603- ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2604- ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2605- ; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, v2, v6
2606- ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2607- ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5
2608- ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4
2609- ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v0
2610- ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2611- ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2612- ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v5
2613- ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v4
2614- ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, v12, v11
2615- ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, v13, v10
2616- ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v2, v6
2617- ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, v3, v7
2618- ; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
2619- ; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
2620- ; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x5040100
2621- ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
2460+ ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
2461+ ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
2462+ ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6
2463+ ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7
26222464; GFX10-NEXT: s_setpc_b64 s[30:31]
26232465;
2624- ; GFX11TRUE16-LABEL: v_copysign_v8bf16:
2625- ; GFX11TRUE16: ; %bb.0:
2626- ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2627- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
2628- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
2629- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
2630- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
2631- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
2632- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
2633- ; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v9
2634- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
2635- ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v3, v7
2636- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
2637- ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v10, v11
2638- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
2639- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.l
2640- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
2641- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
2642- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
2643- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
2644- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
2645- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
2646- ; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v12
2647- ; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v0, v4
2648- ; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0x7fff, v9, v10
2649- ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fff, v1, v5
2650- ; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, v2, v6
2651- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l
2652- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
2653- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
2654- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v5.l
2655- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
2656- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
2657- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
2658- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
2659- ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
2660- ;
2661- ; GFX11FAKE16-LABEL: v_copysign_v8bf16:
2662- ; GFX11FAKE16: ; %bb.0:
2663- ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2664- ; GFX11FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, v3, v7
2665- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2666- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2667- ; GFX11FAKE16-NEXT: v_bfi_b32 v9, 0x7fff, v2, v6
2668- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2669- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5
2670- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
2671- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0
2672- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2673- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2674- ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v5
2675- ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v4
2676- ; GFX11FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, v12, v11
2677- ; GFX11FAKE16-NEXT: v_bfi_b32 v5, 0x7fff, v13, v10
2678- ; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v6
2679- ; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, v7
2680- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2681- ; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
2682- ; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
2683- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2684- ; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v9, 0x5040100
2685- ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
2686- ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
2466+ ; GFX11-LABEL: v_copysign_v8bf16:
2467+ ; GFX11: ; %bb.0:
2468+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2469+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
2470+ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
2471+ ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6
2472+ ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7
2473+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
26872474 %result = call <8 x bfloat> @llvm.copysign.v8bf16 (<8 x bfloat> %mag , <8 x bfloat> %sign )
26882475 ret <8 x bfloat> %result
26892476}
0 commit comments