@@ -145,37 +145,36 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
145
145
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
146
146
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_2
147
147
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
148
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
149
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
150
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
151
- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
148
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
149
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
150
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
151
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
152
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
153
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
152
154
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
153
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
155
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
156
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
154
157
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
155
158
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
159
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
160
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
156
161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
157
162
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
158
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
163
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
159
164
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
165
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
166
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
160
167
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
168
+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
161
169
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
162
170
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
163
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
164
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
165
171
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
166
- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
167
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
168
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
169
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
170
- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
171
- ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
172
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
173
172
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
174
173
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
175
174
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7fc0
175
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
176
176
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
177
177
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
178
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
179
178
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
180
179
; GFX11-TRUE16-NEXT: .LBB0_2: ; %end
181
180
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -797,40 +796,40 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
797
796
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
798
797
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2
799
798
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
800
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
801
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
802
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
803
- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
804
- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
799
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
800
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
801
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
805
802
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
806
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
807
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
808
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
809
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
810
- ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
811
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
812
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
813
- ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
814
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
815
- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
816
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
803
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
804
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
805
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
806
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
807
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
808
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
809
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
810
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
817
811
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
818
812
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
819
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
820
- ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
821
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
822
- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
823
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
824
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
825
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
813
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
814
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
815
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
816
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
817
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
818
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
819
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
820
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
826
821
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
822
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
823
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
824
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
825
+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
826
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
827
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
828
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
827
829
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
828
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
829
830
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
830
- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
831
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
832
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
833
- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
831
+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
832
+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2
834
833
; GFX11-TRUE16-NEXT: .LBB4_2: ; %end
835
834
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
836
835
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
0 commit comments