@@ -176,12 +176,7 @@ define amdgpu_kernel void @s_fabs_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
176176; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
177177; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
178178; VI-NEXT: s_waitcnt lgkmcnt(0)
179- ; VI-NEXT: s_and_b32 s3, s2, 0x7fff
180- ; VI-NEXT: s_lshr_b32 s2, s2, 16
181- ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
182- ; VI-NEXT: s_and_b32 s3, 0xffff, s3
183- ; VI-NEXT: s_lshl_b32 s2, s2, 16
184- ; VI-NEXT: s_or_b32 s2, s3, s2
179+ ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
185180; VI-NEXT: v_mov_b32_e32 v0, s0
186181; VI-NEXT: v_mov_b32_e32 v1, s1
187182; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -194,44 +189,22 @@ define amdgpu_kernel void @s_fabs_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
194189; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
195190; GFX9-NEXT: v_mov_b32_e32 v0, 0
196191; GFX9-NEXT: s_waitcnt lgkmcnt(0)
197- ; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff
198- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
199- ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
200- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
192+ ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
201193; GFX9-NEXT: v_mov_b32_e32 v1, s2
202194; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
203195; GFX9-NEXT: s_endpgm
204196;
205- ; GFX11-TRUE16-LABEL: s_fabs_v2bf16:
206- ; GFX11-TRUE16: ; %bb.0:
207- ; GFX11-TRUE16-NEXT: s_clause 0x1
208- ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
209- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
210- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
211- ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
212- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
213- ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0x7fff
214- ; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
215- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
216- ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
217- ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
218- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
219- ; GFX11-TRUE16-NEXT: s_endpgm
220- ;
221- ; GFX11-FAKE16-LABEL: s_fabs_v2bf16:
222- ; GFX11-FAKE16: ; %bb.0:
223- ; GFX11-FAKE16-NEXT: s_clause 0x1
224- ; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
225- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
226- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
227- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
228- ; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
229- ; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0x7fff
230- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
231- ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
232- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
233- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
234- ; GFX11-FAKE16-NEXT: s_endpgm
197+ ; GFX11-LABEL: s_fabs_v2bf16:
198+ ; GFX11: ; %bb.0:
199+ ; GFX11-NEXT: s_clause 0x1
200+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
201+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
202+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
203+ ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
204+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
205+ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
206+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
207+ ; GFX11-NEXT: s_endpgm
235208 %fabs = call <2 x bfloat> @llvm.fabs.v2bf16 (<2 x bfloat> %in )
236209 store <2 x bfloat> %fabs , ptr addrspace (1 ) %out
237210 ret void
@@ -492,59 +465,34 @@ define amdgpu_kernel void @v_fabs_v2bf16(ptr addrspace(1) %out, ptr addrspace(1)
492465; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
493466; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
494467; VI-NEXT: flat_load_dword v2, v[0:1]
495- ; VI-NEXT: v_mov_b32_e32 v3, 0x7fff
496468; VI-NEXT: s_waitcnt vmcnt(0)
497- ; VI-NEXT: v_and_b32_e32 v4, 0x7fff, v2
498- ; VI-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
499- ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
469+ ; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
500470; VI-NEXT: flat_store_dword v[0:1], v2
501471; VI-NEXT: s_endpgm
502472;
503473; GFX9-LABEL: v_fabs_v2bf16:
504474; GFX9: ; %bb.0:
505475; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
506476; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
507- ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
508477; GFX9-NEXT: s_waitcnt lgkmcnt(0)
509478; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
510479; GFX9-NEXT: s_waitcnt vmcnt(0)
511- ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff, v1
512- ; GFX9-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
513- ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
514- ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s2
480+ ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
515481; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
516482; GFX9-NEXT: s_endpgm
517483;
518- ; GFX11-TRUE16-LABEL: v_fabs_v2bf16:
519- ; GFX11-TRUE16: ; %bb.0:
520- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
521- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
522- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
523- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
524- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
525- ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
526- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
527- ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l
528- ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v1.h
529- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
530- ; GFX11-TRUE16-NEXT: s_endpgm
531- ;
532- ; GFX11-FAKE16-LABEL: v_fabs_v2bf16:
533- ; GFX11-FAKE16: ; %bb.0:
534- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
535- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
536- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
537- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
538- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
539- ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
540- ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
541- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
542- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
543- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
544- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2
545- ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
546- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
547- ; GFX11-FAKE16-NEXT: s_endpgm
484+ ; GFX11-LABEL: v_fabs_v2bf16:
485+ ; GFX11: ; %bb.0:
486+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
487+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
488+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
489+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
490+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
491+ ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
492+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
493+ ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
494+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
495+ ; GFX11-NEXT: s_endpgm
548496 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
549497 %gep.in = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
550498 %gep.out = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
@@ -661,12 +609,12 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
661609; VI-NEXT: v_mov_b32_e32 v0, s0
662610; VI-NEXT: v_mov_b32_e32 v1, s1
663611; VI-NEXT: s_waitcnt vmcnt(0)
664- ; VI-NEXT: v_and_b32_sdwa v4, v3 , v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
665- ; VI-NEXT: v_and_b32_sdwa v3 , v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
666- ; VI-NEXT: v_lshlrev_b32_e32 v5, 16 , v2
667- ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000 , v2
668- ; VI-NEXT: v_mul_f32_e32 v3, v3, v5
669- ; VI-NEXT: v_mul_f32_e32 v2, v4, v2
612+ ; VI-NEXT: v_lshlrev_b32_e32 v4, 16 , v2
613+ ; VI-NEXT: v_and_b32_sdwa v5 , v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
614+ ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000 , v2
615+ ; VI-NEXT: v_and_b32_sdwa v2, v3 , v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
616+ ; VI-NEXT: v_mul_f32_e32 v3, v5, v4
617+ ; VI-NEXT: v_mul_f32_e32 v2, v2, v6
670618; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
671619; VI-NEXT: v_bfe_u32 v6, v2, 16, 1
672620; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
@@ -693,20 +641,20 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
693641; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
694642; GFX9-NEXT: s_movk_i32 s2, 0x7fff
695643; GFX9-NEXT: s_waitcnt vmcnt(0)
644+ ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
696645; GFX9-NEXT: v_and_b32_sdwa v3, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
697- ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
698- ; GFX9-NEXT: v_and_b32_sdwa v2, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
699- ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
700- ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
701- ; GFX9-NEXT: v_mul_f32_e32 v0, v2, v0
702- ; GFX9-NEXT: v_bfe_u32 v2, v3, 16, 1
703- ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3
646+ ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
647+ ; GFX9-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
648+ ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
649+ ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
650+ ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
651+ ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
704652; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1
705- ; GFX9-NEXT: v_add3_u32 v2, v2, v3 , s2
706- ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
653+ ; GFX9-NEXT: v_add3_u32 v3, v3, v2 , s2
654+ ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
707655; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0
708656; GFX9-NEXT: v_add3_u32 v5, v5, v0, s2
709- ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2 , v4, vcc
657+ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3 , v4, vcc
710658; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
711659; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
712660; GFX9-NEXT: s_mov_b32 s2, 0x7060302
@@ -846,24 +794,24 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
846794; VI-NEXT: s_and_b32 s1, s4, 0xffff0000
847795; VI-NEXT: s_movk_i32 s2, 0x7fff
848796; VI-NEXT: s_waitcnt vmcnt(0)
849- ; VI-NEXT: v_and_b32_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
850- ; VI-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
851- ; VI-NEXT: v_mul_f32_e32 v2 , s0, v2
852- ; VI-NEXT: v_mul_f32_e32 v3 , s1, v4
853- ; VI-NEXT: v_bfe_u32 v4, v2 , 16, 1
854- ; VI-NEXT: v_bfe_u32 v6, v3 , 16, 1
855- ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
856- ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
797+ ; VI-NEXT: v_and_b32_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
798+ ; VI-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
799+ ; VI-NEXT: v_mul_f32_e32 v3 , s0, v4
800+ ; VI-NEXT: v_mul_f32_e32 v2 , s1, v2
801+ ; VI-NEXT: v_bfe_u32 v4, v3 , 16, 1
802+ ; VI-NEXT: v_bfe_u32 v6, v2 , 16, 1
803+ ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
804+ ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
857805; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
858806; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
859- ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
860- ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
861- ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
862- ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
807+ ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
863808; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
864- ; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
865- ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
866- ; VI-NEXT: v_alignbit_b32 v2, v3, v2, 16
809+ ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
810+ ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
811+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
812+ ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
813+ ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
814+ ; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16
867815; VI-NEXT: flat_store_dword v[0:1], v2
868816; VI-NEXT: s_endpgm
869817;
@@ -879,22 +827,22 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
879827; GFX9-NEXT: s_lshl_b32 s3, s4, 16
880828; GFX9-NEXT: s_and_b32 s4, s4, 0xffff0000
881829; GFX9-NEXT: s_waitcnt vmcnt(0)
882- ; GFX9-NEXT: v_and_b32_sdwa v2, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
883- ; GFX9-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
884- ; GFX9-NEXT: v_mul_f32_e32 v0, s3, v0
885- ; GFX9-NEXT: v_mul_f32_e32 v2, s4, v2
886- ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
887- ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
888- ; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1
889- ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s2
890- ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
891- ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2
892- ; GFX9-NEXT: v_add3_u32 v5, v5, v2, s2
893- ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
830+ ; GFX9-NEXT: v_and_b32_sdwa v2, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
831+ ; GFX9-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
832+ ; GFX9-NEXT: v_mul_f32_e32 v2, s3, v2
833+ ; GFX9-NEXT: v_mul_f32_e32 v0, s4, v0
834+ ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
835+ ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
836+ ; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1
837+ ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s2
894838; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
895- ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
839+ ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0
840+ ; GFX9-NEXT: v_add3_u32 v5, v5, v0, s2
841+ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
842+ ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
843+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
896844; GFX9-NEXT: s_mov_b32 s2, 0x7060302
897- ; GFX9-NEXT: v_perm_b32 v0, v2, v0 , s2
845+ ; GFX9-NEXT: v_perm_b32 v0, v0, v2 , s2
898846; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
899847; GFX9-NEXT: s_endpgm
900848;
@@ -1194,10 +1142,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2bf16(ptr addrspace(1) %in) #
11941142; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
11951143; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
11961144; VI-NEXT: flat_load_dword v0, v[0:1]
1197- ; VI-NEXT: v_mov_b32_e32 v1, 0x7fff
11981145; VI-NEXT: s_waitcnt vmcnt(0)
1199- ; VI-NEXT: v_and_b32_sdwa v1, v1 , v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1200- ; VI-NEXT: v_and_b32_e32 v0, 0x7fff , v0
1146+ ; VI-NEXT: v_and_b32_e32 v0, 0x7fff7fff , v0
1147+ ; VI-NEXT: v_lshrrev_b32_e32 v1, 16 , v0
12011148; VI-NEXT: flat_store_short v[0:1], v0
12021149; VI-NEXT: s_waitcnt vmcnt(0)
12031150; VI-NEXT: flat_store_short v[0:1], v1
@@ -1210,51 +1157,29 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2bf16(ptr addrspace(1) %in) #
12101157; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
12111158; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12121159; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1213- ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
12141160; GFX9-NEXT: s_waitcnt vmcnt(0)
1215- ; GFX9-NEXT: v_and_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1216- ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
1161+ ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
12171162; GFX9-NEXT: global_store_short v[0:1], v0, off
12181163; GFX9-NEXT: s_waitcnt vmcnt(0)
1219- ; GFX9-NEXT: global_store_short v[0:1], v1 , off
1164+ ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0 , off
12201165; GFX9-NEXT: s_waitcnt vmcnt(0)
12211166; GFX9-NEXT: s_endpgm
12221167;
1223- ; GFX11-TRUE16-LABEL: v_extract_fabs_no_fold_v2bf16:
1224- ; GFX11-TRUE16: ; %bb.0:
1225- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1226- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1227- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1228- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1229- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1230- ; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1]
1231- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1232- ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
1233- ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.h
1234- ; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc
1235- ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
1236- ; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
1237- ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
1238- ; GFX11-TRUE16-NEXT: s_endpgm
1239- ;
1240- ; GFX11-FAKE16-LABEL: v_extract_fabs_no_fold_v2bf16:
1241- ; GFX11-FAKE16: ; %bb.0:
1242- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1243- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1244- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1245- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1246- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1247- ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1]
1248- ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1249- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1250- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
1251- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1252- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
1253- ; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off dlc
1254- ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1255- ; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v1, off dlc
1256- ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1257- ; GFX11-FAKE16-NEXT: s_endpgm
1168+ ; GFX11-LABEL: v_extract_fabs_no_fold_v2bf16:
1169+ ; GFX11: ; %bb.0:
1170+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1171+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1172+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1173+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1174+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1175+ ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1176+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
1177+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1178+ ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
1179+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1180+ ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
1181+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1182+ ; GFX11-NEXT: s_endpgm
12581183 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
12591184 %gep.in = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
12601185 %val = load <2 x bfloat>, ptr addrspace (1 ) %gep.in
0 commit comments