@@ -383,12 +383,7 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
383383; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
384384; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
385385; GFX8-NEXT: s_waitcnt lgkmcnt(0)
386- ; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
387- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
388- ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
389- ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
390- ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
391- ; GFX8-NEXT: s_or_b32 s2, s3, s2
386+ ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
392387; GFX8-NEXT: v_mov_b32_e32 v0, s0
393388; GFX8-NEXT: v_mov_b32_e32 v1, s1
394389; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -401,44 +396,22 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
401396; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
402397; GFX9-NEXT: v_mov_b32_e32 v0, 0
403398; GFX9-NEXT: s_waitcnt lgkmcnt(0)
404- ; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
405- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
406- ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
407- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
399+ ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
408400; GFX9-NEXT: v_mov_b32_e32 v1, s2
409401; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
410402; GFX9-NEXT: s_endpgm
411403;
412- ; GFX11-TRUE16-LABEL: s_fneg_v2bf16:
413- ; GFX11-TRUE16: ; %bb.0:
414- ; GFX11-TRUE16-NEXT: s_clause 0x1
415- ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
416- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
417- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
418- ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
419- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
420- ; GFX11-TRUE16-NEXT: s_xor_b32 s3, s3, 0x8000
421- ; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
422- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
423- ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
424- ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
425- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
426- ; GFX11-TRUE16-NEXT: s_endpgm
427- ;
428- ; GFX11-FAKE16-LABEL: s_fneg_v2bf16:
429- ; GFX11-FAKE16: ; %bb.0:
430- ; GFX11-FAKE16-NEXT: s_clause 0x1
431- ; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
432- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
433- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
434- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
435- ; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
436- ; GFX11-FAKE16-NEXT: s_xor_b32 s3, s3, 0x8000
437- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
438- ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
439- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
440- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
441- ; GFX11-FAKE16-NEXT: s_endpgm
404+ ; GFX11-LABEL: s_fneg_v2bf16:
405+ ; GFX11: ; %bb.0:
406+ ; GFX11-NEXT: s_clause 0x1
407+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
408+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
409+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
410+ ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
411+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
412+ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
413+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
414+ ; GFX11-NEXT: s_endpgm
442415 %fneg = fsub <2 x bfloat> <bfloat -0 .0 , bfloat -0 .0 >, %in
443416 store <2 x bfloat> %fneg , ptr addrspace (1 ) %out
444417 ret void
@@ -473,15 +446,10 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
473446; GFX8-NEXT: ;;#ASMSTART
474447; GFX8-NEXT: ; def s2
475448; GFX8-NEXT: ;;#ASMEND
476- ; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
477- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
478- ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
479- ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
480- ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
481- ; GFX8-NEXT: s_or_b32 s2, s3, s2
449+ ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
450+ ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
482451; GFX8-NEXT: s_waitcnt lgkmcnt(0)
483452; GFX8-NEXT: v_mov_b32_e32 v0, s0
484- ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
485453; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
486454; GFX8-NEXT: v_mov_b32_e32 v1, s1
487455; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -494,10 +462,7 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
494462; GFX9-NEXT: ;;#ASMSTART
495463; GFX9-NEXT: ; def s2
496464; GFX9-NEXT: ;;#ASMEND
497- ; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
498- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
499- ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
500- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
465+ ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
501466; GFX9-NEXT: v_mov_b32_e32 v0, 0
502467; GFX9-NEXT: v_mov_b32_e32 v1, s2
503468; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -510,11 +475,8 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
510475; GFX11-NEXT: ;;#ASMSTART
511476; GFX11-NEXT: ; def s2
512477; GFX11-NEXT: ;;#ASMEND
513- ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
514- ; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
515- ; GFX11-NEXT: s_xor_b32 s3, s3, 0x8000
516- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
517- ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3
478+ ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
479+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
518480; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
519481; GFX11-NEXT: s_waitcnt lgkmcnt(0)
520482; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -561,59 +523,34 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1)
561523; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
562524; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
563525; GFX8-NEXT: flat_load_dword v2, v[0:1]
564- ; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000
565526; GFX8-NEXT: s_waitcnt vmcnt(0)
566- ; GFX8-NEXT: v_xor_b32_e32 v4, 0x8000, v2
567- ; GFX8-NEXT: v_xor_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
568- ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
527+ ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
569528; GFX8-NEXT: flat_store_dword v[0:1], v2
570529; GFX8-NEXT: s_endpgm
571530;
572531; GFX9-LABEL: v_fneg_v2bf16:
573532; GFX9: ; %bb.0:
574533; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
575534; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
576- ; GFX9-NEXT: s_mov_b32 s2, 0x8000
577535; GFX9-NEXT: s_waitcnt lgkmcnt(0)
578536; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
579537; GFX9-NEXT: s_waitcnt vmcnt(0)
580- ; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v1
581- ; GFX9-NEXT: v_xor_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
582- ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
583- ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s2
538+ ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
584539; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
585540; GFX9-NEXT: s_endpgm
586541;
587- ; GFX11-TRUE16-LABEL: v_fneg_v2bf16:
588- ; GFX11-TRUE16: ; %bb.0:
589- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
590- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
591- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
592- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
593- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
594- ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
595- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
596- ; GFX11-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
597- ; GFX11-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v1.h
598- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
599- ; GFX11-TRUE16-NEXT: s_endpgm
600- ;
601- ; GFX11-FAKE16-LABEL: v_fneg_v2bf16:
602- ; GFX11-FAKE16: ; %bb.0:
603- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
604- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
605- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
606- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
607- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
608- ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
609- ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
610- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
611- ; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
612- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
613- ; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
614- ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
615- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
616- ; GFX11-FAKE16-NEXT: s_endpgm
542+ ; GFX11-LABEL: v_fneg_v2bf16:
543+ ; GFX11: ; %bb.0:
544+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
545+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
546+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
547+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
548+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
549+ ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
550+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
551+ ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
552+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
553+ ; GFX11-NEXT: s_endpgm
617554 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
618555 %gep.in = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
619556 %gep.out = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
@@ -651,12 +588,7 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
651588; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
652589; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
653590; GFX8-NEXT: s_waitcnt lgkmcnt(0)
654- ; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
655- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
656- ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
657- ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
658- ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
659- ; GFX8-NEXT: s_or_b32 s2, s3, s2
591+ ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
660592; GFX8-NEXT: v_mov_b32_e32 v0, s0
661593; GFX8-NEXT: v_mov_b32_e32 v1, s1
662594; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -669,44 +601,22 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
669601; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
670602; GFX9-NEXT: v_mov_b32_e32 v0, 0
671603; GFX9-NEXT: s_waitcnt lgkmcnt(0)
672- ; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
673- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
674- ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
675- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
604+ ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
676605; GFX9-NEXT: v_mov_b32_e32 v1, s2
677606; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
678607; GFX9-NEXT: s_endpgm
679608;
680- ; GFX11-TRUE16-LABEL: fneg_free_v2bf16:
681- ; GFX11-TRUE16: ; %bb.0:
682- ; GFX11-TRUE16-NEXT: s_clause 0x1
683- ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
684- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
685- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
686- ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
687- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
688- ; GFX11-TRUE16-NEXT: s_xor_b32 s3, s3, 0x8000
689- ; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
690- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
691- ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
692- ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
693- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
694- ; GFX11-TRUE16-NEXT: s_endpgm
695- ;
696- ; GFX11-FAKE16-LABEL: fneg_free_v2bf16:
697- ; GFX11-FAKE16: ; %bb.0:
698- ; GFX11-FAKE16-NEXT: s_clause 0x1
699- ; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
700- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
701- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
702- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
703- ; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
704- ; GFX11-FAKE16-NEXT: s_xor_b32 s3, s3, 0x8000
705- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
706- ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
707- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
708- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
709- ; GFX11-FAKE16-NEXT: s_endpgm
609+ ; GFX11-LABEL: fneg_free_v2bf16:
610+ ; GFX11: ; %bb.0:
611+ ; GFX11-NEXT: s_clause 0x1
612+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
613+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
614+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
615+ ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
616+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
617+ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
618+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
619+ ; GFX11-NEXT: s_endpgm
710620 %bc = bitcast i32 %in to <2 x bfloat>
711621 %fsub = fsub <2 x bfloat> <bfloat -0 .0 , bfloat -0 .0 >, %bc
712622 store <2 x bfloat> %fsub , ptr addrspace (1 ) %out
@@ -754,12 +664,12 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
754664; GFX8-NEXT: v_mov_b32_e32 v0, s0
755665; GFX8-NEXT: v_mov_b32_e32 v1, s1
756666; GFX8-NEXT: s_waitcnt vmcnt(0)
757- ; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
758- ; GFX8-NEXT: v_xor_b32_sdwa v3 , v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
759- ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16 , v2
760- ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
761- ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v5
762- ; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
667+ ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
668+ ; GFX8-NEXT: v_xor_b32_sdwa v5 , v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
669+ ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000 , v2
670+ ; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
671+ ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4
672+ ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
763673; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
764674; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
765675; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
@@ -786,22 +696,22 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
786696; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
787697; GFX9-NEXT: s_mov_b32 s2, 0x8000
788698; GFX9-NEXT: s_waitcnt vmcnt(0)
699+ ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
789700; GFX9-NEXT: v_xor_b32_sdwa v4, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
790- ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
791- ; GFX9-NEXT: v_xor_b32_sdwa v3, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
792- ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
793- ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5
794- ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
795- ; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
701+ ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
702+ ; GFX9-NEXT: v_xor_b32_sdwa v1, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
703+ ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
704+ ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
705+ ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
796706; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 1
797- ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
798- ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
707+ ; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
708+ ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
799709; GFX9-NEXT: v_add_u32_e32 v6, v6, v1
800- ; GFX9-NEXT: v_add_u32_e32 v3 , 0x7fff, v3
801- ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
710+ ; GFX9-NEXT: v_add_u32_e32 v4 , 0x7fff, v4
711+ ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
802712; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
803713; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6
804- ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3 , v5, vcc
714+ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4 , v5, vcc
805715; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
806716; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
807717; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -1024,10 +934,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1024934; GFX8-NEXT: v_mov_b32_e32 v0, s0
1025935; GFX8-NEXT: v_mov_b32_e32 v1, s1
1026936; GFX8-NEXT: flat_load_dword v0, v[0:1]
1027- ; GFX8-NEXT: v_mov_b32_e32 v1, 0x8000
1028937; GFX8-NEXT: s_waitcnt vmcnt(0)
1029- ; GFX8-NEXT: v_xor_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1030- ; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000 , v0
938+ ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
939+ ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16 , v0
1031940; GFX8-NEXT: flat_store_short v[0:1], v0
1032941; GFX8-NEXT: s_waitcnt vmcnt(0)
1033942; GFX8-NEXT: flat_store_short v[0:1], v1
@@ -1040,13 +949,11 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1040949; GFX9-NEXT: v_mov_b32_e32 v0, 0
1041950; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1042951; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1043- ; GFX9-NEXT: s_mov_b32 s0, 0x8000
1044952; GFX9-NEXT: s_waitcnt vmcnt(0)
1045- ; GFX9-NEXT: v_xor_b32_sdwa v1, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1046- ; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
953+ ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
1047954; GFX9-NEXT: global_store_short v[0:1], v0, off
1048955; GFX9-NEXT: s_waitcnt vmcnt(0)
1049- ; GFX9-NEXT: global_store_short v[0:1], v1 , off
956+ ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0 , off
1050957; GFX9-NEXT: s_waitcnt vmcnt(0)
1051958; GFX9-NEXT: s_endpgm
1052959;
@@ -1057,13 +964,10 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1057964; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1058965; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1059966; GFX11-NEXT: s_waitcnt vmcnt(0)
1060- ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1061- ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1062- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1063- ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
967+ ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
1064968; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
1065969; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1066- ; GFX11-NEXT: global_store_b16 v[0:1], v1 , off dlc
970+ ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0 , off dlc
1067971; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1068972; GFX11-NEXT: s_endpgm
1069973 %val = load <2 x bfloat>, ptr addrspace (1 ) %in
0 commit comments