@@ -480,29 +480,58 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
480480; GFX906-LABEL: v8i8_phi_chain:
481481; GFX906: ; %bb.0: ; %entry
482482; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
483- ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
484- ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
485- ; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
483+ ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
484+ ; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
485+ ; GFX906-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v0
486+ ; GFX906-NEXT: s_or_b64 s[2:3], vcc, s[0:1]
486487; GFX906-NEXT: s_waitcnt lgkmcnt(0)
487- ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9]
488- ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
488+ ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
489+ ; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
490+ ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
489491; GFX906-NEXT: s_cbranch_execz .LBB8_2
490- ; GFX906-NEXT: ; %bb.1: ; %bb.1
491- ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11]
492- ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
493- ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
494- ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
495- ; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
496- ; GFX906-NEXT: .LBB8_2: ; %Flow
497- ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
498- ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
499- ; GFX906-NEXT: s_cbranch_execz .LBB8_4
500- ; GFX906-NEXT: ; %bb.3: ; %bb.2
501- ; GFX906-NEXT: v_mov_b32_e32 v0, 0
492+ ; GFX906-NEXT: ; %bb.1: ; %bb.2
493+ ; GFX906-NEXT: s_waitcnt vmcnt(1)
494+ ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3
495+ ; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v4
496+ ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v4
502497; GFX906-NEXT: s_waitcnt vmcnt(0)
498+ ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1
499+ ; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v2
500+ ; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
501+ ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v3
502+ ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v3
503+ ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v4
504+ ; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v1
505+ ; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v1
506+ ; GFX906-NEXT: v_lshrrev_b32_e32 v14, 16, v2
507+ ; GFX906-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
508+ ; GFX906-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
509+ ; GFX906-NEXT: v_cndmask_b32_e32 v4, v13, v7, vcc
510+ ; GFX906-NEXT: v_cndmask_b32_e32 v7, v15, v9, vcc
511+ ; GFX906-NEXT: v_mov_b32_e32 v9, 8
512+ ; GFX906-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
513+ ; GFX906-NEXT: v_cndmask_b32_e32 v3, v11, v5, vcc
514+ ; GFX906-NEXT: v_cndmask_b32_e32 v5, v12, v6, vcc
515+ ; GFX906-NEXT: v_cndmask_b32_e32 v6, v14, v8, vcc
516+ ; GFX906-NEXT: v_mov_b32_e32 v8, 0xff
517+ ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
518+ ; GFX906-NEXT: v_and_or_b32 v0, v1, v8, v0
519+ ; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v3
520+ ; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v5
521+ ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
522+ ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
523+ ; GFX906-NEXT: v_or3_b32 v1, v0, v1, v3
524+ ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
525+ ; GFX906-NEXT: v_and_or_b32 v0, v2, v8, v0
526+ ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6
527+ ; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7
528+ ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
529+ ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
530+ ; GFX906-NEXT: v_or3_b32 v2, v0, v2, v3
531+ ; GFX906-NEXT: v_mov_b32_e32 v0, 0
503532; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
504- ; GFX906-NEXT: .LBB8_4 : ; %bb.3
505- ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3 ]
533+ ; GFX906-NEXT: .LBB8_2 : ; %bb.3
534+ ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1 ]
506535; GFX906-NEXT: v_mov_b32_e32 v0, 0
507536; GFX906-NEXT: s_waitcnt vmcnt(0)
508537; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
@@ -535,29 +564,50 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
535564; GFX906: ; %bb.0: ; %entry
536565; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
537566; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
538- ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
567+ ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
568+ ; GFX906-NEXT: v_cmp_gt_u32_e64 s[0:1], 15, v0
569+ ; GFX906-NEXT: s_and_b64 s[2:3], s[0:1], vcc
539570; GFX906-NEXT: s_waitcnt lgkmcnt(0)
540- ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
571+ ; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[8:9]
572+ ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[10:11]
573+ ; GFX906-NEXT: s_mov_b64 vcc, s[0:1]
574+ ; GFX906-NEXT: v_mov_b32_e32 v6, 8
575+ ; GFX906-NEXT: v_mov_b32_e32 v5, 0xff
576+ ; GFX906-NEXT: s_waitcnt vmcnt(1)
577+ ; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v1
541578; GFX906-NEXT: s_waitcnt vmcnt(0)
542- ; GFX906-NEXT: v_mov_b32_e32 v1, v3
543- ; GFX906-NEXT: v_mov_b32_e32 v2, v4
544- ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
545- ; GFX906-NEXT: s_cbranch_execz .LBB9_4
546- ; GFX906-NEXT: ; %bb.1: ; %bb.1
547- ; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
548- ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
549- ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
550- ; GFX906-NEXT: s_cbranch_execz .LBB9_3
551- ; GFX906-NEXT: ; %bb.2: ; %bb.2
579+ ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v3
580+ ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v2
581+ ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v4
582+ ; GFX906-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1]
583+ ; GFX906-NEXT: v_cndmask_b32_sdwa v9, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
584+ ; GFX906-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[0:1]
585+ ; GFX906-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[0:1]
586+ ; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
587+ ; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
588+ ; GFX906-NEXT: v_cndmask_b32_e64 v11, v2, v4, s[0:1]
589+ ; GFX906-NEXT: v_cndmask_b32_sdwa v10, v1, v3, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
590+ ; GFX906-NEXT: v_lshlrev_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
591+ ; GFX906-NEXT: v_and_or_b32 v0, v0, v5, v7
592+ ; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v9
593+ ; GFX906-NEXT: v_cndmask_b32_sdwa v8, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
594+ ; GFX906-NEXT: v_and_or_b32 v6, v11, v5, v6
595+ ; GFX906-NEXT: v_or3_b32 v5, v0, v7, v10
596+ ; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v8
597+ ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
598+ ; GFX906-NEXT: v_cndmask_b32_sdwa v7, v2, v4, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
599+ ; GFX906-NEXT: v_or3_b32 v6, v6, v0, v7
600+ ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
601+ ; GFX906-NEXT: s_cbranch_execz .LBB9_2
602+ ; GFX906-NEXT: ; %bb.1: ; %bb.2
603+ ; GFX906-NEXT: v_mov_b32_e32 v6, v4
552604; GFX906-NEXT: v_mov_b32_e32 v0, 0
553- ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
554- ; GFX906-NEXT: .LBB9_3: ; %Flow
555- ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
556- ; GFX906-NEXT: .LBB9_4: ; %bb.3
605+ ; GFX906-NEXT: v_mov_b32_e32 v5, v3
606+ ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
607+ ; GFX906-NEXT: .LBB9_2: ; %bb.3
557608; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
558609; GFX906-NEXT: v_mov_b32_e32 v0, 0
559- ; GFX906-NEXT: s_waitcnt vmcnt(0)
560- ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
610+ ; GFX906-NEXT: global_store_dwordx2 v0, v[5:6], s[14:15]
561611; GFX906-NEXT: s_endpgm
562612entry:
563613 %idx = call i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments