@@ -439,13 +439,13 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
439439; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
440440; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
441441; GFX90a-NEXT: ; %bb.0:
442- ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
443- ; GFX90a-NEXT: global_load_ushort v2, v3, s[4:5] offset:24
444442; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
445443; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
444+ ; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
445+ ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
446446; GFX90a-NEXT: v_mov_b32_e32 v0, s1
447447; GFX90a-NEXT: v_mov_b32_e32 v1, s0
448- ; GFX90a-NEXT: s_waitcnt vmcnt(0)
448+ ; GFX90a-NEXT: v_mov_b32_e32 v2, s2
449449; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
450450; GFX90a-NEXT: s_endpgm
451451 %imp_arg_ptr = call ptr addrspace (4 ) @llvm.amdgcn.implicitarg.ptr ()
@@ -554,28 +554,27 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
554554; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
555555; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
556556; GFX940-NEXT: ; %bb.0:
557+ ; GFX940-NEXT: s_lshr_b32 s0, s9, 16
558+ ; GFX940-NEXT: s_lshr_b32 s1, s8, 16
559+ ; GFX940-NEXT: s_and_b32 s4, s9, 0xffff
557560; GFX940-NEXT: v_mov_b32_e32 v3, 0
558- ; GFX940-NEXT: global_load_ushort v2, v3, s[0:1] offset:30
559- ; GFX940-NEXT: s_lshr_b32 s0, s8, 16
560- ; GFX940-NEXT: s_and_b32 s1, s9, 0xffff
561- ; GFX940-NEXT: v_mov_b32_e32 v0, s0
562- ; GFX940-NEXT: v_mov_b32_e32 v1, s1
563- ; GFX940-NEXT: s_waitcnt vmcnt(0)
561+ ; GFX940-NEXT: v_mov_b32_e32 v0, s1
562+ ; GFX940-NEXT: v_mov_b32_e32 v1, s4
563+ ; GFX940-NEXT: v_mov_b32_e32 v2, s0
564564; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
565565; GFX940-NEXT: s_endpgm
566566;
567567; GFX90a-LABEL: preloadremainder_xyz:
568568; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
569569; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
570570; GFX90a-NEXT: ; %bb.0:
571+ ; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
572+ ; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
573+ ; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
571574; GFX90a-NEXT: v_mov_b32_e32 v3, 0
572- ; GFX90a-NEXT: global_load_dword v0, v3, s[4:5] offset:26
573- ; GFX90a-NEXT: global_load_ushort v2, v3, s[4:5] offset:30
574- ; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
575- ; GFX90a-NEXT: s_waitcnt vmcnt(1)
576- ; GFX90a-NEXT: v_lshrrev_b32_e32 v1, 16, v0
577- ; GFX90a-NEXT: v_mov_b32_e32 v0, s0
578- ; GFX90a-NEXT: s_waitcnt vmcnt(0)
575+ ; GFX90a-NEXT: v_mov_b32_e32 v0, s1
576+ ; GFX90a-NEXT: v_mov_b32_e32 v1, s2
577+ ; GFX90a-NEXT: v_mov_b32_e32 v2, s0
579578; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
580579; GFX90a-NEXT: s_endpgm
581580 %imp_arg_ptr = call ptr addrspace (4 ) @llvm.amdgcn.implicitarg.ptr ()
@@ -600,10 +599,8 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
600599; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
601600; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
602601; GFX940-NEXT: ; %bb.0:
603- ; GFX940-NEXT: s_load_dword s0, s[4:5], 0x1c
602+ ; GFX940-NEXT: s_lshr_b32 s0, s15, 16
604603; GFX940-NEXT: v_mov_b32_e32 v0, 0
605- ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
606- ; GFX940-NEXT: s_lshr_b32 s0, s0, 16
607604; GFX940-NEXT: v_mov_b32_e32 v1, s0
608605; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1
609606; GFX940-NEXT: s_endpgm
@@ -627,7 +624,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
627624 ret void
628625}
629626
630- ; Check for consistency between isel and earlier passes preload SGPR accounting.
627+ ; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs .
631628
632629define amdgpu_kernel void @preload_block_max_user_sgprs (ptr addrspace (1 ) inreg %out , i192 inreg %t0 , i32 inreg %t1 ) #0 {
633630; GFX940-LABEL: preload_block_max_user_sgprs:
@@ -655,4 +652,46 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
655652 ret void
656653}
657654
655+ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z (ptr addrspace (1 ) inreg %out ) #0 {
656+ ; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
657+ ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
658+ ; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
659+ ; GFX940-NEXT: ; %bb.0:
660+ ; GFX940-NEXT: s_lshr_b32 s0, s9, 16
661+ ; GFX940-NEXT: s_and_b32 s1, s8, 0xffff
662+ ; GFX940-NEXT: v_mov_b32_e32 v3, 0
663+ ; GFX940-NEXT: v_mov_b32_e32 v0, s6
664+ ; GFX940-NEXT: v_mov_b32_e32 v1, s1
665+ ; GFX940-NEXT: v_mov_b32_e32 v2, s0
666+ ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
667+ ; GFX940-NEXT: s_endpgm
668+ ;
669+ ; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
670+ ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
671+ ; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
672+ ; GFX90a-NEXT: ; %bb.0:
673+ ; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
674+ ; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
675+ ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
676+ ; GFX90a-NEXT: v_mov_b32_e32 v0, s10
677+ ; GFX90a-NEXT: v_mov_b32_e32 v1, s1
678+ ; GFX90a-NEXT: v_mov_b32_e32 v2, s0
679+ ; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
680+ ; GFX90a-NEXT: s_endpgm
681+ %imp_arg_ptr = call ptr addrspace (4 ) @llvm.amdgcn.implicitarg.ptr ()
682+ %gep0 = getelementptr i8 , ptr addrspace (4 ) %imp_arg_ptr , i32 8
683+ %gep1 = getelementptr i8 , ptr addrspace (4 ) %imp_arg_ptr , i32 16
684+ %gep2 = getelementptr i8 , ptr addrspace (4 ) %imp_arg_ptr , i32 22
685+ %load0 = load i32 , ptr addrspace (4 ) %gep0
686+ %load1 = load i16 , ptr addrspace (4 ) %gep1
687+ %load2 = load i16 , ptr addrspace (4 ) %gep2
688+ %conv1 = zext i16 %load1 to i32
689+ %conv2 = zext i16 %load2 to i32
690+ %ins.0 = insertelement <3 x i32 > poison, i32 %load0 , i32 0
691+ %ins.1 = insertelement <3 x i32 > %ins.0 , i32 %conv1 , i32 1
692+ %ins.2 = insertelement <3 x i32 > %ins.1 , i32 %conv2 , i32 2
693+ store <3 x i32 > %ins.2 , ptr addrspace (1 ) %out
694+ ret void
695+ }
696+
658697attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size" ="false" }
0 commit comments