@@ -872,66 +872,66 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
872872; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
873873; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
874874; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
875- ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
876875; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
876+ ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
877877; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16
878878; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0
879- ; GFX7-HSA-NEXT: v_mov_b32_e32 v7 , s10
880- ; GFX7-HSA-NEXT: v_mov_b32_e32 v8 , s11
879+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v5 , s10
880+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v6 , s11
881881; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
882882; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
883883; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
884884; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
885885; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
886- ; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
887- ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8
888- ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
889- ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
890- ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
891- ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
892- ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s9
886+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
887+ ; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
893888; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
894- ; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
889+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
890+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
891+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2
892+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3
893+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
895894; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
896- ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
897- ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
898- ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
899- ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14
900- ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
901- ; GFX7-HSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
895+ ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
896+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
897+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s1
898+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
899+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
900+ ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
901+ ; GFX7-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
902902; GFX7-HSA-NEXT: s_endpgm
903903;
904904; GFX8-NOHSA-LABEL: constant_load_v11i32:
905905; GFX8-NOHSA: ; %bb.0: ; %entry
906906; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
907907; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
908- ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20
909908; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
909+ ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20
910910; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
911911; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
912- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8 , s3
913- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7 , s2
912+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6 , s3
913+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5 , s2
914914; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
915915; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
916916; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
917917; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
918918; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
919- ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
920- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
921- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
922- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
923- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
924- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
925- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
926- ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
927- ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
928- ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
919+ ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
920+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
929921; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
930- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
931- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
932- ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s14
933922; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
934- ; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
923+ ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
924+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
925+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
926+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7
927+ ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
928+ ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
929+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
930+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
931+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
932+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
933+ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
934+ ; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
935935; GFX8-NOHSA-NEXT: s_endpgm
936936;
937937; EG-LABEL: constant_load_v11i32:
@@ -969,45 +969,45 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
969969; GFX9-HSA-LABEL: constant_load_v11i32:
970970; GFX9-HSA: ; %bb.0: ; %entry
971971; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
972- ; GFX9-HSA-NEXT: v_mov_b32_e32 v7 , 0
972+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v8 , 0
973973; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0)
974- ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20
975974; GFX9-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
975+ ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20
976976; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0)
977- ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s12
978977; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s4
979978; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s5
980979; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s6
981980; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s7
982- ; GFX9-HSA-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9] offset:16
983- ; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s13
984- ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s0
985- ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s1
986- ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s2
987- ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3
988- ; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s14
989- ; GFX9-HSA-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9]
990- ; GFX9-HSA-NEXT: global_store_dwordx3 v7, v[4:6], s[8:9] offset:32
981+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s0
982+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s1
983+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s2
984+ ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] offset:16
985+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s3
986+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s12
987+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s13
988+ ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s14
989+ ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9]
990+ ; GFX9-HSA-NEXT: global_store_dwordx3 v8, v[0:2], s[8:9] offset:32
991991; GFX9-HSA-NEXT: s_endpgm
992992;
993993; GFX12-LABEL: constant_load_v11i32:
994994; GFX12: ; %bb.0: ; %entry
995995; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
996996; GFX12-NEXT: s_wait_kmcnt 0x0
997997; GFX12-NEXT: s_clause 0x1
998- ; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20
999998; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
999+ ; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20
10001000; GFX12-NEXT: s_wait_kmcnt 0x0
1001- ; GFX12-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v8, s12
1001+ ; GFX12-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v0, s4
1002+ ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
1003+ ; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s0
1004+ ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v6, s2
1005+ ; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v8, s12
10021006; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14
1003- ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
1004- ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
1005- ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
1006- ; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
10071007; GFX12-NEXT: s_clause 0x2
1008- ; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
10091008; GFX12-NEXT: global_store_b128 v11, v[0:3], s[8:9] offset:16
10101009; GFX12-NEXT: global_store_b128 v11, v[4:7], s[8:9]
1010+ ; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
10111011; GFX12-NEXT: s_endpgm
10121012entry:
10131013 %ld = load <11 x i32 >, ptr addrspace (4 ) %in
0 commit comments