@@ -589,13 +589,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
589
589
; GFX8-NEXT: ; implicit-def: $vgpr1
590
590
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
591
591
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
592
- ; GFX8-NEXT: s_ff1_i32_b32 s5, s3
593
- ; GFX8-NEXT: s_ff1_i32_b32 s6, s2
594
- ; GFX8-NEXT: s_add_i32 s5, s5, 32
595
- ; GFX8-NEXT: s_min_u32 s5, s6, s5
592
+ ; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
593
+ ; GFX8-NEXT: s_mov_b32 m0, s5
596
594
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
597
595
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
598
- ; GFX8-NEXT: s_mov_b32 m0, s5
599
596
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
600
597
; GFX8-NEXT: s_add_i32 s4, s4, s8
601
598
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -633,13 +630,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
633
630
; GFX9-NEXT: ; implicit-def: $vgpr1
634
631
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
635
632
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
636
- ; GFX9-NEXT: s_ff1_i32_b32 s5, s3
637
- ; GFX9-NEXT: s_ff1_i32_b32 s6, s2
638
- ; GFX9-NEXT: s_add_i32 s5, s5, 32
639
- ; GFX9-NEXT: s_min_u32 s5, s6, s5
633
+ ; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
634
+ ; GFX9-NEXT: s_mov_b32 m0, s5
640
635
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
641
636
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
642
- ; GFX9-NEXT: s_mov_b32 m0, s5
643
637
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
644
638
; GFX9-NEXT: s_add_i32 s4, s4, s8
645
639
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -676,10 +670,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
676
670
; GFX10W64-NEXT: ; implicit-def: $vgpr1
677
671
; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop
678
672
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
679
- ; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
680
- ; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
681
- ; GFX10W64-NEXT: s_add_i32 s5, s5, 32
682
- ; GFX10W64-NEXT: s_min_u32 s5, s6, s5
673
+ ; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
683
674
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
684
675
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
685
676
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -758,16 +749,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
758
749
; GFX11W64-NEXT: ; implicit-def: $vgpr1
759
750
; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop
760
751
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
761
- ; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
762
- ; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
763
- ; GFX11W64-NEXT: s_add_i32 s5, s5, 32
764
- ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
765
- ; GFX11W64-NEXT: s_min_u32 s5, s6, s5
752
+ ; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
753
+ ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
766
754
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
767
755
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
768
756
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
769
757
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
770
- ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
771
758
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
772
759
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
773
760
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -849,16 +836,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
849
836
; GFX12W64-NEXT: ; implicit-def: $vgpr1
850
837
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
851
838
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
852
- ; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
853
- ; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
854
- ; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
855
- ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
856
- ; GFX12W64-NEXT: s_min_u32 s5, s6, s5
839
+ ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
840
+ ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
857
841
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
858
842
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
859
843
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
860
844
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
861
- ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
862
845
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
863
846
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
864
847
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -961,13 +944,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
961
944
; GFX8-NEXT: ; implicit-def: $vgpr1
962
945
; GFX8-NEXT: .LBB3_1: ; %ComputeLoop
963
946
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
964
- ; GFX8-NEXT: s_ff1_i32_b32 s5, s3
965
- ; GFX8-NEXT: s_ff1_i32_b32 s6, s2
966
- ; GFX8-NEXT: s_add_i32 s5, s5, 32
967
- ; GFX8-NEXT: s_min_u32 s5, s6, s5
947
+ ; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
948
+ ; GFX8-NEXT: s_mov_b32 m0, s5
968
949
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
969
950
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
970
- ; GFX8-NEXT: s_mov_b32 m0, s5
971
951
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
972
952
; GFX8-NEXT: s_add_i32 s4, s4, s8
973
953
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1007,13 +987,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
1007
987
; GFX9-NEXT: ; implicit-def: $vgpr1
1008
988
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
1009
989
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1010
- ; GFX9-NEXT: s_ff1_i32_b32 s5, s3
1011
- ; GFX9-NEXT: s_ff1_i32_b32 s6, s2
1012
- ; GFX9-NEXT: s_add_i32 s5, s5, 32
1013
- ; GFX9-NEXT: s_min_u32 s5, s6, s5
990
+ ; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
991
+ ; GFX9-NEXT: s_mov_b32 m0, s5
1014
992
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
1015
993
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
1016
- ; GFX9-NEXT: s_mov_b32 m0, s5
1017
994
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
1018
995
; GFX9-NEXT: s_add_i32 s4, s4, s8
1019
996
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1052,10 +1029,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
1052
1029
; GFX10W64-NEXT: ; implicit-def: $vgpr1
1053
1030
; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop
1054
1031
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
1055
- ; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
1056
- ; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
1057
- ; GFX10W64-NEXT: s_add_i32 s5, s5, 32
1058
- ; GFX10W64-NEXT: s_min_u32 s5, s6, s5
1032
+ ; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
1059
1033
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
1060
1034
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
1061
1035
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -1140,16 +1114,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
1140
1114
; GFX11W64-NEXT: ; implicit-def: $vgpr1
1141
1115
; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop
1142
1116
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
1143
- ; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
1144
- ; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
1145
- ; GFX11W64-NEXT: s_add_i32 s5, s5, 32
1146
- ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1147
- ; GFX11W64-NEXT: s_min_u32 s5, s6, s5
1117
+ ; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
1118
+ ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1148
1119
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
1149
1120
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
1150
1121
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
1151
1122
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
1152
- ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
1153
1123
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
1154
1124
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
1155
1125
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1237,16 +1207,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
1237
1207
; GFX12W64-NEXT: ; implicit-def: $vgpr1
1238
1208
; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop
1239
1209
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
1240
- ; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
1241
- ; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
1242
- ; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
1243
- ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1244
- ; GFX12W64-NEXT: s_min_u32 s5, s6, s5
1210
+ ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
1211
+ ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1245
1212
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
1246
1213
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
1247
1214
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
1248
1215
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
1249
- ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
1250
1216
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
1251
1217
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
1252
1218
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -2005,13 +1971,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
2005
1971
; GFX8-NEXT: ; implicit-def: $vgpr1
2006
1972
; GFX8-NEXT: .LBB7_1: ; %ComputeLoop
2007
1973
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
2008
- ; GFX8-NEXT: s_ff1_i32_b32 s5, s3
2009
- ; GFX8-NEXT: s_ff1_i32_b32 s6, s2
2010
- ; GFX8-NEXT: s_add_i32 s5, s5, 32
2011
- ; GFX8-NEXT: s_min_u32 s5, s6, s5
1974
+ ; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
1975
+ ; GFX8-NEXT: s_mov_b32 m0, s5
2012
1976
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
2013
1977
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
2014
- ; GFX8-NEXT: s_mov_b32 m0, s5
2015
1978
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
2016
1979
; GFX8-NEXT: s_add_i32 s4, s4, s8
2017
1980
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2049,13 +2012,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
2049
2012
; GFX9-NEXT: ; implicit-def: $vgpr1
2050
2013
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
2051
2014
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2052
- ; GFX9-NEXT: s_ff1_i32_b32 s5, s3
2053
- ; GFX9-NEXT: s_ff1_i32_b32 s6, s2
2054
- ; GFX9-NEXT: s_add_i32 s5, s5, 32
2055
- ; GFX9-NEXT: s_min_u32 s5, s6, s5
2015
+ ; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
2016
+ ; GFX9-NEXT: s_mov_b32 m0, s5
2056
2017
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
2057
2018
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
2058
- ; GFX9-NEXT: s_mov_b32 m0, s5
2059
2019
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
2060
2020
; GFX9-NEXT: s_add_i32 s4, s4, s8
2061
2021
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2092,10 +2052,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
2092
2052
; GFX10W64-NEXT: ; implicit-def: $vgpr1
2093
2053
; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop
2094
2054
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
2095
- ; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
2096
- ; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
2097
- ; GFX10W64-NEXT: s_add_i32 s5, s5, 32
2098
- ; GFX10W64-NEXT: s_min_u32 s5, s6, s5
2055
+ ; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
2099
2056
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
2100
2057
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
2101
2058
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -2174,16 +2131,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
2174
2131
; GFX11W64-NEXT: ; implicit-def: $vgpr1
2175
2132
; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop
2176
2133
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
2177
- ; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
2178
- ; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
2179
- ; GFX11W64-NEXT: s_add_i32 s5, s5, 32
2180
- ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2181
- ; GFX11W64-NEXT: s_min_u32 s5, s6, s5
2134
+ ; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
2135
+ ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
2182
2136
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
2183
2137
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
2184
2138
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
2185
2139
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
2186
- ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
2187
2140
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
2188
2141
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
2189
2142
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2266,16 +2219,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
2266
2219
; GFX12W64-NEXT: ; implicit-def: $vgpr1
2267
2220
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
2268
2221
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
2269
- ; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
2270
- ; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
2271
- ; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
2272
- ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2273
- ; GFX12W64-NEXT: s_min_u32 s5, s6, s5
2222
+ ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
2223
+ ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
2274
2224
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
2275
2225
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
2276
2226
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
2277
2227
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
2278
- ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
2279
2228
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
2280
2229
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
2281
2230
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
0 commit comments