@@ -45,151 +45,151 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg noundef
4545; GFX942-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
4646; GFX942-NEXT: v_trunc_f32_e32 v1, v1
4747; GFX942-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
48- ; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v1
49- ; GFX942-NEXT: v_cvt_u32_f32_e32 v5, v0
50- ; GFX942-NEXT: v_mul_lo_u32 v0, s1, v4
51- ; GFX942-NEXT: v_mul_hi_u32 v2, s1, v5
52- ; GFX942-NEXT: v_mul_lo_u32 v1, s3, v5
53- ; GFX942-NEXT: v_add_u32_e32 v0, v2, v0
54- ; GFX942-NEXT: v_mul_lo_u32 v6, s1, v5
55- ; GFX942-NEXT: v_add_u32_e32 v7, v0, v1
56- ; GFX942-NEXT: v_mul_hi_u32 v3, v5, v7
57- ; GFX942-NEXT: v_mul_lo_u32 v2, v5, v7
58- ; GFX942-NEXT: v_mul_hi_u32 v0, v5, v6
59- ; GFX942-NEXT: v_mov_b32_e32 v1, 0
60- ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
61- ; GFX942-NEXT: v_mul_hi_u32 v0, v4, v6
62- ; GFX942-NEXT: v_mul_lo_u32 v6, v4, v6
63- ; GFX942-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
64- ; GFX942-NEXT: v_mul_hi_u32 v8, v4, v7
65- ; GFX942-NEXT: s_nop 0
66- ; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
67- ; GFX942-NEXT: v_mul_lo_u32 v2, v4, v7
68- ; GFX942-NEXT: s_nop 0
69- ; GFX942-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v8, vcc
70- ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
71- ; GFX942-NEXT: v_add_co_u32_e32 v5, vcc, v5, v2
72- ; GFX942-NEXT: v_mul_hi_u32 v2, s1, v5
73- ; GFX942-NEXT: s_nop 0
74- ; GFX942-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v3, vcc
75- ; GFX942-NEXT: v_mul_lo_u32 v0, s1, v4
76- ; GFX942-NEXT: v_add_u32_e32 v0, v2, v0
77- ; GFX942-NEXT: v_mul_lo_u32 v2, s3, v5
78- ; GFX942-NEXT: v_add_u32_e32 v6, v0, v2
79- ; GFX942-NEXT: v_mul_lo_u32 v0, s1, v5
80- ; GFX942-NEXT: v_mul_hi_u32 v8, v4, v0
81- ; GFX942-NEXT: v_mul_lo_u32 v9, v4, v0
82- ; GFX942-NEXT: v_mul_hi_u32 v3, v5, v6
83- ; GFX942-NEXT: v_mul_lo_u32 v2, v5, v6
84- ; GFX942-NEXT: v_mul_hi_u32 v0, v5, v0
85- ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
86- ; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, v2, v9
87- ; GFX942-NEXT: v_mul_hi_u32 v7, v4, v6
88- ; GFX942-NEXT: s_nop 0
89- ; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v8, vcc
90- ; GFX942-NEXT: v_mul_lo_u32 v2, v4, v6
91- ; GFX942-NEXT: s_nop 0
92- ; GFX942-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
93- ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
94- ; GFX942-NEXT: v_add_co_u32_e32 v5, vcc, v5, v2
95- ; GFX942-NEXT: v_mul_hi_u32 v0, s6, v5
96- ; GFX942-NEXT: s_nop 0
97- ; GFX942-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v3, vcc
98- ; GFX942-NEXT: v_mul_hi_u32 v3, s6, v4
99- ; GFX942-NEXT: v_mul_lo_u32 v2, s6, v4
100- ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
101- ; GFX942-NEXT: v_mul_hi_u32 v0, s7, v5
102- ; GFX942-NEXT: v_mul_lo_u32 v5, s7, v5
103- ; GFX942-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
104- ; GFX942-NEXT: v_mul_hi_u32 v6, s7, v4
105- ; GFX942-NEXT: s_nop 0
106- ; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
107- ; GFX942-NEXT: v_mul_lo_u32 v2, s7, v4
108- ; GFX942-NEXT: s_nop 0
109- ; GFX942-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v6, vcc
110- ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
111- ; GFX942-NEXT: v_mul_lo_u32 v2, s12, v1
112- ; GFX942-NEXT: v_mul_hi_u32 v3, s12, v0
113- ; GFX942-NEXT: v_add_u32_e32 v2, v3, v2
114- ; GFX942-NEXT: v_mul_lo_u32 v3, s13, v0
115- ; GFX942-NEXT: v_add_u32_e32 v6, v2, v3
116- ; GFX942-NEXT: v_mul_lo_u32 v3, s12, v0
117- ; GFX942-NEXT: v_sub_u32_e32 v2, s7, v6
118- ; GFX942-NEXT: v_mov_b32_e32 v4, s13
119- ; GFX942-NEXT: v_sub_co_u32_e32 v7, vcc, s6, v3
120- ; GFX942-NEXT: s_nop 1
121- ; GFX942-NEXT: v_subb_co_u32_e64 v2, s[8:9], v2, v4, vcc
122- ; GFX942-NEXT: v_subrev_co_u32_e64 v3, s[8:9], s12, v7
123- ; GFX942-NEXT: s_nop 1
124- ; GFX942-NEXT: v_subbrev_co_u32_e64 v2, s[8:9], 0, v2, s[8:9]
125- ; GFX942-NEXT: v_cmp_le_u32_e64 s[8:9], s13, v2
126- ; GFX942-NEXT: s_nop 1
127- ; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[8:9]
128- ; GFX942-NEXT: v_cmp_le_u32_e64 s[8:9], s12, v3
129- ; GFX942-NEXT: s_nop 1
130- ; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[8:9]
131- ; GFX942-NEXT: v_cmp_eq_u32_e64 s[8:9], s13, v2
132- ; GFX942-NEXT: s_nop 1
133- ; GFX942-NEXT: v_cndmask_b32_e64 v8, v4, v3, s[8:9]
134- ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
135- ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 2
136- ; GFX942-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v8
137- ; GFX942-NEXT: s_nop 1
138- ; GFX942-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[8:9]
139- ; GFX942-NEXT: v_mov_b32_e32 v4, s7
140- ; GFX942-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v6, vcc
141- ; GFX942-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
142- ; GFX942-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[8:9]
143- ; GFX942-NEXT: s_nop 0
144- ; GFX942-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
145- ; GFX942-NEXT: v_cmp_le_u32_e32 vcc, s12, v7
146- ; GFX942-NEXT: s_nop 1
147- ; GFX942-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
148- ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4
149- ; GFX942-NEXT: s_nop 1
150- ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
151- ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
152- ; GFX942-NEXT: s_nop 1
153- ; GFX942-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
154- ; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
48+ ; GFX942-NEXT: v_cvt_u32_f32_e32 v1, v1
49+ ; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
50+ ; GFX942-NEXT: v_readfirstlane_b32 s5, v1
51+ ; GFX942-NEXT: v_readfirstlane_b32 s8, v0
52+ ; GFX942-NEXT: s_mul_i32 s9, s1, s5
53+ ; GFX942-NEXT: s_mul_hi_u32 s15, s1, s8
54+ ; GFX942-NEXT: s_mul_i32 s14, s3, s8
55+ ; GFX942-NEXT: s_add_i32 s9, s15, s9
56+ ; GFX942-NEXT: s_add_i32 s9, s9, s14
57+ ; GFX942-NEXT: s_mul_i32 s16, s1, s8
58+ ; GFX942-NEXT: s_mul_hi_u32 s14, s8, s9
59+ ; GFX942-NEXT: s_mul_i32 s15, s8, s9
60+ ; GFX942-NEXT: s_mul_hi_u32 s8, s8, s16
61+ ; GFX942-NEXT: s_add_u32 s8, s8, s15
62+ ; GFX942-NEXT: s_addc_u32 s14, 0, s14
63+ ; GFX942-NEXT: s_mul_hi_u32 s17, s5, s16
64+ ; GFX942-NEXT: s_mul_i32 s16, s5, s16
65+ ; GFX942-NEXT: s_add_u32 s8, s8, s16
66+ ; GFX942-NEXT: s_mul_hi_u32 s15, s5, s9
67+ ; GFX942-NEXT: s_addc_u32 s8, s14, s17
68+ ; GFX942-NEXT: s_addc_u32 s14, s15, 0
69+ ; GFX942-NEXT: s_mul_i32 s9, s5, s9
70+ ; GFX942-NEXT: s_add_u32 s8, s8, s9
71+ ; GFX942-NEXT: s_addc_u32 s9, 0, s14
72+ ; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, s8, v0
73+ ; GFX942-NEXT: s_cmp_lg_u64 vcc, 0
74+ ; GFX942-NEXT: s_addc_u32 s5, s5, s9
75+ ; GFX942-NEXT: v_readfirstlane_b32 s9, v0
76+ ; GFX942-NEXT: s_mul_i32 s8, s1, s5
77+ ; GFX942-NEXT: s_mul_hi_u32 s14, s1, s9
78+ ; GFX942-NEXT: s_add_i32 s8, s14, s8
79+ ; GFX942-NEXT: s_mul_i32 s3, s3, s9
80+ ; GFX942-NEXT: s_add_i32 s8, s8, s3
81+ ; GFX942-NEXT: s_mul_i32 s1, s1, s9
82+ ; GFX942-NEXT: s_mul_hi_u32 s14, s5, s1
83+ ; GFX942-NEXT: s_mul_i32 s15, s5, s1
84+ ; GFX942-NEXT: s_mul_i32 s17, s9, s8
85+ ; GFX942-NEXT: s_mul_hi_u32 s1, s9, s1
86+ ; GFX942-NEXT: s_mul_hi_u32 s16, s9, s8
87+ ; GFX942-NEXT: s_add_u32 s1, s1, s17
88+ ; GFX942-NEXT: s_addc_u32 s9, 0, s16
89+ ; GFX942-NEXT: s_add_u32 s1, s1, s15
90+ ; GFX942-NEXT: s_mul_hi_u32 s3, s5, s8
91+ ; GFX942-NEXT: s_addc_u32 s1, s9, s14
92+ ; GFX942-NEXT: s_addc_u32 s3, s3, 0
93+ ; GFX942-NEXT: s_mul_i32 s8, s5, s8
94+ ; GFX942-NEXT: s_add_u32 s1, s1, s8
95+ ; GFX942-NEXT: s_addc_u32 s3, 0, s3
96+ ; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
97+ ; GFX942-NEXT: s_cmp_lg_u64 vcc, 0
98+ ; GFX942-NEXT: s_addc_u32 s1, s5, s3
99+ ; GFX942-NEXT: v_readfirstlane_b32 s8, v0
100+ ; GFX942-NEXT: s_mul_i32 s5, s6, s1
101+ ; GFX942-NEXT: s_mul_hi_u32 s9, s6, s8
102+ ; GFX942-NEXT: s_mul_hi_u32 s3, s6, s1
103+ ; GFX942-NEXT: s_add_u32 s5, s9, s5
104+ ; GFX942-NEXT: s_addc_u32 s3, 0, s3
105+ ; GFX942-NEXT: s_mul_hi_u32 s14, s7, s8
106+ ; GFX942-NEXT: s_mul_i32 s8, s7, s8
107+ ; GFX942-NEXT: s_add_u32 s5, s5, s8
108+ ; GFX942-NEXT: s_mul_hi_u32 s9, s7, s1
109+ ; GFX942-NEXT: s_addc_u32 s3, s3, s14
110+ ; GFX942-NEXT: s_addc_u32 s5, s9, 0
111+ ; GFX942-NEXT: s_mul_i32 s1, s7, s1
112+ ; GFX942-NEXT: s_add_u32 s1, s3, s1
113+ ; GFX942-NEXT: s_addc_u32 s3, 0, s5
114+ ; GFX942-NEXT: s_mul_i32 s5, s12, s3
115+ ; GFX942-NEXT: s_mul_hi_u32 s8, s12, s1
116+ ; GFX942-NEXT: s_add_i32 s5, s8, s5
117+ ; GFX942-NEXT: s_mul_i32 s8, s13, s1
118+ ; GFX942-NEXT: s_mul_i32 s9, s12, s1
119+ ; GFX942-NEXT: s_add_i32 s5, s5, s8
120+ ; GFX942-NEXT: v_mov_b32_e32 v0, s9
121+ ; GFX942-NEXT: s_sub_i32 s8, s7, s5
122+ ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
123+ ; GFX942-NEXT: s_cmp_lg_u64 vcc, 0
124+ ; GFX942-NEXT: s_subb_u32 s14, s8, s13
125+ ; GFX942-NEXT: v_subrev_co_u32_e64 v1, s[8:9], s12, v0
126+ ; GFX942-NEXT: s_cmp_lg_u64 s[8:9], 0
127+ ; GFX942-NEXT: s_subb_u32 s8, s14, 0
128+ ; GFX942-NEXT: s_cmp_ge_u32 s8, s13
129+ ; GFX942-NEXT: v_readfirstlane_b32 s14, v1
130+ ; GFX942-NEXT: s_cselect_b32 s9, -1, 0
131+ ; GFX942-NEXT: s_cmp_ge_u32 s14, s12
132+ ; GFX942-NEXT: s_cselect_b32 s14, -1, 0
133+ ; GFX942-NEXT: s_cmp_eq_u32 s8, s13
134+ ; GFX942-NEXT: s_cselect_b32 s8, s14, s9
135+ ; GFX942-NEXT: s_add_u32 s9, s1, 1
136+ ; GFX942-NEXT: s_addc_u32 s14, s3, 0
137+ ; GFX942-NEXT: s_add_u32 s15, s1, 2
138+ ; GFX942-NEXT: s_addc_u32 s16, s3, 0
139+ ; GFX942-NEXT: s_cmp_lg_u32 s8, 0
140+ ; GFX942-NEXT: s_cselect_b32 s8, s15, s9
141+ ; GFX942-NEXT: s_cselect_b32 s9, s16, s14
142+ ; GFX942-NEXT: s_cmp_lg_u64 vcc, 0
143+ ; GFX942-NEXT: s_subb_u32 s5, s7, s5
144+ ; GFX942-NEXT: s_cmp_ge_u32 s5, s13
145+ ; GFX942-NEXT: v_readfirstlane_b32 s15, v0
146+ ; GFX942-NEXT: s_cselect_b32 s14, -1, 0
147+ ; GFX942-NEXT: s_cmp_ge_u32 s15, s12
148+ ; GFX942-NEXT: s_cselect_b32 s15, -1, 0
149+ ; GFX942-NEXT: s_cmp_eq_u32 s5, s13
150+ ; GFX942-NEXT: s_cselect_b32 s5, s15, s14
151+ ; GFX942-NEXT: s_cmp_lg_u32 s5, 0
152+ ; GFX942-NEXT: s_cselect_b32 s9, s9, s3
153+ ; GFX942-NEXT: s_cselect_b32 s8, s8, s1
155154; GFX942-NEXT: s_cbranch_execnz .LBB0_3
156155; GFX942-NEXT: .LBB0_2:
157156; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s12
158157; GFX942-NEXT: s_sub_i32 s1, 0, s12
158+ ; GFX942-NEXT: s_mov_b32 s9, 0
159159; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
160160; GFX942-NEXT: s_nop 0
161161; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
162162; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
163- ; GFX942-NEXT: v_mul_lo_u32 v1, s1, v0
164- ; GFX942-NEXT: v_mul_hi_u32 v1, v0, v1
165- ; GFX942-NEXT: v_add_u32_e32 v0, v0, v1
166- ; GFX942-NEXT: v_mul_hi_u32 v0, s6, v0
167- ; GFX942-NEXT: v_mul_lo_u32 v2, v0, s12
168- ; GFX942-NEXT: v_sub_u32_e32 v2, s6, v2
169- ; GFX942-NEXT: v_add_u32_e32 v1, 1, v0
170- ; GFX942-NEXT: v_subrev_u32_e32 v3, s12, v2
171- ; GFX942-NEXT: v_cmp_le_u32_e32 vcc, s12, v2
172- ; GFX942-NEXT: s_nop 1
173- ; GFX942-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
174- ; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
175- ; GFX942-NEXT: v_add_u32_e32 v1, 1, v0
176- ; GFX942-NEXT: v_cmp_le_u32_e32 vcc, s12, v2
177- ; GFX942-NEXT: s_nop 1
178- ; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
179- ; GFX942-NEXT: v_mov_b32_e32 v1, 0
163+ ; GFX942-NEXT: s_nop 0
164+ ; GFX942-NEXT: v_readfirstlane_b32 s3, v0
165+ ; GFX942-NEXT: s_mul_i32 s1, s1, s3
166+ ; GFX942-NEXT: s_mul_hi_u32 s1, s3, s1
167+ ; GFX942-NEXT: s_add_i32 s3, s3, s1
168+ ; GFX942-NEXT: s_mul_hi_u32 s1, s6, s3
169+ ; GFX942-NEXT: s_mul_i32 s5, s1, s12
170+ ; GFX942-NEXT: s_sub_i32 s5, s6, s5
171+ ; GFX942-NEXT: s_add_i32 s3, s1, 1
172+ ; GFX942-NEXT: s_sub_i32 s8, s5, s12
173+ ; GFX942-NEXT: s_cmp_ge_u32 s5, s12
174+ ; GFX942-NEXT: s_cselect_b32 s1, s3, s1
175+ ; GFX942-NEXT: s_cselect_b32 s5, s8, s5
176+ ; GFX942-NEXT: s_add_i32 s3, s1, 1
177+ ; GFX942-NEXT: s_cmp_ge_u32 s5, s12
178+ ; GFX942-NEXT: s_cselect_b32 s8, s3, s1
180179; GFX942-NEXT: .LBB0_3:
181- ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 15
182180; GFX942-NEXT: s_ashr_i32 s1, s0, 31
183- ; GFX942-NEXT: v_and_b32_e32 v0, -16, v0
184- ; GFX942-NEXT: v_mul_lo_u32 v2, v0, s1
185- ; GFX942-NEXT: v_mul_hi_u32 v3, v0, s0
186- ; GFX942-NEXT: v_add_u32_e32 v2, v3, v2
187- ; GFX942-NEXT: v_mul_lo_u32 v1, v1, s0
188- ; GFX942-NEXT: v_add_u32_e32 v1, v2, v1
189- ; GFX942-NEXT: v_mul_lo_u32 v2, v0, s0
190- ; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], v1
181+ ; GFX942-NEXT: s_add_u32 s3, s8, 15
182+ ; GFX942-NEXT: s_addc_u32 s5, s9, 0
183+ ; GFX942-NEXT: s_and_b32 s3, s3, -16
184+ ; GFX942-NEXT: s_mul_i32 s1, s3, s1
185+ ; GFX942-NEXT: s_mul_hi_u32 s8, s3, s0
186+ ; GFX942-NEXT: s_add_i32 s1, s8, s1
187+ ; GFX942-NEXT: s_mul_i32 s5, s5, s0
188+ ; GFX942-NEXT: s_add_i32 s1, s1, s5
189+ ; GFX942-NEXT: s_mul_i32 s3, s3, s0
190+ ; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], s1
191191; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
192- ; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], v2
192+ ; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
193193; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
194194; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s7
195195; GFX942-NEXT: v_ldexp_f64 v[2:3], v[2:3], 32
@@ -215,7 +215,7 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg noundef
215215; GFX942-NEXT: .LBB0_4:
216216; GFX942-NEXT: .Ltmp4:
217217; GFX942-NEXT: ;DEBUG_VALUE: test:var <- [DW_OP_LLVM_poisoned] $sgpr2_sgpr3
218- ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1
218+ ; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9
219219; GFX942-NEXT: s_branch .LBB0_2
220220; GFX942-NEXT: .Ltmp5:
221221entry:
0 commit comments