Skip to content

Commit b034a4f

Browse files
committed
[AMDGPU] Optimize image sample followed by llvm.amdgcn.cvt.pkrtz into d16 variant
1 parent 64fe323 commit b034a4f

File tree

2 files changed

+170
-0
lines changed

2 files changed

+170
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,42 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
247247
ArgTys[0] = User->getType();
248248
});
249249
}
250+
251+
// Fold image.sample -> cvt.pkrtz -> extractelement (index 0) into a single
252+
// d16 image sample.
253+
// Pattern to match:
254+
// %sample = call float @llvm.amdgcn.image.sample...
255+
// %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample,
256+
// float %any)
257+
// %low = extractelement <2 x half> %pack, i64 0
258+
// Replacement:
259+
// call half @llvm.amdgcn.image.sample
260+
//
261+
// Folding criteria:
262+
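// 1. The only user of the image.sample intrinsic is amdgcn.cvt.pkrtz.
//    NOTE(review): the pattern above has the sample as the first operand of
//    cvt.pkrtz, but the check below does not test the operand position -
//    confirm that a sample passed as the second operand cannot be folded.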
263+
// 2. That cvt.pkrtz call has exactly one use.
264+
// 3. Its sole user is an extractelement instruction with index zero.
265+
// Otherwise, folding is not performed, because D16 sampling only
266+
// guarantees that the element at index 0 is defined; index 1 is
267+
// undefined and using it will result in poison.
268+
if (auto *CvtPkrtzCall = dyn_cast<CallInst>(User)) {
269+
if (CvtPkrtzCall->getIntrinsicID() == Intrinsic::amdgcn_cvt_pkrtz &&
270+
CvtPkrtzCall->hasOneUse()) {
271+
// Unique use must be extractelement idx == 0
272+
if (auto *Ext =
273+
dyn_cast<ExtractElementInst>(*CvtPkrtzCall->user_begin())) {
274+
if (isa<ConstantInt>(Ext->getIndexOperand()) &&
275+
cast<ConstantInt>(Ext->getIndexOperand())->isZero()) {
276+
277+
return modifyIntrinsicCall(
278+
II, *CvtPkrtzCall, ImageDimIntr->Intr, IC,
279+
[&](auto &Args, auto &ArgTys) {
280+
ArgTys[0] = CvtPkrtzCall->getType();
281+
});
282+
}
283+
}
284+
}
285+
}
250286
}
251287

252288
// Only perform D16 folding if every user of the image sample is

llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,140 @@ main_body:
239239
ret bfloat %res
240240
}
241241

242+
; Fold case: %sample is used only by cvt.pkrtz (as its first operand) and
; %pack is used only by an extractelement at index 0. The GFX81PLUS checks
; expect a single half-typed image.sample call with the cvt.pkrtz and
; extractelement gone; the GFX7 checks expect the input IR to stay as written.
define amdgpu_ps float @image_sample_2d_single_pkrtz_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
243+
; GFX7-LABEL: @image_sample_2d_single_pkrtz_to_d16(
244+
; GFX7-NEXT: main_body:
245+
; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
246+
; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
247+
; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
248+
; GFX7-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
249+
; GFX7-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
250+
; GFX7-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
251+
; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
252+
; GFX7-NEXT: ret float [[RES]]
253+
;
254+
; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_to_d16(
255+
; GFX81PLUS-NEXT: main_body:
256+
; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
257+
; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[SAMPLE]], [[SAMPLE]]
258+
; GFX81PLUS-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[SAMPLE]]
259+
; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[SAMPLE]]
260+
; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
261+
; GFX81PLUS-NEXT: ret float [[RES]]
262+
;
263+
main_body:
264+
%sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
265+
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
266+
%h0 = extractelement <2 x half> %pack, i64 0
267+
%mul = fmul reassoc arcp contract afn half %h0, %h0
268+
%div = fdiv reassoc arcp contract afn half %mul, %h0
269+
%add = fadd reassoc arcp contract afn half %div, %h0
270+
%res = fpext half %add to float
271+
ret float %res
272+
}
273+
274+
; No-fold case: %pack has two users (extractelement at index 0 and at
; index 1) and its second operand is the variable %v. Both check prefixes
; expect the f32 image.sample, the cvt.pkrtz and both extracts to remain.
define amdgpu_ps float @image_sample_2d_pkrtz_variable_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
275+
; GFX7-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
276+
; GFX7-NEXT: main_body:
277+
; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
278+
; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
279+
; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
280+
; GFX7-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
281+
; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
282+
; GFX7-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
283+
; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
284+
; GFX7-NEXT: ret float [[RES]]
285+
;
286+
; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
287+
; GFX81PLUS-NEXT: main_body:
288+
; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
289+
; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
290+
; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
291+
; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
292+
; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
293+
; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
294+
; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
295+
; GFX81PLUS-NEXT: ret float [[RES]]
296+
;
297+
main_body:
298+
%sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
299+
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float %v)
300+
%h0 = extractelement <2 x half> %pack, i64 0
301+
%h1 = extractelement <2 x half> %pack, i64 1
302+
%mul = fmul half %h0, %h1
303+
%add = fadd half %mul, %h0
304+
%res = fpext half %add to float
305+
ret float %res
306+
}
307+
308+
; No-fold case: the second cvt.pkrtz operand is the constant 0.0, but %pack
; still has two users (extractelement at index 0 and at index 1). Both check
; prefixes expect the f32 image.sample, the cvt.pkrtz and both extracts to
; remain.
define amdgpu_ps float @image_sample_2d_pkrtz_constant_no_fold(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
309+
; GFX7-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
310+
; GFX7-NEXT: main_body:
311+
; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
312+
; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
313+
; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
314+
; GFX7-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
315+
; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
316+
; GFX7-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
317+
; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
318+
; GFX7-NEXT: ret float [[RES]]
319+
;
320+
; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
321+
; GFX81PLUS-NEXT: main_body:
322+
; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
323+
; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
324+
; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
325+
; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
326+
; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
327+
; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
328+
; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
329+
; GFX81PLUS-NEXT: ret float [[RES]]
330+
;
331+
main_body:
332+
%sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
333+
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
334+
%h0 = extractelement <2 x half> %pack, i64 0
335+
%h1 = extractelement <2 x half> %pack, i64 1
336+
%mul = fmul half %h0, %h1
337+
%add = fadd half %mul, %h0
338+
%res = fpext half %add to float
339+
ret float %res
340+
}
341+
342+
; No-fold case: %sample is the second operand of cvt.pkrtz and the single
; extractelement reads index 1 rather than index 0. Both check prefixes
; expect the f32 image.sample, the cvt.pkrtz and the extract to remain.
define amdgpu_ps float @image_sample_2d_single_pkrtz_high_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
343+
; GFX7-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
344+
; GFX7-NEXT: main_body:
345+
; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
346+
; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
347+
; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 1
348+
; GFX7-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
349+
; GFX7-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
350+
; GFX7-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
351+
; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
352+
; GFX7-NEXT: ret float [[RES]]
353+
;
354+
; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
355+
; GFX81PLUS-NEXT: main_body:
356+
; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
357+
; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
358+
; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 1
359+
; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
360+
; GFX81PLUS-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
361+
; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
362+
; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
363+
; GFX81PLUS-NEXT: ret float [[RES]]
364+
;
365+
main_body:
366+
%sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
367+
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %sample)
368+
%h0 = extractelement <2 x half> %pack, i64 1
369+
%mul = fmul reassoc arcp contract afn half %h0, %h0
370+
%div = fdiv reassoc arcp contract afn half %mul, %h0
371+
%add = fadd reassoc arcp contract afn half %div, %h0
372+
%res = fpext half %add to float
373+
ret float %res
374+
}
375+
242376
define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
243377
; GFX7-LABEL: @image_gather4_2d_v4f32(
244378
; GFX7-NEXT: main_body:

0 commit comments

Comments
 (0)