Skip to content

Commit eac6d6e

Browse files
committed
AMDGPU: Custom lower vector fptrunc of f32 -> f16
GFX950+ supports v_cvt_pk_f16_f32. However, the current implementation of vector fptrunc lowering fully scalarizes the vector, and the scalar conversions may not always be combined to generate the packed one. We made v2f32 -> v2f16 legal in #139956. This work is an extension to handle wider vectors. Instead of full scalarization, we split the vector into packs (v2f32 -> v2f16) to ensure the packed conversion can always be generated. NOTE: Use .clampMaxNumElements(0, S16, 2)
1 parent 833b904 commit eac6d6e

File tree

2 files changed

+10
-29
lines changed

2 files changed

+10
-29
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -223,13 +223,6 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
223223
};
224224
}
225225

226-
static LegalityPredicate numElementsPowerOf2(unsigned TypeIdx) {
227-
return [=](const LegalityQuery &Query) {
228-
const LLT QueryTy = Query.Types[TypeIdx];
229-
return QueryTy.isVector() && isPowerOf2_32(QueryTy.getNumElements());
230-
};
231-
}
232-
233226
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
234227
return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
235228
Size <= MaxRegisterSize;
@@ -1070,9 +1063,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
10701063
auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
10711064
if (ST.hasCvtPkF16F32Inst()) {
10721065
FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1073-
.fewerElementsIf(all(elementTypeIs(0, S16), vectorWiderThan(0, 32),
1074-
numElementsPowerOf2(0), elementTypeIs(1, S32)),
1075-
changeTo(0, V2S16));
1066+
.clampMaxNumElements(0, S16, 2);
10761067
} else {
10771068
FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
10781069
}

llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,13 @@ define <2 x half> @v_test_cvt_v2f32_v2f16(<2 x float> %src) {
1313
}
1414

1515
define <3 x half> @v_test_cvt_v3f32_v3f16(<3 x float> %src) {
16-
; GFX950-SDAG-LABEL: v_test_cvt_v3f32_v3f16:
17-
; GFX950-SDAG: ; %bb.0:
18-
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19-
; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
20-
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
21-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v2
22-
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
23-
;
24-
; GFX950-GISEL-LABEL: v_test_cvt_v3f32_v3f16:
25-
; GFX950-GISEL: ; %bb.0:
26-
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27-
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
28-
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v1
29-
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
30-
; GFX950-GISEL-NEXT: v_pack_b32_f16 v0, v0, v3
31-
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
16+
; GFX950-LABEL: v_test_cvt_v3f32_v3f16:
17+
; GFX950: ; %bb.0:
18+
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19+
; GFX950-NEXT: v_cvt_f16_f32_e32 v2, v2
20+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
21+
; GFX950-NEXT: v_mov_b32_e32 v1, v2
22+
; GFX950-NEXT: s_setpc_b64 s[30:31]
3223
%res = fptrunc <3 x float> %src to <3 x half>
3324
ret <3 x half> %res
3425
}
@@ -102,10 +93,9 @@ define half @fptrunc_v3f32_v3f16_extract_uses(<3 x float> %vec_float) {
10293
; GFX950-GISEL-LABEL: fptrunc_v3f32_v3f16_extract_uses:
10394
; GFX950-GISEL: ; %bb.0:
10495
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105-
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
106-
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
10796
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
108-
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
97+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
98+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10999
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v2, v0
110100
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
111101
%vec_half = fptrunc <3 x float> %vec_float to <3 x half>

0 commit comments

Comments
 (0)