Skip to content

Commit fe4f6c1

Browse files
authored
[RISCV] Cost bf16/f16 vector non-unit memory accesses as legal without zvfhmin/zvfbfmin (#150882)
When vectorizing with predication, some loops that were previously vectorized without zvfhmin/zvfbfmin will no longer be vectorized, because the masked load/store or gather/scatter is costed as illegal. This is due to a discrepancy: for these costs we check isLegalElementTypeForRVV, but for regular memory accesses we don't. However, for bf16 and f16 vectors we don't actually need the extension support for loads and stores, so this adds a new function, isLegalLoadStoreElementTypeForRVV, which takes this into account. For regular memory accesses we should probably also return an invalid cost in cases such as i64 elements on zve32x, but it doesn't look like we have tests for this yet. We also should probably not be vectorizing these bf16/f16 loops to begin with if we don't have zvfhmin/zvfbfmin and zfhmin/zfbfmin; I think this happens because the scalar costs are too cheap. I've added tests for this in a100f63, to be fixed in another patch.
1 parent 4b1d5b8 commit fe4f6c1

File tree

8 files changed

+266
-8
lines changed

8 files changed

+266
-8
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
27392739
}
27402740
}
27412741

2742+
bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV(
2743+
EVT ScalarTy) const {
2744+
if (!ScalarTy.isSimple())
2745+
return false;
2746+
switch (ScalarTy.getSimpleVT().SimpleTy) {
2747+
case MVT::iPTR:
2748+
return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
2749+
case MVT::i8:
2750+
case MVT::i16:
2751+
case MVT::i32:
2752+
case MVT::f16:
2753+
case MVT::bf16:
2754+
case MVT::f32:
2755+
return true;
2756+
case MVT::i64:
2757+
case MVT::f64:
2758+
return Subtarget.hasVInstructionsI64();
2759+
default:
2760+
return false;
2761+
}
2762+
}
27422763

27432764
unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const {
27442765
return NumRepeatedDivisors;
@@ -24239,7 +24260,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
2423924260
return false;
2424024261

2424124262
EVT ScalarType = DataType.getScalarType();
24242-
if (!isLegalElementTypeForRVV(ScalarType))
24263+
if (!isLegalLoadStoreElementTypeForRVV(ScalarType))
2424324264
return false;
2424424265

2424524266
if (!Subtarget.enableUnalignedVectorMem() &&

llvm/lib/Target/RISCV/RISCVISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ class RISCVTargetLowering : public TargetLowering {
384384
bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override;
385385

386386
bool isLegalElementTypeForRVV(EVT ScalarTy) const;
387+
bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const;
387388

388389
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
389390

llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
3232
if (!isTypeLegal(VT))
3333
return false;
3434

35-
if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
35+
if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) ||
3636
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
3737
Alignment))
3838
return false;

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
265265
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
266266
return false;
267267

268-
return TLI->isLegalElementTypeForRVV(ElemType);
268+
return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
269269
}
270270

271271
bool isLegalMaskedLoad(Type *DataType, Align Alignment,
@@ -297,7 +297,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
297297
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
298298
return false;
299299

300-
return TLI->isLegalElementTypeForRVV(ElemType);
300+
return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
301301
}
302302

303303
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override {

llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ define void @fixed() {
1313
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
1414
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
1515
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
16-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
17-
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
18-
; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
1919
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
2020
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
2121
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
2222
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
23-
; CHECK-NEXT: Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
23+
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
2424
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2525
;
2626
entry:

llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,3 +874,79 @@ define void @load_factor2_fp128(ptr %ptr) {
874874
%v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
875875
ret void
876876
}
877+
878+
define void @load_factor2_f32(ptr %ptr) {
879+
; RV32-LABEL: @load_factor2_f32(
880+
; RV32-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8)
881+
; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1
882+
; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0
883+
; RV32-NEXT: ret void
884+
;
885+
; RV64-LABEL: @load_factor2_f32(
886+
; RV64-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8)
887+
; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1
888+
; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0
889+
; RV64-NEXT: ret void
890+
;
891+
%interleaved.vec = load <16 x float>, ptr %ptr
892+
%v0 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
893+
%v1 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
894+
ret void
895+
}
896+
897+
define void @load_factor2_f64(ptr %ptr) {
898+
; RV32-LABEL: @load_factor2_f64(
899+
; RV32-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8)
900+
; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
901+
; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
902+
; RV32-NEXT: ret void
903+
;
904+
; RV64-LABEL: @load_factor2_f64(
905+
; RV64-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8)
906+
; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
907+
; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
908+
; RV64-NEXT: ret void
909+
;
910+
%interleaved.vec = load <16 x double>, ptr %ptr
911+
%v0 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
912+
%v1 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
913+
ret void
914+
}
915+
916+
define void @load_factor2_bf16(ptr %ptr) {
917+
; RV32-LABEL: @load_factor2_bf16(
918+
; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32
919+
; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
920+
; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
921+
; RV32-NEXT: ret void
922+
;
923+
; RV64-LABEL: @load_factor2_bf16(
924+
; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32
925+
; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
926+
; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
927+
; RV64-NEXT: ret void
928+
;
929+
%interleaved.vec = load <16 x bfloat>, ptr %ptr
930+
%v0 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
931+
%v1 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
932+
ret void
933+
}
934+
935+
define void @load_factor2_f16(ptr %ptr) {
936+
; RV32-LABEL: @load_factor2_f16(
937+
; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32
938+
; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
939+
; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
940+
; RV32-NEXT: ret void
941+
;
942+
; RV64-LABEL: @load_factor2_f16(
943+
; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32
944+
; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
945+
; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
946+
; RV64-NEXT: ret void
947+
;
948+
%interleaved.vec = load <16 x half>, ptr %ptr
949+
%v0 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
950+
%v1 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
951+
ret void
952+
}

0 commit comments

Comments
 (0)