@@ -166,12 +166,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
                                   CostKind);
 }
 
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  if (ISD != ISD::LOAD) {
-    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
-                                  CostKind);
-  }
-
   EVT VT = TLI->getValueType(DL, Ty, true);
   // Type legalization can't handle structs
   if (VT == MVT::Other)
@@ -182,22 +176,121 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
-  // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
-  // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
-  // are twice as expensive as scalar.
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
   unsigned width = VT.getSizeInBits();
-  switch (width) {
-  default:
-    break;
-  case 32:
-  case 64:
-  case 128:
-    return 2;
+  if (ISD == ISD::LOAD) {
+    // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
+    // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
+    // loads are twice as expensive as scalar.
+    switch (width) {
+    default:
+      break;
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
+  } else if (ISD == ISD::STORE) {
+    // For stores, we can use store lane operations.
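+    // e.g. a 64-bit <2 x i32> store can use a single v128.store64_lane.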
+    switch (width) {
+    default:
+      break;
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
   }
 
   return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
 }
 
+InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+    bool UseMaskForCond, bool UseMaskForGaps) const {
+  assert(Factor >= 2 && "Invalid interleave factor");
+
+  auto *VecTy = cast<VectorType>(Ty);
+  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
+    return InstructionCost::getInvalid();
+  }
+
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
+                                             Alignment, AddressSpace, CostKind,
+                                             UseMaskForCond, UseMaskForGaps);
+
+  constexpr unsigned MaxInterleaveFactor = 4;
+  if (Factor <= MaxInterleaveFactor) {
+    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
+    // Require at least two elements and an element count divisible by Factor.
+    if (MinElts < 2 || MinElts % Factor != 0)
+      return InstructionCost::getInvalid();
+
+    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+    // Ensure the element size is a legal SIMD lane size (8/16/32/64 bits).
+    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+      return InstructionCost::getInvalid();
+
+    auto *SubVecTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().divideCoefficientBy(Factor));
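+    // e.g. a factor of 2 on <8 x i16> yields a <4 x i16> sub-vector type.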
+    InstructionCost MemCost =
+        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
+
+    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
+    unsigned MaxVecSize = 128;
+    unsigned NumAccesses =
+        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
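+    // e.g. an <8 x i32> access with factor 2 is two 128-bit v4i32 accesses.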
+
+    // A stride of two is commonly supported via dedicated instructions, so it
+    // should be relatively cheap for all element sizes. A stride of four is
+    // more expensive as it will likely require more shuffles. Using two
+    // simd128 inputs is considered more expensive and we mainly account for
+    // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
+    // arithmetic kernels.
+    static const CostTblEntry ShuffleCostTbl[] = {
+        // One reg.
+        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
+        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
+        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
+        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
+        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
+        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
+
+        // Two regs.
+        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
+        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
+        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
+
+        // One reg.
+        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
+        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
+        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
+
+        // Two regs.
+        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
+        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
+        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+
+        // Four regs.
+        {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
+    };
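+    // e.g. a factor-2 <8 x i32> access: table cost 2 plus 2 x MemCost below.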
+
+    EVT ETy = TLI->getValueType(DL, SubVecTy);
+    if (const auto *Entry =
+            CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
+      return Entry->Cost + (NumAccesses * MemCost);
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace, CostKind,
+                                           UseMaskForCond, UseMaskForGaps);
+}
+
 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     const Value *Op0, const Value *Op1) const {