@@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
   llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
 }
 
-static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
-  for (const auto &CI : IA->ParseConstraints()) {
+/// Compute the minimum number of AGPRs required to allocate the inline asm.
+static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
+                                             const CallBase &Call) {
+  unsigned ArgNo = 0;
+  unsigned ResNo = 0;
+  unsigned AGPRDefCount = 0;
+  unsigned AGPRUseCount = 0;
+  unsigned MaxPhysReg = 0;
+  const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
+
+  // TODO: Overestimates due to not accounting for tied operands
+  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+    Type *Ty = nullptr;
+    switch (CI.Type) {
+    case InlineAsm::isOutput: {
+      Ty = Call.getType();
+      if (auto *STy = dyn_cast<StructType>(Ty))
+        Ty = STy->getElementType(ResNo);
+      ++ResNo;
+      break;
+    }
+    case InlineAsm::isInput: {
+      Ty = Call.getArgOperand(ArgNo++)->getType();
+      break;
+    }
+    case InlineAsm::isLabel:
+      continue;
+    case InlineAsm::isClobber:
+      // Parse the physical register reference.
+      break;
+    }
+
     for (StringRef Code : CI.Codes) {
-      Code.consume_front("{");
-      if (Code.starts_with("a"))
-        return true;
+      unsigned RegCount = 0;
+      if (Code.starts_with("a")) {
+        // Virtual register, compute number of registers based on the type.
+        //
+        // We ought to be going through TargetLowering to get the number of
+        // registers, but we should avoid the dependence on CodeGen here.
+        RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
+      } else {
+        // Physical register reference
+        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
+        if (Kind == 'a') {
+          RegCount = NumRegs;
+          MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+        }
+
+        continue;
+      }
+
+      if (CI.Type == InlineAsm::isOutput) {
+        // Apply tuple alignment requirement
+        //
+        // TODO: This is more conservative than necessary.
+        AGPRDefCount = alignTo(AGPRDefCount, RegCount);
+
+        AGPRDefCount += RegCount;
+        if (CI.isEarlyClobber) {
+          AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+          AGPRUseCount += RegCount;
+        }
+      } else {
+        AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+        AGPRUseCount += RegCount;
+      }
     }
   }
 
-  return false;
+  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
+
+  // TODO: This is overly conservative. If there are any physical registers,
+  // allocate any virtual registers after them so we don't have to solve optimal
+  // packing.
+  return std::min(MaxVirtReg + MaxPhysReg, 256u);
 }
 
 // TODO: Migrate to range merge of amdgpu-agpr-alloc.
@@ -1259,7 +1324,7 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
       const Function *Callee = dyn_cast<Function>(CalleeOp);
       if (!Callee) {
        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
-          return !inlineAsmUsesAGPRs(IA);
+          return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
         return false;
       }
 
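As a reference for the arithmetic in the first hunk, here is a minimal standalone sketch (not part of the patch) of how the counting behaves on a hypothetical inline asm with one 128-bit "a" output, one 96-bit "a" input, and a clobber of the physical range a[4:5]. The divideCeil and alignTo helpers below are local stand-ins for the LLVM utilities of the same names, and all constraint sizes and register indices are invented for illustration.

// Standalone sketch of the register-counting arithmetic above, with local
// stand-ins for llvm::divideCeil and llvm::alignTo. The constraint values
// below are hypothetical and only illustrate the math.
#include <algorithm>
#include <cstdio>

static unsigned divideCeil(unsigned Num, unsigned Den) {
  return (Num + Den - 1) / Den; // round up, e.g. 96 bits -> 3 AGPRs
}

static unsigned alignTo(unsigned Value, unsigned Align) {
  return divideCeil(Value, Align) * Align; // round up to a tuple boundary
}

int main() {
  unsigned AGPRDefCount = 0, AGPRUseCount = 0, MaxPhysReg = 0;

  // Hypothetical "=a" output of a 128-bit value: 4 AGPRs, aligned to a
  // 4-register tuple boundary before being counted.
  unsigned OutRegs = divideCeil(128, 32);
  AGPRDefCount = alignTo(AGPRDefCount, OutRegs) + OutRegs; // 4

  // Hypothetical "a" input of a 96-bit value: 3 AGPRs.
  unsigned InRegs = divideCeil(96, 32);
  AGPRUseCount = alignTo(AGPRUseCount, InRegs) + InRegs; // 3

  // Hypothetical clobber of the physical range a[4:5]: RegIdx = 4, NumRegs = 2.
  MaxPhysReg = std::max(MaxPhysReg, std::min(4u + 2u, 256u)); // 6

  // Defs and uses may share registers, so take the max of the two counts,
  // then place virtual registers after the highest physical register,
  // capped at the 256 available AGPRs.
  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount); // 4
  printf("required AGPRs: %u\n", std::min(MaxVirtReg + MaxPhysReg, 256u)); // 10
  return 0;
}

Taking the maximum of the def and use counts reflects that inputs and outputs can be assigned overlapping registers, while adding MaxPhysReg conservatively places all virtual registers after the highest referenced physical AGPR rather than solving an optimal packing, matching the TODO in the patch.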