Skip to content

Commit 181e901

Browse files
authored
Function to estimate stack size (using existing tooling) (#1121)
This will replace #1116 -- instead of string matching the assembly file, it uses llvm's `llvm-readelf` after adding `--stack-size-section` to the llc call. Please read the paper trail of PRs I've left here for more information.
1 parent fdf8b58 commit 181e901

File tree

3 files changed

+300
-23
lines changed

3 files changed

+300
-23
lines changed

compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp

Lines changed: 200 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "XCLBinGen.h"
88

9+
#include <charconv>
910
#include <filesystem>
1011
#include <fstream>
1112
#include <functional>
@@ -77,6 +78,107 @@ FailureOr<std::vector<std::string>> flagStringToVector(
7778
std::istream_iterator<std::string>{}};
7879
}
7980

81+
// Parse the base-10 integer at the front of `intString`.
//
// Leading whitespace is skipped, and any characters that follow the parsed
// digits are ignored (so "3_5" yields 3). Returns std::nullopt when the string
// is empty or all whitespace, when no integer starts at the first
// non-whitespace character, or when the value does not fit in an `int`.
std::optional<int> safeStoi(std::string_view intString) {
  const size_t firstNonSpace = intString.find_first_not_of(" \t\n\r\f\v");
  if (firstNonSpace == std::string_view::npos) return std::nullopt;

  const char *begin = intString.data() + firstNonSpace;
  const char *end = intString.data() + intString.size();

  int parsed = 0;
  const std::from_chars_result result = std::from_chars(begin, end, parsed);
  if (result.ec != std::errc()) return std::nullopt;
  return parsed;
}
92+
93+
// We assume that input string is of the form:
94+
//
95+
// ```
96+
// Stack Sizes:
97+
// Size Functions
98+
// 32 some_func
99+
// 64 some_other_func
100+
// 288 core_3_5
101+
// 288 core_2_5
102+
// 288 core_1_5
103+
// 288 core_0_5
104+
// 288 core_3_4
105+
// 288 core_3_3
106+
// 288 core_2_3
107+
// 288 core_3_2
108+
// 288 core_1_2
109+
// 288 core_0_2
110+
// ```
111+
//
112+
// In terms of how we estimate stack sizes, we assume that function call
113+
// structure is as follows: functions with names core_0_0, core_0_1, core_0_2,
114+
// et cetera are the entry point functions. These functions call into the
115+
// other functions like some_func and some_other_func, but never in a
116+
// nested manner. With these assumptions, an upper bound on the total stack size
117+
// of a core is the maximum sum of it's stack size, and another function's stack
118+
// size.
119+
FailureOr<llvm::DenseMap<std::pair<uint32_t, uint32_t>, uint32_t>>
120+
getUpperBoundStackSizes(const std::string &readElfOutput) {
121+
llvm::DenseMap<std::pair<uint32_t, uint32_t>, uint32_t> coreStackSizes;
122+
123+
// Split input on whitespace. For the example above, tokens becomes
124+
// ['Functions', '32', 'some_func', '64', 'some_other', 288, 'core_3_5', ...]
125+
SmallVector<std::string> tokens;
126+
size_t index0 = readElfOutput.find("Functions");
127+
std::istringstream stackSizesStream(readElfOutput.substr(index0));
128+
std::copy(std::istream_iterator<std::string>(stackSizesStream),
129+
std::istream_iterator<std::string>(), std::back_inserter(tokens));
130+
131+
uint32_t maxNonCoreStackSize = 0;
132+
for (uint32_t i = 1; i < tokens.size(); i += 2) {
133+
std::string_view stackSizeStr = tokens[i];
134+
std::string_view functionName = tokens[i + 1];
135+
136+
std::optional<int> maybeSize = safeStoi(stackSizeStr);
137+
if (!maybeSize) {
138+
llvm::errs() << "Failed to convert stack size (" << stackSizeStr
139+
<< ") to integer.\n";
140+
return failure();
141+
}
142+
uint32_t size = maybeSize.value();
143+
size_t coreIndex = functionName.find("core_");
144+
145+
// If the function is not a core function, in the example above either
146+
// 'some_func' or 'some_other_func', then we track the maximum stack size
147+
// for these.
148+
if (coreIndex == std::string::npos) {
149+
maxNonCoreStackSize = std::max<uint32_t>(maxNonCoreStackSize, size);
150+
continue;
151+
}
152+
153+
// The case where the function is a core function.
154+
size_t colIndex = functionName.find("_", coreIndex) + 1;
155+
std::optional<int> col = safeStoi(functionName.substr(colIndex));
156+
if (!col.has_value()) {
157+
llvm::errs() << "Failed to extract column from " << functionName << "\n";
158+
return failure();
159+
}
160+
161+
size_t rowIndex = functionName.find("_", colIndex) + 1;
162+
std::optional<int> row = safeStoi(functionName.substr(rowIndex));
163+
if (!row.has_value()) {
164+
llvm::errs() << "Failed to extract row from " << functionName << "\n";
165+
return failure();
166+
}
167+
168+
coreStackSizes.insert({{col.value(), row.value()}, size});
169+
}
170+
171+
// Add the maximum non-core stack size to all core stack sizes. The
172+
// logic here is that each core calls into all the non-core functions
173+
// (without nesting calls), and so the maximum stack for the core is
174+
// the maximum non-core stack size plus the core stack.
175+
for (auto &[_, size] : coreStackSizes) {
176+
size += maxNonCoreStackSize;
177+
}
178+
179+
return coreStackSizes;
180+
}
181+
80182
// Peano's `opt` program optimizes llvm-ir (.ll files). We run it with a system
81183
// call. This functions constructs the flags to pass to `opt`. There are some
82184
// default flags, most of which are copied from llvm-aie. See
@@ -400,11 +502,11 @@ bool hasEnding(std::string const &fullString, std::string const &ending) {
400502
}
401503

402504
LogicalResult runTool(
403-
const std::string &program_, const std::vector<std::string> &args,
404-
bool verbose, std::optional<std::vector<std::string>> env = std::nullopt) {
405-
std::string program = program_;
505+
std::string program, ArrayRef<std::string> args, bool verbose,
506+
std::optional<std::vector<std::string>> env = std::nullopt,
507+
std::optional<std::string> userProvidedLogFilename = std::nullopt) {
406508
#if defined(_WIN32)
407-
if (!hasEnding(program_, ".exe")) program = program_ + ".exe";
509+
if (!hasEnding(program, ".exe")) program = program + ".exe";
408510
#endif // _WIN32
409511
if (verbose) {
410512
llvm::outs() << "\nRun: ";
@@ -421,16 +523,23 @@ LogicalResult runTool(
421523
return failure();
422524
}
423525

424-
// Run the program, piping any output to a temporary file (we only want to
425-
// print to terminal if verbose is true).
526+
// Run the program, piping any output to a file.
426527
SmallVector<StringRef, 8> pArgs = {program};
427528
pArgs.append(args.begin(), args.end());
428-
SmallVector<char> temporaryPath;
429-
{
529+
SmallVector<char> logPath;
530+
if (userProvidedLogFilename.has_value()) {
531+
std::string lfn = userProvidedLogFilename.value();
532+
logPath.append(lfn.begin(), lfn.end());
533+
if (!std::filesystem::exists(lfn)) {
534+
std::ofstream ofs(lfn);
535+
ofs.close();
536+
}
537+
538+
} else {
430539
std::string prefix{"tmpRunTool"};
431540
std::string suffix{"Logging"};
432541
auto errorCode =
433-
llvm::sys::fs::createTemporaryFile(prefix, suffix, temporaryPath);
542+
llvm::sys::fs::createTemporaryFile(prefix, suffix, logPath);
434543
if (errorCode) {
435544
llvm::errs() << "Failed to create temporary file: " << errorCode.message()
436545
<< "\n";
@@ -444,12 +553,11 @@ LogicalResult runTool(
444553
// Explicit type but this never actually constructs an ArrayRef
445554
std::optional<ArrayRef<StringRef>> envSmallVec = std::nullopt;
446555
#else
447-
std::string temporaryPathStr =
448-
std::string(temporaryPath.begin(), temporaryPath.size());
449-
StringRef temporaryPathRef(temporaryPathStr);
556+
std::string logPathStr = std::string(logPath.begin(), logPath.size());
557+
StringRef logPathRef(logPathStr);
450558
llvm::SmallVector<llvm::StringRef> envSmallVec;
451559
if (env) envSmallVec.append(env->begin(), env->end());
452-
auto tp = std::optional<StringRef>(temporaryPathRef);
560+
auto tp = std::optional<StringRef>(logPathRef);
453561
redirects = {tp, tp, tp};
454562
#endif
455563

@@ -464,7 +572,7 @@ LogicalResult runTool(
464572

465573
#ifndef _WIN32
466574
auto maybeOutputFromFile = [&]() -> std::optional<std::string> {
467-
std::ifstream t(temporaryPathRef.str());
575+
std::ifstream t(logPathRef.str());
468576
std::stringstream buffer;
469577
if (t.is_open() && t.good()) {
470578
buffer << t.rdbuf();
@@ -474,7 +582,7 @@ LogicalResult runTool(
474582
}();
475583

476584
if (!maybeOutputFromFile) {
477-
llvm::errs() << "Failed to open temporary file " << temporaryPathRef.str()
585+
llvm::errs() << "Failed to open temporary file " << logPathRef.str()
478586
<< "\n";
479587
}
480588
const std::string &outputFromFile = maybeOutputFromFile.value();
@@ -501,7 +609,6 @@ LogicalResult runTool(
501609
#endif
502610
return failure();
503611
}
504-
505612
return success();
506613
}
507614

@@ -746,7 +853,14 @@ LogicalResult generateCoreElfFiles(AIE::DeviceOp deviceOp,
746853
}
747854
flags.emplace_back("--target=" + targetLower + "-none-unknown-elf");
748855
flags.emplace_back("-Wl,--gc-sections");
749-
flags.emplace_back("-Wl,--orphan-handling=error");
856+
857+
// Decision to use 'warn' for orphan sections: currently if the preceding
858+
// call to llc has the flag --stack-size-section, an orphan section
859+
// is created containing the stack sizes. The linker needs to know how to
860+
// handle this: options are 'place' or 'warn' or 'error'. 'place' would
861+
// result in larger binaries. The flag '--exclude-section' should work
862+
// but doesn't appear to be supported with peano.
863+
flags.emplace_back("-Wl,--orphan-handling=warn");
750864
flags.emplace_back("-Wl,-T," + ldscriptPath.string());
751865
flags.emplace_back("-o");
752866
flags.emplace_back(elfFile.string());
@@ -1078,6 +1192,55 @@ void addLowerToLLVMPasses(OpPassManager &pm) {
10781192
pm.addPass(createCSEPass());
10791193
}
10801194

1195+
// Check that every core in `deviceOp` has been assigned a stack at least as
// large as the upper bound inferred from the object file `outputFile`.
//
// Runs the tool at `peanoReadElfBin` with `--stack-sizes` on `outputFile`,
// logging its output to `<outputFile>.stacksizes`, then parses that file with
// getUpperBoundStackSizes. Returns failure if the tool fails, the log file
// cannot be read or parsed, a core has no entry in the parsed report, or a
// core's assigned stack size is smaller than its upper bound.
LogicalResult checkStackSize(const std::string &outputFile, bool verbose,
                             Path peanoReadElfBin, AIE::DeviceOp deviceOp) {
  std::string stackSizesFile = outputFile + ".stacksizes";
  std::vector<std::string> args{outputFile, "--stack-sizes"};
  if (failed(runTool(peanoReadElfBin.string(), args, verbose, std::nullopt,
                     stackSizesFile))) {
    llvm::errs() << "Failed to get stack sizes with peano\n";
    return failure();
  }

  // Read the contents of the file stackSizesFile. Fail explicitly if it
  // cannot be opened rather than parsing an empty string.
  std::ifstream stackSizesFileStream(stackSizesFile);
  if (!stackSizesFileStream.is_open()) {
    llvm::errs() << "Failed to open stack sizes file " << stackSizesFile
                 << "\n";
    return failure();
  }
  std::stringstream stackSizesBuffer;
  stackSizesBuffer << stackSizesFileStream.rdbuf();
  std::string stackSizes = stackSizesBuffer.str();
  FailureOr<llvm::DenseMap<std::pair<uint32_t, uint32_t>, uint32_t>>
      maybeUpperBounds =
          mlir::iree_compiler::AMDAIE::detail::getUpperBoundStackSizes(
              stackSizes);
  if (failed(maybeUpperBounds)) {
    llvm::errs() << "Failed to get upper bounds of stack sizes\n";
    return failure();
  }
  llvm::DenseMap<std::pair<uint32_t, uint32_t>, uint32_t> upperBounds =
      std::move(maybeUpperBounds.value());

  SmallVector<AIE::CoreOp> coreOps;
  deviceOp->walk([&](AIE::CoreOp coreOp) { coreOps.push_back(coreOp); });
  for (auto coreOp : coreOps) {
    int col = coreOp.getTileOp().getCol();
    int row = coreOp.getTileOp().getRow();
    auto iter = upperBounds.find({col, row});
    if (iter == upperBounds.end()) {
      llvm::errs() << "The stack size for core (" << col << ", " << row
                   << ") has no upper bound.\n";
      return failure();
    }
    auto stackSize = coreOp.getStackSize();
    if (stackSize < iter->second) {
      // Note the space after "is": without it the number is fused to the
      // preceding word in the diagnostic.
      llvm::errs() << "An upper bound of the stack size, inferred from "
                      "dumper stack size file, is "
                   << iter->second << " bytes. The assigned stack size is "
                   << stackSize << " bytes, which is insufficient.\n";
      return failure();
    }
  }
  return success();
}
1243+
10811244
LogicalResult generateUnifiedObject(
10821245
MLIRContext *context, AIE::DeviceOp deviceOp, const std::string &outputFile,
10831246
bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope,
@@ -1150,6 +1313,7 @@ LogicalResult generateUnifiedObject(
11501313
}
11511314
Path peanoOptBin = peanoDir / "bin" / "opt";
11521315
Path peanoLLCBin = peanoDir / "bin" / "llc";
1316+
Path peanoReadElfBin = peanoDir / "bin" / "llvm-readelf";
11531317

11541318
std::string OptLLVMIRFile = (tempDir / "input.opt.ll").string();
11551319

@@ -1183,15 +1347,28 @@ LogicalResult generateUnifiedObject(
11831347
return failure();
11841348
}
11851349

1186-
if (failed(runTool(
1187-
peanoLLCBin.string(),
1188-
{OptLLVMIRFile, "-O2", "--march=" + StringRef(targetArch).lower(),
1189-
"--function-sections", "--filetype=obj", "-o",
1190-
std::string(outputFile)},
1191-
verbose))) {
1350+
std::vector<std::string> llcArgs{OptLLVMIRFile,
1351+
"-O2",
1352+
"--march=" + StringRef(targetArch).lower(),
1353+
"--function-sections",
1354+
"--filetype=obj",
1355+
"-o",
1356+
outputFile,
1357+
"--stack-size-section"};
1358+
1359+
if (failed(runTool(peanoLLCBin.string(), llcArgs, verbose))) {
11921360
llvm::errs() << "Failed to assemble ll with peano\n";
11931361
return failure();
11941362
}
1363+
1364+
// If this is not Windows, we can do this check. On Windows, runTool
1365+
// doesn't pipe logging in the way that's needed for this to work.
1366+
#ifndef _WIN32
1367+
if (failed(
1368+
checkStackSize(outputFile, verbose, peanoReadElfBin, deviceOp))) {
1369+
return failure();
1370+
}
1371+
#endif
11951372
}
11961373

11971374
moduleOpCopy->erase();

compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,33 @@ mlir::LogicalResult emitNpuInstructions(xilinx::AIE::DeviceOp deviceOp,
3232
const std::string &outputNPU);
3333

3434
namespace detail {
35+
3536
FailureOr<std::vector<std::string>> flagStringToVector(
3637
const std::string &flags);
38+
3739
FailureOr<std::vector<std::string>> makePeanoOptArgs(
3840
const std::vector<std::string> &additionalPeanoOptFlags);
41+
42+
/// An exception-free version of std::stoi, using C++17's std::from_chars.
43+
std::optional<int> safeStoi(std::string_view intString);
44+
45+
/// Get upper-bounds on the maximum stack sizes for the different cores (col,
46+
/// row) by parsing a string of the form:
47+
///
48+
/// ```
49+
/// Stack Sizes:
50+
/// Size Functions
51+
/// 32 some_func
52+
/// 512 core_1_3
53+
/// 64 some_other_func
54+
/// 288 core_3_5
55+
/// ```
56+
///
57+
/// \return A map from (col, row) to an upper bound on maximum stack size for
58+
/// that core. If the analysis of the string fails, a failure is
59+
/// returned.
60+
FailureOr<llvm::DenseMap<std::pair<uint32_t, uint32_t>, uint32_t>>
61+
getUpperBoundStackSizes(const std::string &);
62+
3963
} // namespace detail
4064
} // namespace mlir::iree_compiler::AMDAIE

0 commit comments

Comments
 (0)