Skip to content

Commit 4f0bf33

Browse files
authored
Improve and enable inlining pass (#966)
* improve inlining pass to inline single-use functions that are fairly small, which makes it useful for removing unnecessary global constructors from clang. add an inlining-optimizing pass that also optimizes where it inlined, as new opportunities arise. enable it by default in O2+
* fix a bug where we didn't run all passes properly - refactor addDefaultGlobalOptimizationPasses() into a pre and post version. we can only run the post version in incremental optimizing builds (functions appear one by one, we optimize them first, and do global stuff when all are done), but can run both when doing a full optimize
* copy in inlining, allowing multiple inlinings of the same function in the future
1 parent b93ea39 commit 4f0bf33

40 files changed

+1964
-1613
lines changed

src/pass.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,14 @@ struct PassRunner {
109109
void addDefaultFunctionOptimizationPasses();
110110

111111
// Adds the default optimization passes that work on
112-
// entire modules as a whole.
113-
void addDefaultGlobalOptimizationPasses();
112+
// entire modules as a whole, and make sense to
113+
// run before function passes.
114+
void addDefaultGlobalOptimizationPrePasses();
115+
116+
// Adds the default optimization passes that work on
117+
// entire modules as a whole, and make sense to
118+
// run after function passes.
119+
void addDefaultGlobalOptimizationPostPasses();
114120

115121
// Run the passes on the module
116122
void run();

src/passes/Inlining.cpp

Lines changed: 131 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18,45 +18,68 @@
1818
// Inlining.
1919
//
2020
// For now, this does a conservative inlining of all functions that have
21-
// exactly one use. That should not increase code size, and may have
22-
// speed benefits.
21+
// exactly one use, and are fairly small. That should not increase code
22+
// size, and may have speed benefits.
2323
//
2424

25+
#include <atomic>
26+
2527
#include <wasm.h>
2628
#include <pass.h>
2729
#include <wasm-builder.h>
30+
#include <ast_utils.h>
2831
#include <parsing.h>
2932

3033
namespace wasm {
3134

35+
// A limit on how big a function to inline.
36+
static const int INLINING_SIZE_LIMIT = 15;
37+
38+
// We only inline a function with a single use.
39+
static const int SINGLE_USE = 1;
40+
41+
// A number of uses of a function that is too high for us to
42+
// inline it to all those locations.
43+
static const int TOO_MANY_USES_TO_INLINE = SINGLE_USE + 1;
44+
45+
// Map of function name => number of uses. We build the values in
46+
// parallel, using atomic increments. This is safe because we never
47+
// update the map itself in parallel, we only update the values,
48+
// and so the map never allocates or moves values which could be
49+
// a problem with atomics (in fact it would be a problem in general
50+
// as well, not just with atomics, as we don't use a lock in
51+
// parallel access, we depend on the map itself being constant
52+
// when running multiple threads).
53+
typedef std::map<Name, std::atomic<Index>> NameToAtomicIndexMap;
54+
3255
struct FunctionUseCounter : public WalkerPass<PostWalker<FunctionUseCounter>> {
3356
bool isFunctionParallel() override { return true; }
3457

35-
FunctionUseCounter(std::map<Name, Index>* output) : output(output) {}
58+
FunctionUseCounter(NameToAtomicIndexMap* uses) : uses(uses) {}
3659

3760
FunctionUseCounter* create() override {
38-
return new FunctionUseCounter(output);
61+
return new FunctionUseCounter(uses);
3962
}
4063

4164
void visitCall(Call *curr) {
42-
(*output)[curr->target]++;
65+
assert(uses->count(curr->target) > 0); // can't add a new element in parallel
66+
(*uses)[curr->target]++;
4367
}
4468

4569
private:
46-
std::map<Name, Index>* output;
70+
NameToAtomicIndexMap* uses;
4771
};
4872

49-
struct Action {
50-
Call* call;
51-
Block* block; // the replacement for the call, into which we should inline
73+
struct InliningAction {
74+
Expression** callSite;
5275
Function* contents;
5376

54-
Action(Call* call, Block* block, Function* contents) : call(call), block(block), contents(contents) {}
77+
InliningAction(Expression** callSite, Function* contents) : callSite(callSite), contents(contents) {}
5578
};
5679

5780
struct InliningState {
5881
std::set<Name> canInline;
59-
std::map<Name, std::vector<Action>> actionsForFunction; // function name => actions that can be performed in it
82+
std::map<Name, std::vector<InliningAction>> actionsForFunction; // function name => actions that can be performed in it
6083
};
6184

6285
struct Planner : public WalkerPass<PostWalker<Planner>> {
@@ -68,12 +91,18 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
6891
return new Planner(state);
6992
}
7093

71-
void visitCall(Call *curr) {
72-
if (state->canInline.count(curr->target)) {
73-
auto* block = Builder(*getModule()).makeBlock();
74-
block->type = curr->type;
94+
void visitCall(Call* curr) {
95+
// plan to inline if we know this is valid to inline, and if the call is
96+
// actually performed - if it is dead code, it's pointless to inline
97+
if (state->canInline.count(curr->target) &&
98+
curr->type != unreachable) {
99+
// nest the call in a block. that way the location of the pointer to the call will not
100+
// change even if we inline multiple times into the same function, otherwise
101+
// call1(call2()) might be a problem
102+
auto* block = Builder(*getModule()).makeBlock(curr);
75103
replaceCurrent(block);
76-
state->actionsForFunction[getFunction()->name].emplace_back(curr, block, getModule()->getFunction(curr->target));
104+
assert(state->actionsForFunction.count(getFunction()->name) > 0); // can't add a new element in parallel
105+
state->actionsForFunction[getFunction()->name].emplace_back(&block->list[0], getModule()->getFunction(curr->target));
77106
}
78107
}
79108

@@ -91,13 +120,13 @@ struct Planner : public WalkerPass<PostWalker<Planner>> {
91120

92121
// Core inlining logic. Modifies the outside function (adding locals as
93122
// needed), and returns the inlined code.
94-
// Since we only inline once, and do not need the function afterwards, we
95-
// can just reuse all the nodes and even avoid copying.
96-
static Expression* doInlining(Module* module, Function* into, Action& action) {
123+
static Expression* doInlining(Module* module, Function* into, InliningAction& action) {
124+
auto* call = (*action.callSite)->cast<Call>();
97125
Builder builder(*module);
98-
auto* block = action.block;
126+
auto* block = Builder(*module).makeBlock();
127+
block->type = call->type;
99128
block->name = Name(std::string("__inlined_func$") + action.contents->name.str);
100-
block->type = action.contents->result;
129+
*action.callSite = block;
101130
// set up a locals mapping
102131
struct Updater : public PostWalker<Updater> {
103132
std::map<Index, Index> localMapping;
@@ -121,49 +150,59 @@ static Expression* doInlining(Module* module, Function* into, Action& action) {
121150
}
122151
// assign the operands into the params
123152
for (Index i = 0; i < action.contents->params.size(); i++) {
124-
block->list.push_back(builder.makeSetLocal(updater.localMapping[i], action.call->operands[i]));
153+
block->list.push_back(builder.makeSetLocal(updater.localMapping[i], call->operands[i]));
125154
}
126-
// update the inlined contents
127-
updater.walk(action.contents->body);
128-
block->list.push_back(action.contents->body);
129-
action.contents->body = builder.makeUnreachable(); // not strictly needed, since it's going away
155+
// generate and update the inlined contents
156+
auto* contents = ExpressionManipulator::copy(action.contents->body, *module);
157+
updater.walk(contents);
158+
block->list.push_back(contents);
130159
return block;
131160
}
132161

133162
struct Inlining : public Pass {
163+
// whether to optimize where we inline
164+
bool optimize = false;
165+
166+
NameToAtomicIndexMap uses;
167+
134168
void run(PassRunner* runner, Module* module) override {
135169
// keep going while we inline, to handle nesting. TODO: optimize
170+
calculateUses(module);
136171
while (iteration(runner, module)) {}
137172
}
138173

139-
bool iteration(PassRunner* runner, Module* module) {
140-
// Count uses
141-
std::map<Name, Index> uses;
174+
void calculateUses(Module* module) {
142175
// fill in uses, as we operate on it in parallel (each function to its own entry)
143176
for (auto& func : module->functions) {
144-
uses[func->name] = 0;
145-
}
146-
{
147-
PassRunner runner(module);
148-
runner.setIsNested(true);
149-
runner.add<FunctionUseCounter>(&uses);
150-
runner.run();
177+
uses[func->name].store(0);
151178
}
179+
PassRunner runner(module);
180+
runner.setIsNested(true);
181+
runner.add<FunctionUseCounter>(&uses);
182+
runner.run();
183+
// anything exported or used in a table should not be inlined
152184
for (auto& ex : module->exports) {
153185
if (ex->kind == ExternalKind::Function) {
154-
uses[ex->value] = 2; // too many, so we ignore it
186+
uses[ex->value].store(TOO_MANY_USES_TO_INLINE);
155187
}
156188
}
157189
for (auto& segment : module->table.segments) {
158190
for (auto name : segment.data) {
159-
uses[name]++;
191+
if (module->getFunctionOrNull(name)) {
192+
uses[name].store(TOO_MANY_USES_TO_INLINE);
193+
}
160194
}
161195
}
196+
}
197+
198+
bool iteration(PassRunner* runner, Module* module) {
162199
// decide which to inline
163200
InliningState state;
164-
for (auto iter : uses) {
165-
if (iter.second == 1) {
166-
state.canInline.insert(iter.first);
201+
for (auto& func : module->functions) {
202+
auto name = func->name;
203+
auto numUses = uses[name].load();
204+
if (canInline(numUses) && worthInlining(module->getFunction(name))) {
205+
state.canInline.insert(name);
167206
}
168207
}
169208
// fill in actionsForFunction, as we operate on it in parallel (each function to its own entry)
@@ -182,15 +221,21 @@ struct Inlining : public Pass {
182221
std::set<Function*> inlinedInto;
183222
for (auto& func : module->functions) {
184223
for (auto& action : state.actionsForFunction[func->name]) {
224+
Name inlinedName = action.contents->name;
185225
doInlining(module, func.get(), action);
186-
inlined.insert(action.contents->name);
226+
inlined.insert(inlinedName);
187227
inlinedInto.insert(func.get());
228+
uses[inlinedName]--;
229+
assert(uses[inlinedName].load() == 0);
188230
}
189231
}
190232
// anything we inlined into may now have non-unique label names, fix it up
191233
for (auto func : inlinedInto) {
192234
wasm::UniqueNameMapper::uniquify(func->body);
193235
}
236+
if (optimize && inlinedInto.size() > 0) {
237+
doOptimize(inlinedInto, module, runner);
238+
}
194239
// remove functions that we managed to inline, their one use is gone
195240
auto& funcs = module->functions;
196241
funcs.erase(std::remove_if(funcs.begin(), funcs.end(), [&inlined](const std::unique_ptr<Function>& curr) {
@@ -199,11 +244,55 @@ struct Inlining : public Pass {
199244
// return whether we did any work
200245
return inlined.size() > 0;
201246
}
247+
248+
bool canInline(int numUses) {
249+
return numUses == SINGLE_USE;
250+
}
251+
252+
bool worthInlining(Function* func) {
253+
return Measurer::measure(func->body) <= INLINING_SIZE_LIMIT;
254+
}
255+
256+
// Run useful optimizations after inlining, things like removing
257+
// unnecessary new blocks, sharing variables, etc.
258+
void doOptimize(std::set<Function*>& funcs, Module* module, PassRunner* parentRunner) {
259+
// save the full list of functions on the side
260+
std::vector<std::unique_ptr<Function>> all;
261+
all.swap(module->functions);
262+
module->updateMaps();
263+
for (auto& func : funcs) {
264+
module->addFunction(func);
265+
}
266+
PassRunner runner(module, parentRunner->options);
267+
runner.setIsNested(true);
268+
runner.setValidateGlobally(false); // not a full valid module
269+
runner.add("remove-unused-brs");
270+
runner.add("remove-unused-names");
271+
runner.add("coalesce-locals");
272+
runner.add("simplify-locals");
273+
runner.add("vacuum");
274+
runner.add("reorder-locals");
275+
runner.add("remove-unused-brs");
276+
runner.add("merge-blocks");
277+
runner.run();
278+
// restore all the funcs
279+
for (auto& func : module->functions) {
280+
func.release();
281+
}
282+
all.swap(module->functions);
283+
module->updateMaps();
284+
}
202285
};
203286

204287
Pass *createInliningPass() {
205288
return new Inlining();
206289
}
207290

291+
Pass *createInliningOptimizingPass() {
292+
auto* ret = new Inlining();
293+
ret->optimize = true;
294+
return ret;
295+
}
296+
208297
} // namespace wasm
209298

src/passes/pass.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ void PassRegistry::registerPasses() {
7373
registerPass("extract-function", "leaves just one function (useful for debugging)", createExtractFunctionPass);
7474
registerPass("flatten-control-flow", "flattens out control flow to be only on blocks, not nested as expressions", createFlattenControlFlowPass);
7575
registerPass("inlining", "inlines functions (currently only ones with a single use)", createInliningPass);
76+
registerPass("inlining-optimizing", "inlines functions (currently only ones with a single use) and optimizes where we inlined", createInliningOptimizingPass);
7677
registerPass("legalize-js-interface", "legalizes i64 types on the import/export boundary", createLegalizeJSInterfacePass);
7778
registerPass("local-cse", "common subexpression elimination inside basic blocks", createLocalCSEPass);
7879
registerPass("log-execution", "instrument the build with logging of where execution goes", createLogExecutionPass);
@@ -110,11 +111,9 @@ void PassRegistry::registerPasses() {
110111
}
111112

112113
void PassRunner::addDefaultOptimizationPasses() {
113-
add("duplicate-function-elimination");
114+
addDefaultGlobalOptimizationPrePasses();
114115
addDefaultFunctionOptimizationPasses();
115-
add("duplicate-function-elimination"); // optimizations show more functions as duplicate
116-
add("remove-unused-module-elements");
117-
add("memory-packing");
116+
addDefaultGlobalOptimizationPostPasses();
118117
}
119118

120119
void PassRunner::addDefaultFunctionOptimizationPasses() {
@@ -154,9 +153,16 @@ void PassRunner::addDefaultFunctionOptimizationPasses() {
154153
add("vacuum"); // just to be safe
155154
}
156155

157-
void PassRunner::addDefaultGlobalOptimizationPasses() {
156+
void PassRunner::addDefaultGlobalOptimizationPrePasses() {
158157
add("duplicate-function-elimination");
158+
}
159+
160+
void PassRunner::addDefaultGlobalOptimizationPostPasses() {
161+
add("duplicate-function-elimination"); // optimizations show more functions as duplicate
159162
add("remove-unused-module-elements");
163+
if (options.optimizeLevel >= 2 || options.shrinkLevel >= 2) {
164+
add("inlining-optimizing");
165+
}
160166
add("memory-packing");
161167
}
162168

src/passes/passes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Pass *createExtractFunctionPass();
3232
Pass *createFlattenControlFlowPass();
3333
Pass *createFullPrinterPass();
3434
Pass *createInliningPass();
35+
Pass *createInliningOptimizingPass();
3536
Pass *createLegalizeJSInterfacePass();
3637
Pass *createLocalCSEPass();
3738
Pass *createLogExecutionPass();

src/tools/wasm-merge.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,7 @@ int main(int argc, const char* argv[]) {
622622
PassRunner passRunner(&output);
623623
passRunner.add("precompute");
624624
passRunner.add("optimize-instructions"); // things now-constant may be further optimized
625-
passRunner.addDefaultGlobalOptimizationPasses();
625+
passRunner.addDefaultGlobalOptimizationPostPasses();
626626
passRunner.run();
627627
}
628628

src/wasm-module-building.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ class OptimizingIncrementalModuleBuilder {
165165
}
166166
addPrePasses(passRunner);
167167
passRunner.addDefaultFunctionOptimizationPasses();
168-
passRunner.addDefaultGlobalOptimizationPasses();
168+
passRunner.addDefaultGlobalOptimizationPostPasses();
169169
passRunner.run();
170170
return;
171171
}
@@ -226,7 +226,7 @@ class OptimizingIncrementalModuleBuilder {
226226

227227
void optimizeGlobally() {
228228
PassRunner passRunner(wasm, passOptions);
229-
passRunner.addDefaultGlobalOptimizationPasses();
229+
passRunner.addDefaultGlobalOptimizationPostPasses();
230230
passRunner.run();
231231
}
232232

0 commit comments

Comments
 (0)