|
15 | 15 | */
|
16 | 16 |
|
17 | 17 | #include "ir/names.h"
|
| 18 | +#include "ir/stack-utils.h" |
18 | 19 | #include "ir/utils.h"
|
19 | 20 | #include "pass.h"
|
20 | 21 | #include "passes/stringify-walker.h"
|
| 22 | +#include "support/intervals.h" |
21 | 23 | #include "support/suffix_tree.h"
|
| 24 | +#include "wasm-ir-builder.h" |
22 | 25 | #include "wasm.h"
|
23 | 26 |
|
24 | 27 | #define OUTLINING_DEBUG 0
|
|
37 | 40 |
|
38 | 41 | namespace wasm {
|
39 | 42 |
|
| 43 | +// This custom hasher conforms to std::hash<Key>. Its purpose is to provide |
| 44 | +// a custom hash for if expressions, so the if-condition of the if expression is |
| 45 | +// not included in the hash for the if expression. This is needed because in the |
| 46 | +// binary format, the if-condition comes before and is consumed by the if. To |
| 47 | +// match the binary format, we hash the if condition before and separately from |
| 48 | +// the rest of the if expression. |
| 49 | +struct StringifyHasher { |
| 50 | + size_t operator()(Expression* curr) const { |
| 51 | + if (Properties::isControlFlowStructure(curr)) { |
| 52 | + if (auto* iff = curr->dynCast<If>()) { |
| 53 | + size_t digest = wasm::hash(iff->_id); |
| 54 | + rehash(digest, ExpressionAnalyzer::hash(iff->ifTrue)); |
| 55 | + if (iff->ifFalse) { |
| 56 | + rehash(digest, ExpressionAnalyzer::hash(iff->ifFalse)); |
| 57 | + } |
| 58 | + return digest; |
| 59 | + } |
| 60 | + |
| 61 | + return ExpressionAnalyzer::hash(curr); |
| 62 | + } |
| 63 | + |
| 64 | + return ExpressionAnalyzer::shallowHash(curr); |
| 65 | + } |
| 66 | +}; |
| 67 | + |
| 68 | +// This custom equator conforms to std::equal_to<Key>. Similar to |
| 69 | +// StringifyHasher, it's purpose is to not include the if-condition when |
| 70 | +// evaluating the equality of two if expressions. |
| 71 | +struct StringifyEquator { |
| 72 | + bool operator()(Expression* lhs, Expression* rhs) const { |
| 73 | + if (Properties::isControlFlowStructure(lhs) && |
| 74 | + Properties::isControlFlowStructure(rhs)) { |
| 75 | + auto* iffl = lhs->dynCast<If>(); |
| 76 | + auto* iffr = rhs->dynCast<If>(); |
| 77 | + |
| 78 | + if (iffl && iffr) { |
| 79 | + return ExpressionAnalyzer::equal(iffl->ifTrue, iffr->ifTrue) && |
| 80 | + ExpressionAnalyzer::equal(iffl->ifFalse, iffr->ifFalse); |
| 81 | + } |
| 82 | + |
| 83 | + return ExpressionAnalyzer::equal(lhs, rhs); |
| 84 | + } |
| 85 | + |
| 86 | + return ExpressionAnalyzer::shallowEqual(lhs, rhs); |
| 87 | + } |
| 88 | +}; |
| 89 | + |
| 90 | +struct HashStringifyWalker : public StringifyWalker<HashStringifyWalker> { |
| 91 | + // After calling walkModule, this vector contains the result of encoding a |
| 92 | + // wasm module as a string of uint32_t values. Each value represents either an |
| 93 | + // Expression or a separator to mark the end of control flow. |
| 94 | + std::vector<uint32_t> hashString; |
| 95 | + // A monotonic counter used to ensure that unique expressions in the |
| 96 | + // module are assigned a unique value in the hashString. |
| 97 | + uint32_t nextVal = 0; |
| 98 | + // A monotonic counter used to ensure that each separator in the |
| 99 | + // module is assigned a unique value in the hashString. |
| 100 | + int32_t nextSeparatorVal = -1; |
| 101 | + // Contains a mapping of expression pointer to value to ensure we |
| 102 | + // use the same value for matching expressions. A custom hasher and |
| 103 | + // equator is provided in order to separate out evaluation of the if-condition |
| 104 | + // when evaluating if expressions. |
| 105 | + std::unordered_map<Expression*, uint32_t, StringifyHasher, StringifyEquator> |
| 106 | + exprToCounter; |
| 107 | + std::vector<Expression*> exprs; |
| 108 | + |
| 109 | + void addUniqueSymbol(SeparatorReason reason); |
| 110 | + void visitExpression(Expression* curr); |
| 111 | + // Converts the idx from relative to the beginning of the program to |
| 112 | + // relative to its enclosing function, and returns the name of its function. |
| 113 | + std::pair<uint32_t, Name> makeRelative(uint32_t idx) const; |
| 114 | + |
| 115 | +private: |
| 116 | + // Contains the indices that mark the start of each function. |
| 117 | + std::set<uint32_t> funcIndices; |
| 118 | + // Maps the start idx of each function to the function name. |
| 119 | + std::map<uint32_t, Name> idxToFuncName; |
| 120 | +}; |
| 121 | + |
| 122 | +void HashStringifyWalker::addUniqueSymbol(SeparatorReason reason) { |
| 123 | + // Use a negative value to distinguish symbols for separators from symbols |
| 124 | + // for Expressions |
| 125 | + assert((uint32_t)nextSeparatorVal >= nextVal); |
| 126 | + if (auto funcStart = reason.getFuncStart()) { |
| 127 | + idxToFuncName.insert({hashString.size(), funcStart->func->name}); |
| 128 | + } |
| 129 | + hashString.push_back((uint32_t)nextSeparatorVal); |
| 130 | + nextSeparatorVal--; |
| 131 | + exprs.push_back(nullptr); |
| 132 | +} |
| 133 | + |
| 134 | +void HashStringifyWalker::visitExpression(Expression* curr) { |
| 135 | + auto [it, inserted] = exprToCounter.insert({curr, nextVal}); |
| 136 | + hashString.push_back(it->second); |
| 137 | + exprs.push_back(curr); |
| 138 | + if (inserted) { |
| 139 | + nextVal++; |
| 140 | + } |
| 141 | +} |
| 142 | + |
| 143 | +std::pair<uint32_t, Name> |
| 144 | +HashStringifyWalker::makeRelative(uint32_t idx) const { |
| 145 | + // The upper_bound function returns an iterator to the first value in the set |
| 146 | + // that is true for idx < value. We subtract one from this returned value to |
| 147 | + // tell us which function actually contains the the idx. |
| 148 | + auto [funcIdx, func] = *--idxToFuncName.upper_bound(idx); |
| 149 | + return {idx - funcIdx, func}; |
| 150 | +} |
| 151 | + |
| 152 | +using Substrings = std::vector<SuffixTree::RepeatedSubstring>; |
| 153 | + |
| 154 | +// Functions that filter vectors of SuffixTree::RepeatedSubstring |
| 155 | +struct StringifyProcessor { |
| 156 | + static Substrings repeatSubstrings(std::vector<uint32_t>& hashString); |
| 157 | + static Substrings dedupe(const Substrings& substrings); |
| 158 | + static Substrings filterOverlaps(const Substrings& substrings); |
| 159 | + // Filter is the general purpose function backing subsequent filter functions. |
| 160 | + // It can be used directly, but generally prefer a wrapper function |
| 161 | + // to encapsulate your condition and make it available for tests. |
| 162 | + static Substrings filter(const Substrings& substrings, |
| 163 | + const std::vector<Expression*>& exprs, |
| 164 | + std::function<bool(const Expression*)> condition); |
| 165 | + static Substrings filterLocalSets(const Substrings& substrings, |
| 166 | + const std::vector<Expression*>& exprs); |
| 167 | + static Substrings filterLocalGets(const Substrings& substrings, |
| 168 | + const std::vector<Expression*>& exprs); |
| 169 | + static Substrings filterBranches(const Substrings& substrings, |
| 170 | + const std::vector<Expression*>& exprs); |
| 171 | +}; |
| 172 | + |
| 173 | +std::vector<SuffixTree::RepeatedSubstring> |
| 174 | +StringifyProcessor::repeatSubstrings(std::vector<uint32_t>& hashString) { |
| 175 | + SuffixTree st(hashString); |
| 176 | + std::vector<SuffixTree::RepeatedSubstring> substrings(st.begin(), st.end()); |
| 177 | + for (auto& substring : substrings) { |
| 178 | + // Sort by increasing start index to ensure determinism. |
| 179 | + std::sort(substring.StartIndices.begin(), substring.StartIndices.end()); |
| 180 | + } |
| 181 | + // Substrings are sorted so that the longest substring that repeats the most |
| 182 | + // times is ordered first. This is done so that we can assume the most |
| 183 | + // worthwhile substrings to outline come first. |
| 184 | + std::sort( |
| 185 | + substrings.begin(), |
| 186 | + substrings.end(), |
| 187 | + [](SuffixTree::RepeatedSubstring a, SuffixTree::RepeatedSubstring b) { |
| 188 | + size_t aWeight = a.Length * a.StartIndices.size(); |
| 189 | + size_t bWeight = b.Length * b.StartIndices.size(); |
| 190 | + if (aWeight == bWeight) { |
| 191 | + return a.StartIndices[0] < b.StartIndices[0]; |
| 192 | + } |
| 193 | + return aWeight > bWeight; |
| 194 | + }); |
| 195 | + return substrings; |
| 196 | +} |
| 197 | + |
| 198 | +// Deduplicate substrings by iterating through the list of substrings, keeping |
| 199 | +// only those whose list of end indices is disjoint from the set of end indices |
| 200 | +// for all substrings kept so far. Substrings that are contained within other |
| 201 | +// substrings will always share an end index with those other substrings. Note |
| 202 | +// that this deduplication may be over-aggressive, since it will remove |
| 203 | +// substrings that are contained within any previous substring, even if they |
| 204 | +// have many other occurrences that are not inside other substrings. Part of the |
| 205 | +// reason dedupe can be so aggressive is an assumption 1) that the input |
| 206 | +// substrings have been sorted so that the longest substrings with the most |
| 207 | +// repeats come first and 2) these are more worthwhile to keep than subsequent |
| 208 | +// substrings of substrings, even if they appear more times. |
| 209 | +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::dedupe( |
| 210 | + const std::vector<SuffixTree::RepeatedSubstring>& substrings) { |
| 211 | + std::unordered_set<uint32_t> seen; |
| 212 | + std::vector<SuffixTree::RepeatedSubstring> result; |
| 213 | + for (auto substring : substrings) { |
| 214 | + std::vector<uint32_t> idxToInsert; |
| 215 | + bool seenEndIdx = false; |
| 216 | + for (auto startIdx : substring.StartIndices) { |
| 217 | + // We are using the end index to ensure that each repeated substring |
| 218 | + // reported by the SuffixTree is unique. This is because LLVM's SuffixTree |
| 219 | + // reports back repeat sequences that are substrings of longer repeat |
| 220 | + // sequences with the same endIdx, and we generally prefer to outline |
| 221 | + // longer repeat sequences. |
| 222 | + uint32_t endIdx = substring.Length + startIdx; |
| 223 | + if (seen.count(endIdx)) { |
| 224 | + seenEndIdx = true; |
| 225 | + break; |
| 226 | + } |
| 227 | + idxToInsert.push_back(endIdx); |
| 228 | + } |
| 229 | + if (!seenEndIdx) { |
| 230 | + seen.insert(idxToInsert.begin(), idxToInsert.end()); |
| 231 | + result.push_back(substring); |
| 232 | + } |
| 233 | + } |
| 234 | + |
| 235 | + return result; |
| 236 | +} |
| 237 | + |
| 238 | +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::filterOverlaps( |
| 239 | + const std::vector<SuffixTree::RepeatedSubstring>& substrings) { |
| 240 | + // A substring represents a contiguous set of instructions that appear more |
| 241 | + // than once in a Wasm binary. For each appearance of the substring, an |
| 242 | + // Interval is created that lacks a connection back to its originating |
| 243 | + // substring. To fix, upon Interval creation, a second vector is populated |
| 244 | + // with the index of the corresponding substring. |
| 245 | + std::vector<Interval> intervals; |
| 246 | + std::vector<int> substringIdxs; |
| 247 | + |
| 248 | + // Construct intervals |
| 249 | + for (Index i = 0; i < substrings.size(); i++) { |
| 250 | + auto& substring = substrings[i]; |
| 251 | + for (auto startIdx : substring.StartIndices) { |
| 252 | + intervals.emplace_back( |
| 253 | + startIdx, startIdx + substring.Length, substring.Length); |
| 254 | + substringIdxs.push_back(i); |
| 255 | + } |
| 256 | + } |
| 257 | + |
| 258 | + // Get the overlapping intervals |
| 259 | + std::vector<SuffixTree::RepeatedSubstring> result; |
| 260 | + std::vector<std::vector<Index>> startIndices(substrings.size()); |
| 261 | + std::vector<int> indices = IntervalProcessor::filterOverlaps(intervals); |
| 262 | + for (auto i : indices) { |
| 263 | + // i is the idx of the Interval in the intervals vector |
| 264 | + // i in substringIdxs returns the idx of the substring that needs to be |
| 265 | + // included in result |
| 266 | + auto substringIdx = substringIdxs[i]; |
| 267 | + startIndices[substringIdx].push_back(intervals[i].start); |
| 268 | + } |
| 269 | + for (Index i = 0; i < startIndices.size(); i++) { |
| 270 | + if (startIndices[i].size() > 1) { |
| 271 | + result.emplace_back(SuffixTree::RepeatedSubstring( |
| 272 | + {substrings[i].Length, std::move(startIndices[i])})); |
| 273 | + } |
| 274 | + } |
| 275 | + |
| 276 | + return result; |
| 277 | +} |
| 278 | + |
| 279 | +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::filter( |
| 280 | + const std::vector<SuffixTree::RepeatedSubstring>& substrings, |
| 281 | + const std::vector<Expression*>& exprs, |
| 282 | + std::function<bool(const Expression*)> condition) { |
| 283 | + |
| 284 | + struct FilterStringifyWalker : public StringifyWalker<FilterStringifyWalker> { |
| 285 | + bool hasFilterValue = false; |
| 286 | + std::function<bool(const Expression*)> condition; |
| 287 | + |
| 288 | + FilterStringifyWalker(std::function<bool(const Expression*)> condition) |
| 289 | + : condition(condition){}; |
| 290 | + |
| 291 | + void walk(Expression* curr) { |
| 292 | + hasFilterValue = false; |
| 293 | + Super::walk(curr); |
| 294 | + flushControlFlowQueue(); |
| 295 | + } |
| 296 | + |
| 297 | + void addUniqueSymbol(SeparatorReason reason) {} |
| 298 | + |
| 299 | + void visitExpression(Expression* curr) { |
| 300 | + if (condition(curr)) { |
| 301 | + hasFilterValue = true; |
| 302 | + } |
| 303 | + } |
| 304 | + }; |
| 305 | + |
| 306 | + FilterStringifyWalker walker(condition); |
| 307 | + |
| 308 | + std::vector<SuffixTree::RepeatedSubstring> result; |
| 309 | + for (auto substring : substrings) { |
| 310 | + bool hasFilterValue = false; |
| 311 | + for (auto idx = substring.StartIndices[0], |
| 312 | + endIdx = substring.StartIndices[0] + substring.Length; |
| 313 | + idx < endIdx; |
| 314 | + idx++) { |
| 315 | + Expression* curr = exprs[idx]; |
| 316 | + if (Properties::isControlFlowStructure(curr)) { |
| 317 | + walker.walk(curr); |
| 318 | + if (walker.hasFilterValue) { |
| 319 | + hasFilterValue = true; |
| 320 | + break; |
| 321 | + } |
| 322 | + } |
| 323 | + if (condition(curr)) { |
| 324 | + hasFilterValue = true; |
| 325 | + break; |
| 326 | + } |
| 327 | + } |
| 328 | + if (!hasFilterValue) { |
| 329 | + result.push_back(substring); |
| 330 | + } |
| 331 | + } |
| 332 | + |
| 333 | + return result; |
| 334 | +} |
| 335 | + |
| 336 | +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::filterLocalSets( |
| 337 | + const std::vector<SuffixTree::RepeatedSubstring>& substrings, |
| 338 | + const std::vector<Expression*>& exprs) { |
| 339 | + return StringifyProcessor::filter( |
| 340 | + substrings, exprs, [](const Expression* curr) { |
| 341 | + return curr->is<LocalSet>(); |
| 342 | + }); |
| 343 | +} |
| 344 | + |
| 345 | +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::filterLocalGets( |
| 346 | + const std::vector<SuffixTree::RepeatedSubstring>& substrings, |
| 347 | + const std::vector<Expression*>& exprs) { |
| 348 | + return StringifyProcessor::filter( |
| 349 | + substrings, exprs, [](const Expression* curr) { |
| 350 | + return curr->is<LocalGet>(); |
| 351 | + }); |
| 352 | +} |
| 353 | + |
| 354 | +std::vector<SuffixTree::RepeatedSubstring> StringifyProcessor::filterBranches( |
| 355 | + const std::vector<SuffixTree::RepeatedSubstring>& substrings, |
| 356 | + const std::vector<Expression*>& exprs) { |
| 357 | + return StringifyProcessor::filter( |
| 358 | + substrings, exprs, [](const Expression* curr) { |
| 359 | + return Properties::isBranch(curr) || curr->is<Return>() || |
| 360 | + curr->is<TryTable>(); |
| 361 | + }); |
| 362 | +} |
| 363 | + |
40 | 364 | struct OutliningSequence {
|
41 | 365 | unsigned startIdx;
|
42 | 366 | unsigned endIdx;
|
|
0 commit comments