|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# Copyright 2025 The Emscripten Authors. All rights reserved. |
| 3 | +# Emscripten is available under two separate licenses, the MIT license and the |
| 4 | +# University of Illinois/NCSA Open Source License. Both these licenses can be |
| 5 | +# found in the LICENSE file. |
| 6 | + |
| 7 | +""" |
| 8 | +Wrapper for 'wasm-split --multi-split' functionality. This script generates a |
| 9 | +.manifest file based on the list of user source paths, using source map |
| 10 | +information. |
| 11 | +
|
| 12 | +This assumes the name section exists in the input wasm file, and also assumes |
| 13 | +the sourceMappingURL section exists in the input or a source map file is |
| 14 | +separately supplied with --sourcemap. If we have two files a.c and b.c, to |
| 15 | +generate a source map and the name section, if you compile and link within a |
| 16 | +single command, you can do something like |
| 17 | +$ emcc -g2 -gsource-map a.c b.c -o result.js |
| 18 | +If you want to compile and link in separate commands, you can do |
| 19 | +$ emcc -gsource-map a.c -o a.o |
| 20 | +$ emcc -gsource-map b.c -o b.o |
| 21 | +$ emcc -g2 -gsource-map a.o b.o -o result.js |
| 22 | +See https://emscripten.org/docs/porting/Debugging.html for more details. |
| 23 | +
|
| 24 | +This takes a wasm file and a paths file, which is a text file containing a list |
| 25 | +of paths as inputs. The paths file should contain a single path per line. A |
| 26 | +single split module will be generated per specified path. If a specified path |
| 27 | +contains another specified path, functions contained in the inner path will be |
| 28 | +split as the inner path's module, and the rest of the functions will be split as |
| 29 | +the outer path's module. Functions that do not belong to any of the specified |
| 30 | +paths will remain in the primary module. |
| 31 | +""" |
| 32 | + |
| 33 | +import argparse |
| 34 | +import os |
| 35 | +import sys |
| 36 | +import tempfile |
| 37 | +from pathlib import PurePath |
| 38 | + |
# Put the parent of this script's directory at the front of sys.path so that
# the `tools` package imported below resolves when this file is run directly.
__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
__rootdir__ = os.path.dirname(__scriptdir__)
sys.path.insert(0, __rootdir__)
| 42 | + |
| 43 | +from tools import building |
| 44 | +from tools import diagnostics |
| 45 | +from tools import emsymbolizer |
| 46 | +from tools import shared |
| 47 | +from tools import utils |
| 48 | +from tools import webassembly |
| 49 | +from tools.utils import exit_with_error |
| 50 | + |
| 51 | + |
def parse_args():
  """Parse this script's command line.

  Options that this script does not recognize are collected separately so that
  they can be forwarded verbatim to wasm-split.

  Returns:
    A (args, forwarded_args) tuple: the argparse namespace holding this
    script's own options, and the list of unrecognized arguments.

  Exits with an error if a manifest option is among the forwarded arguments,
  because the manifest is generated by this script.
  """
  parser = argparse.ArgumentParser(
      description='Split a wasm file based on user paths',
      epilog="""
This is a wrapper for 'wasm-split --multi-split' functionality, so you should
add wasm-split's command line options as well. You should or may want to add
wasm-split options like -o (--output), --out-prefix, -g, and feature
enabling/disabling options. Run 'wasm-split -h' for the list of options. But you
should NOT add --manifest, because this will be generated from this script.
""")
  parser.add_argument('wasm', help='Path to the input wasm file')
  parser.add_argument('paths_file', help='Path to the input file containing paths')
  parser.add_argument('-s', '--sourcemap', help='Force source map file')
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='Print verbose info for debugging this script')
  parser.add_argument('--wasm-split', help='Path to wasm-split executable')
  parser.add_argument('--preserve-manifest', action='store_true',
                      help='Preserve generated manifest file. This sets --verbose too.')
  args, forwarded_args = parser.parse_known_args()

  # Reject both the '--manifest FILE' and the '--manifest=FILE' spellings; the
  # latter would otherwise slip through and clash with the generated manifest.
  if any(arg == '--manifest' or arg.startswith('--manifest=') for arg in forwarded_args):
    exit_with_error('manifest file will be generated by this script and should not be given')

  # The preserved manifest is only useful together with the verbose output that
  # prints the wasm-split command line containing its path.
  if args.preserve_manifest:
    args.verbose = True
  return args, forwarded_args
| 76 | + |
| 77 | + |
def get_path_to_functions_map(wasm, sourcemap, paths):
  """Map each user-specified path to the functions whose source lies under it.

  Args:
    wasm: Path to the input wasm file. It must contain a name section.
    sourcemap: Source map to use, or a falsy value to use the one referenced
      by the wasm file's sourceMappingURL section.
    paths: Paths (source files or directories) to split by.

  Returns:
    A dict mapping every entry of `paths` to a list of function names. When
    one given path contains another, a function is assigned to the inner path
    only. Functions whose source is not under any given path are omitted.

  Exits with an error if the name section is missing, or if no source map is
  given and the sourceMappingURL section is missing.
  """
  # TODO There can be more
  synthesized_names = (
    'main',
    '__wasm_call_ctors',
    '__clang_call_terminate',
  )
  synthesized_prefixes = (
    'legalstub$',
    'legalfunc$',
    '__cxx_global_',
    '_GLOBAL__',
    'virtual thunk to ',
  )

  def is_synthesized_func(func):
    # Functions matching these names are not warned about when they have no
    # source file information.
    return func in synthesized_names or func.startswith(synthesized_prefixes)

  # Compute {func_name: src file} map, and invert it to get
  # {src file: list of functions} map, and construct {path: list of functions}
  # map from it
  func_to_src = {}
  with webassembly.Module(wasm) as module:
    if not module.has_name_section():
      exit_with_error('Name section does not exist')
    if not sourcemap and not emsymbolizer.get_sourceMappingURL_section(module):
      exit_with_error('sourceMappingURL section does not exist')

    funcs = module.get_functions()
    func_names = module.get_function_names()
    assert len(funcs) == len(func_names)

    if not sourcemap:
      sourcemap = module.get_sourceMappingURL()
    sm = emsymbolizer.WasmSourceMap()
    sm.parse(sourcemap)

    for func_name, func in zip(func_names, funcs):
      # From the last address, decrement the address by 1 until we find location
      # info with source file information. The reason we do this is to reduce
      # the probability of picking an address where another function is inlined
      # into, picking the inlined function's source.
      # We start from the end because it is simpler; it is harder to compute the
      # first instruction's address, because there is a gap for local types
      # between function offset and the first instruction.
      #
      # Reset the result for every function: if the loop body never runs (the
      # function spans at most one byte), a location left over from the
      # previous function must not be reused.
      loc = None
      addr = func.offset + func.size - 1
      while addr > func.offset:
        loc = sm.lookup(addr, func.offset)
        # A missing location means there are no source map mappings for the
        # entire function (because we give func.offset as a lower bound). A
        # location with a source file is what we are looking for. Stop in both
        # cases; otherwise continue the search.
        if not loc or loc.source:
          break
        addr -= 1

      if loc and loc.source:
        func_to_src[func_name] = utils.normalize_path(loc.source)
      elif not is_synthesized_func(func_name):
        diagnostics.warn(f"No source file information found in the source map for function '{func_name}'")

  src_to_funcs = {}
  for func_name, src in func_to_src.items():
    src_to_funcs.setdefault(src, []).append(func_name)

  # For every source file, precompute the file itself plus all of its ancestor
  # directories once, instead of rebuilding them for every user path.
  src_ancestors = {}
  for src in src_to_funcs:
    psrc = PurePath(src)
    src_ancestors[src] = {psrc, *psrc.parents}

  # Visit paths in the reverse sorting order, so that we can process inner paths
  # first.
  # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign
  # functions contained in /a/b/c to it first and assign the remaining functions
  # to /a/b.
  visited_funcs = set()
  path_to_funcs = {}
  for path in sorted(paths, reverse=True):
    ppath = PurePath(path)
    path_to_funcs[path] = []
    for src, src_funcs in src_to_funcs.items():
      if ppath not in src_ancestors[src]:
        continue
      for func in src_funcs:
        if func not in visited_funcs:
          visited_funcs.add(func)
          path_to_funcs[path].append(func)
  return path_to_funcs
| 169 | + |
| 170 | + |
def main():
  """Generate a wasm-split manifest from the paths file and run wasm-split.

  Reads the list of paths from the paths file, computes which functions belong
  to each path, writes them to a temporary .manifest file (one module per path,
  named by the path's index in the list), and invokes
  'wasm-split --multi-split' with that manifest plus all forwarded arguments.

  The manifest file is removed afterwards unless --preserve-manifest was given.
  Exits with an error if any of the input files or the wasm-split executable
  is missing.
  """
  args, forwarded_args = parse_args()
  wasm_split = args.wasm_split or os.path.join(building.get_binaryen_bin(), 'wasm-split')

  required_files = [args.wasm, args.paths_file]
  if args.sourcemap:
    required_files.append(args.sourcemap)
  required_files.append(wasm_split)
  for required_file in required_files:
    if not os.path.isfile(required_file):
      exit_with_error(f"'{required_file}' was not found or not a file")

  paths = []
  for line in utils.read_file(args.paths_file).splitlines():
    line = line.strip()
    if not line:
      continue
    path = utils.normalize_path(line)
    # To make /a/b/c and /a/b/c/ equivalent. A path made only of separators
    # (the root directory) is kept as is instead of becoming an empty string.
    paths.append(path.rstrip(os.sep) or path)
  # Remove duplicates
  paths = list(dict.fromkeys(paths))

  # Compute {path: list of functions} map
  path_to_funcs = get_path_to_functions_map(args.wasm, args.sourcemap, paths)

  # Write .manifest file. The file is created with delete=False and closed
  # before wasm-split runs, because a NamedTemporaryFile that is still open
  # cannot be reopened by name on every platform. It is removed explicitly
  # below unless the user asked to keep it.
  f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False)
  manifest = f.name
  try:
    with f:
      for i, path in enumerate(paths):
        f.write(f'{i}\n')
        if not path_to_funcs[path]:
          diagnostics.warn(f'{path} does not match any functions')
        if args.verbose:
          print(path)
          for func in path_to_funcs[path]:
            print(' ' + func)
          print()
        for func in path_to_funcs[path]:
          f.write(func + '\n')
        # Modules are separated by an empty line
        if i < len(paths) - 1:
          f.write('\n')

    cmd = [wasm_split, '--multi-split', args.wasm, '--manifest', manifest]
    if args.verbose:
      # This option is used both in this script and wasm-split
      cmd.append('-v')
    cmd += forwarded_args
    if args.verbose:
      print('\n' + ' '.join(cmd))
    shared.run_process(cmd)
  finally:
    if not args.preserve_manifest:
      os.remove(manifest)
| 225 | + |
# Script entry point: use main()'s return value as the process exit status.
if __name__ == '__main__':
  sys.exit(main())
0 commit comments