Skip to content

Commit d801296

Browse files
authored
Add script to split module based on source paths (#25278)
This adds a script, `tools/empath-split.py`, which is a wrapper for Binaryen's `wasm-split`. `wasm-split` has a `--multi-split` mode, which takes a manifest file that lists the names of the functions in each module. (Example: https://github.com/WebAssembly/binaryen/blob/main/test/lit/wasm-split/multi-split.wast.manifest) But listing all functions belonging to each module is a tedious process. `empath-split` takes a wasm file and a text file that has a list of paths, which can be either directories or files, and, using the source map information, generates a manifest file and runs `wasm-split`. This also adds a small drive-by fix for `emsymbolizer`. Currently, when it is given an address of 0, it returns the location info associated with `offsets[-1]`, which is the largest offset. This fixes that, and adds an optional `lower_bound` argument to `find_offset` so that when we want to get a source info entry, we don't go below the current function's start offset.
1 parent aae1214 commit d801296

File tree

6 files changed

+333
-10
lines changed

6 files changed

+333
-10
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ emdump
4646
emdwp
4747
emmake
4848
emnm
49+
empath-split
4950
emprofile
5051
emranlib
5152
emrun
@@ -67,6 +68,7 @@ emdump.bat
6768
emdwp.bat
6869
emmake.bat
6970
emnm.bat
71+
empath-split.bat
7072
emprofile.bat
7173
emranlib.bat
7274
emrun.bat

test/test_other.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
emmake = shared.bat_suffix(path_from_root('emmake'))
5656
emconfig = shared.bat_suffix(path_from_root('em-config'))
5757
emsize = shared.bat_suffix(path_from_root('emsize'))
58+
empath_split = shared.bat_suffix(path_from_root('empath-split'))
5859
emprofile = shared.bat_suffix(path_from_root('emprofile'))
5960
emstrip = shared.bat_suffix(path_from_root('emstrip'))
6061
emsymbolizer = shared.bat_suffix(path_from_root('emsymbolizer'))
@@ -15596,3 +15597,50 @@ def test_create_preloaded_file(self):
1559615597
return 0;
1559715598
}''')
1559815599
self.do_runf('main.c', 'done\n', cflags=['-sFORCE_FILESYSTEM', '--post-js=post.js'])
15600+
15601+
def test_empath_split(self):
  # Build a two-file C++ program with a source map and a name section, split it
  # with empath-split, and check which split module each function ended up in.
  create_file('main.cpp', r'''
    #include <iostream>
    void foo();
    int main() {
      std::cout << "main" << std::endl;
      foo();
      return 0;
    }
  ''')
  create_file('foo.cpp', r'''
    #include <iostream>
    void foo() { std::cout << "foo" << std::endl; }
  ''')
  # The assertions below expect the N-th path listed here (counting from 0) to
  # produce the split module test_N.wasm.
  create_file('path_list', r'''
    main.cpp
    foo.cpp
    /emsdk/emscripten/system
    /emsdk/emscripten/system/lib/libc/musl
    /emsdk/emscripten/system/lib/libcxx
  ''')

  build_cmd = [EMCC, 'main.cpp', 'foo.cpp', '-gsource-map', '-g2', '-o', 'test.js']
  split_cmd = [empath_split, 'test.wasm', 'path_list', '-g', '-o', 'test_primary.wasm', '--out-prefix=test_']
  self.run_process(build_cmd)
  self.run_process(split_cmd)

  def has_defined_function(file, func):
    # Disassemble `file` and search the text for a `(func $<func>` definition.
    # `func` is spliced into the regex as-is, so callers pass it pre-escaped.
    self.run_process([common.WASM_DIS, file, '-o', 'test.wast'])
    definition = re.compile(r'^\s*\(\s*func\s+\$' + func + r'[\s\(\)]', flags=re.MULTILINE)
    with open('test.wast') as wast:
      return definition.search(wast.read()) is not None

  # Check if functions are correctly assigned and split with the specified
  # paths. When one path contains another, the inner path should take its
  # functions first, and the rest is split with the outer path.
  expected_definitions = [
    # main.cpp
    ('test_0.wasm', '__original_main'),
    # foo.cpp
    ('test_1.wasm', r'foo\\28\\29'),
    # /emsdk/emscripten/system
    ('test_2.wasm', '__abort_message'),
    ('test_2.wasm', 'pthread_cond_wait'),
    # /emsdk/emscripten/system/lib/libc/musl
    ('test_3.wasm', 'strcmp'),
    # /emsdk/emscripten/system/lib/libcxx
    ('test_4.wasm', r'std::__2::ios_base::getloc\\28\\29\\20const'),
    ('test_4.wasm', r'std::uncaught_exceptions\\28\\29'),
  ]
  for split_module, func_pattern in expected_definitions:
    self.assertTrue(has_defined_function(split_module, func_pattern))

tools/empath-split.py

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2025 The Emscripten Authors. All rights reserved.
3+
# Emscripten is available under two separate licenses, the MIT license and the
4+
# University of Illinois/NCSA Open Source License. Both these licenses can be
5+
# found in the LICENSE file.
6+
7+
"""
8+
Wrapper for 'wasm-split --multi-split' functionality. This script generates a
9+
.manifest file based on the list of user source paths, using source map
10+
information.
11+
12+
This assumes the name section exists in the input wasm file, and also assumes
13+
the sourceMappingURL section exists in the input or a source map file is
14+
separately supplied with --sourcemap. If we have two files a.c and b.c, to
15+
generate a source map and the name section, if you compile and link within a
16+
single command, you can do something like
17+
$ emcc -g2 -gsource-map a.c b.c -o result.js
18+
If you want to compile and link in separate commands, you can do
19+
$ emcc -gsource-map a.c -o a.o
20+
$ emcc -gsource-map b.c -o b.o
21+
$ emcc -g2 -gsource-map a.o b.o -o result.js
22+
See https://emscripten.org/docs/porting/Debugging.html for more details.
23+
24+
This takes a wasm file and a paths file, which is a text file containing a list
25+
of paths as inputs. The paths file should contain a single path per line. A
26+
single split module will be generated per specified path. If a specified path
27+
contains another specified path, functions contained in the inner path will be
28+
split as the inner path's module, and the rest of the functions will be split as
29+
the outer path's module. Functions that do not belong to any of the specified
30+
paths will remain in the primary module.
31+
"""
32+
33+
import argparse
34+
import os
35+
import sys
36+
import tempfile
37+
from pathlib import PurePath
38+
39+
__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
40+
__rootdir__ = os.path.dirname(__scriptdir__)
41+
sys.path.insert(0, __rootdir__)
42+
43+
from tools import building
44+
from tools import diagnostics
45+
from tools import emsymbolizer
46+
from tools import shared
47+
from tools import utils
48+
from tools import webassembly
49+
from tools.utils import exit_with_error
50+
51+
52+
def parse_args():
  """Parse this script's options and collect the ones meant for wasm-split.

  The command line is read from sys.argv. Options this script does not
  recognize are not an error; they are returned separately so the caller can
  pass them to wasm-split.

  Returns:
    An (args, forwarded_args) tuple. `args` is the argparse namespace for this
    script's own options, with `verbose` forced on when `preserve_manifest` is
    set. `forwarded_args` is the list of unrecognized arguments.
  """
  parser = argparse.ArgumentParser(
    description='Split a wasm file based on user paths',
    epilog="""
This is a wrapper for 'wasm-split --multi-split' functionality, so you should
add wasm-split's command line options as well. You should or may want to add
wasm-split options like -o (--output), --out-prefix, -g, and feature
enabling/disabling options. Run 'wasm-split -h' for the list of options. But you
should NOT add --manifest, because this will be generated from this script.
""")
  parser.add_argument('wasm', help='Path to the input wasm file')
  parser.add_argument('paths_file', help='Path to the input file containing paths')
  parser.add_argument('-s', '--sourcemap', help='Force source map file')
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='Print verbose info for debugging this script')
  parser.add_argument('--wasm-split', help='Path to wasm-split executable')
  parser.add_argument('--preserve-manifest', action='store_true',
                      help='Preserve generated manifest file. This sets --verbose too.')
  args, forwarded_args = parser.parse_known_args()

  # The manifest is generated by this script. Reject both spellings of the
  # option: the bare '--manifest FILE' and the joined '--manifest=FILE'.
  if any(arg == '--manifest' or arg.startswith('--manifest=') for arg in forwarded_args):
    exit_with_error('manifest file will be generated by this script and should not be given')
  if args.preserve_manifest:
    args.verbose = True
  return args, forwarded_args
76+
77+
78+
def get_path_to_functions_map(wasm, sourcemap, paths):
  """Assign the functions of a wasm module to the user-specified paths.

  Each function is attributed to a source file through the source map, and
  each source file is then matched against `paths`. When one path contains
  another, the inner path claims its functions first and the outer path gets
  what is left.

  Args:
    wasm: Path to the input wasm file. It must have a name section.
    sourcemap: Source map file to use. When falsy, the location recorded in the
      wasm file's sourceMappingURL section is used instead, and that section
      must exist.
    paths: List of path strings to split by.

  Returns:
    A dict with one entry per element of `paths`, mapping the path to the list
    of names of the functions assigned to it (possibly empty). Functions that
    match no path appear in no list.
  """
  def is_synthesized_func(func):
    # Functions for which missing source info is expected, so no warning is
    # emitted for them.
    # TODO There can be more
    synthesized_names = [
      'main',
      '__wasm_call_ctors',
      '__clang_call_terminate',
    ]
    synthesized_prefixes = [
      'legalstub$',
      'legalfunc$',
      '__cxx_global_',
      '_GLOBAL__',
      'virtual thunk to ',
    ]
    if func in synthesized_names:
      return True
    return func.startswith(tuple(synthesized_prefixes))

  # Compute {func_name: src file} map, and invert it to get
  # {src file: list of functions} map, and construct {path: list of functions}
  # map from it
  func_to_src = {}
  with webassembly.Module(wasm) as module:
    if not module.has_name_section():
      exit_with_error('Name section does not exist')
    if not sourcemap:
      if not emsymbolizer.get_sourceMappingURL_section(module):
        exit_with_error('sourceMappingURL section does not exist')

    funcs = module.get_functions()
    func_names = module.get_function_names()
    assert len(funcs) == len(func_names)

    if not sourcemap:
      sourcemap = module.get_sourceMappingURL()
    sm = emsymbolizer.WasmSourceMap()
    sm.parse(sourcemap)

    for func_name, func in zip(func_names, funcs):
      # From the last address, decrement the address by 1 until we find location
      # info with source file information. The reason we do this is to reduce
      # the probability of picking an address where another function is inlined
      # into, picking the inlined function's source.
      # We start from the end because it is simpler; it is harder to compute the
      # first instruction's address, because there is a gap for local types
      # between function offset and the first instruction.
      #
      # Reset 'loc' for every function: the loop below does not run at all when
      # func.size <= 1, and a value left over from the previous function would
      # attribute this function to the wrong source file.
      loc = None
      addr = func.offset + func.size - 1
      while addr > func.offset:
        loc = sm.lookup(addr, func.offset)
        # No 'loc' means there is no source map mappings for the entire function
        # (because we give func.offset as a lower bound). A 'loc' with source
        # file information is what we are looking for. Stop in both cases;
        # otherwise continue the search.
        if not loc or loc.source:
          break
        addr -= 1

      if loc and loc.source:
        func_to_src[func_name] = utils.normalize_path(loc.source)
      elif not is_synthesized_func(func_name):
        diagnostics.warn(f"No source file information found in the source map for function '{func_name}'")

  src_to_funcs = {}
  for func_name, src in func_to_src.items():
    src_to_funcs.setdefault(src, []).append(func_name)
  # Parse each source path once instead of once per user path.
  parsed_srcs = [(PurePath(src), src_funcs) for src, src_funcs in src_to_funcs.items()]

  # Visit paths in the reverse sorting order, so that we can process inner paths
  # first.
  # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign
  # functions contained in /a/b/c to it first and assign the remaining functions
  # to /a/b.
  visited_funcs = set()
  path_to_funcs = {}
  for path in sorted(paths, reverse=True):
    ppath = PurePath(path)
    path_to_funcs[path] = []
    for psrc, src_funcs in parsed_srcs:
      # A path matches a source file if it is the file itself or one of its
      # ancestor directories.
      if ppath == psrc or ppath in psrc.parents:
        for func in src_funcs:
          if func not in visited_funcs:
            visited_funcs.add(func)
            path_to_funcs[path].append(func)
  return path_to_funcs
169+
170+
171+
def main():
  """Generate a wasm-split manifest from the paths file and run wasm-split.

  Reads the list of paths, computes which functions belong to each path, and
  writes them to a temporary .manifest file with one module per path. Modules
  are named by the index of the path in the de-duplicated list. It then runs
  'wasm-split --multi-split' with that manifest and the forwarded options.

  The manifest file is removed afterwards unless --preserve-manifest was given.
  """
  args, forwarded_args = parse_args()
  if args.wasm_split:
    wasm_split = args.wasm_split
  else:
    wasm_split = os.path.join(building.get_binaryen_bin(), 'wasm-split')

  # Every input we are going to read or execute must be an existing file.
  required_files = [args.wasm, args.paths_file]
  if args.sourcemap:
    required_files.append(args.sourcemap)
  required_files.append(wasm_split)
  for required in required_files:
    if not os.path.isfile(required):
      exit_with_error(f"'{required}' was not found or not a file")

  paths = utils.read_file(args.paths_file).splitlines()
  paths = [utils.normalize_path(path.strip()) for path in paths if path.strip()]
  # To make /a/b/c and /a/b/c/ equivalent
  paths = [path.rstrip(os.sep) for path in paths]
  # Remove duplicates
  paths = list(dict.fromkeys(paths))

  # Compute {path: list of functions} map
  path_to_funcs = get_path_to_functions_map(args.wasm, args.sourcemap, paths)

  # Write .manifest file. It is created with delete=False and removed
  # explicitly below, so that (1) it can be closed before wasm-split opens it,
  # which matters on platforms where an open temporary file cannot be opened a
  # second time, and (2) --preserve-manifest really keeps it.
  manifest_file = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False)
  manifest = manifest_file.name
  try:
    with manifest_file as f:
      for i, path in enumerate(paths):
        funcs = path_to_funcs[path]
        f.write(f'{i}\n')
        if not funcs:
          diagnostics.warn(f'{path} does not match any functions')
        if args.verbose:
          print(path)
          for func in funcs:
            print(' ' + func)
          print()
        for func in funcs:
          f.write(func + '\n')
        # Modules are separated by a blank line.
        if i < len(paths) - 1:
          f.write('\n')

    cmd = [wasm_split, '--multi-split', args.wasm, '--manifest', manifest]
    if args.verbose:
      # This option is used both in this script and wasm-split
      cmd.append('-v')
    cmd += forwarded_args
    if args.verbose:
      print('\n' + ' '.join(cmd))
    shared.run_process(cmd)
  finally:
    if not args.preserve_manifest:
      os.remove(manifest)
224+
225+
226+
if __name__ == '__main__':
227+
sys.exit(main())

tools/emsymbolizer.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def decodeVLQ(string):
178178
self.offsets.append(offset)
179179
self.offsets.sort()
180180

181-
def find_offset(self, offset):
181+
def find_offset(self, offset, lower_bound=None):
182182
# Find the largest mapped offset <= the search offset
183183
lo = 0
184184
hi = len(self.offsets)
@@ -189,11 +189,22 @@ def find_offset(self, offset):
189189
hi = mid
190190
else:
191191
lo = mid + 1
192-
return self.offsets[lo - 1]
192+
if lo == 0:
193+
return None
194+
# If lower bound is given, return the offset only if the offset is equal to
195+
# or greater than the lower bound
196+
if lower_bound:
197+
if self.offsets[lo - 1] >= lower_bound:
198+
return self.offsets[lo - 1]
199+
else:
200+
return None
201+
else:
202+
return self.offsets[lo - 1]
193203

194-
def lookup(self, offset):
195-
nearest = self.find_offset(offset)
196-
assert nearest in self.mappings, 'Sourcemap has an offset with no mapping'
204+
def lookup(self, offset, lower_bound=None):
205+
nearest = self.find_offset(offset, lower_bound)
206+
if not nearest:
207+
return None
197208
info = self.mappings[nearest]
198209
return LocationInfo(
199210
self.sources[info.source] if info.source is not None else None,
@@ -206,12 +217,8 @@ def symbolize_address_sourcemap(module, address, force_file):
206217
URL = force_file
207218
if not URL:
208219
# If a sourcemap file is not forced, read it from the wasm module
209-
section = get_sourceMappingURL_section(module)
210-
assert section
211-
module.seek(section.offset)
212-
assert module.read_string() == 'sourceMappingURL'
213220
# TODO: support stripping/replacing a prefix from the URL
214-
URL = module.read_string()
221+
URL = module.get_sourceMappingURL()
215222

216223
if shared.DEBUG:
217224
print(f'Source Mapping URL: {URL}')

tools/maint/create_entry_points.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
emstrip
4444
emsymbolizer
4545
emscan-deps
46+
empath-split
4647
tools/file_packager
4748
tools/webidl_binder
4849
test/runner
@@ -56,6 +57,7 @@
5657
'emdwp': 'tools/emdwp',
5758
'emnm': 'tools/emnm',
5859
'emsymbolizer': 'tools/emsymbolizer',
60+
'empath-split': 'tools/empath-split',
5961
}
6062

6163

0 commit comments

Comments
 (0)