Skip to content

Commit 8c7c46f

Browse files
committed
LibJS: Inline asm interpreter JS Call fast path
Handle inline-eligible JS-to-JS Call directly in asmint.asm instead of routing the whole operation through AsmInterpreter.cpp. The asm handler now validates the callee, binds `this` for the non-allocating cases, reserves the callee InterpreterStack frame, populates the ExecutionContext header and Value tail, and enters the callee bytecode at pc 0. Keep the cases that need NewFunctionEnvironment() or sloppy `this` boxing on a narrow helper that still builds an inline frame. This preserves the existing inline-call semantics for promise-job ordering, receiver binding, and sloppy global-this handling while keeping the common path in assembly. Add regression coverage for closure-capturing callees, sloppy primitive receivers, and sloppy undefined receivers.
1 parent 7a01a64 commit 8c7c46f

File tree

4 files changed

+311
-31
lines changed

4 files changed

+311
-31
lines changed

Libraries/LibJS/Bytecode/AsmInterpreter/AsmInterpreter.cpp

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -852,35 +852,37 @@ i64 asm_try_put_by_value_holey_array(VM* vm, u32 pc)
852852
return 0;
853853
}
854854

855-
// Try to inline a JS-to-JS call. Returns 0 on success (callee frame pushed),
856-
// 1 on failure (caller should fall through to slow path).
855+
// Try to inline a JS-to-JS call by building the callee frame through the
856+
// shared VM::push_inline_frame() helper. Returns 0 on success (callee frame
857+
// pushed) and 1 on failure (caller should keep handling the Call itself).
857858
i64 asm_try_inline_call(VM* vm, u32 pc)
858859
{
859860
// Decode the Call instruction located at offset `pc` in the currently
// running executable's bytecode.
auto* bytecode = vm->current_executable().bytecode.data();
860861
auto& insn = *reinterpret_cast<Op::Call const*>(&bytecode[pc]);
862+
861863
// Each guard below returns 1 (not inlined) unless the callee is an object,
// is an ECMAScriptFunctionObject, and reports can_inline_call().
auto callee = vm->get(insn.callee());
862-
if (!callee.is_object())
864+
if (!callee.is_object()) [[unlikely]]
863865
return 1;
866+
864867
auto& callee_object = callee.as_object();
865-
if (!is<ECMAScriptFunctionObject>(callee_object))
868+
if (!is<ECMAScriptFunctionObject>(callee_object)) [[unlikely]]
866869
return 1;
870+
867871
auto& callee_function = static_cast<ECMAScriptFunctionObject&>(callee_object);
868-
if (!callee_function.can_inline_call())
872+
if (!callee_function.can_inline_call()) [[unlikely]]
869873
return 1;
870874

871-
auto& callee_executable = callee_function.inline_call_executable();
872-
873-
u32 return_pc = pc + insn.length();
874-
875875
// Build the callee frame. The caller resumes at pc + insn.length() (the
// instruction after this Call), the result is written to insn.dst(), and
// `this` is read from the caller's this_value operand.
// NOTE(review): the meaning of the trailing `nullptr` and `false` arguments
// is defined by VM::push_inline_frame(), which is not visible here.
auto* callee_context = vm->push_inline_frame(
876-
callee_function, callee_executable,
877-
insn.arguments(), return_pc, insn.dst().raw(),
878-
vm->get(insn.this_value()), nullptr, false);
879-
880-
if (!callee_context) [[unlikely]]
881-
return 1;
882-
883-
return 0;
876+
callee_function,
877+
callee_function.inline_call_executable(),
878+
insn.arguments(),
879+
pc + insn.length(),
880+
insn.dst().raw(),
881+
vm->get(insn.this_value()),
882+
nullptr,
883+
false);
884+
885+
// A null frame pointer is reported as failure (1); otherwise 0 tells the
// asm caller that the callee frame is now in place.
return callee_context ? 0 : 1;
884886
}
885887

886888
// Fast cache-only PutById. Tries all cache entries for ChangeOwnProperty and

Libraries/LibJS/Bytecode/AsmInterpreter/asmint.asm

Lines changed: 226 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# dispatch = dispatch table base pointer (256 entries, 8 bytes each)
1313
#
1414
# Temporary registers (caller-saved, clobbered by C++ calls):
15-
# t0-t9 = general-purpose scratch
15+
# t0-t8 = general-purpose scratch
1616
# ft0-ft3 = floating-point scratch (scalar double)
1717
#
1818
# NaN-boxing encoding:
@@ -448,16 +448,6 @@ macro walk_env_chain(m_cache_field, fail_label)
448448
branch_nonzero t0, fail_label
449449
end
450450

451-
# Reload pb and values from the current running execution context.
452-
# Used after inline call/return to switch to the new frame's bytecode.
453-
# Clobbers t0, t1.
454-
macro reload_state_from_exec_ctx()
455-
reload_exec_ctx
456-
load64 t1, [exec_ctx, EXECUTION_CONTEXT_EXECUTABLE]
457-
load64 pb, [t1, EXECUTABLE_BYTECODE_DATA]
458-
lea values, [exec_ctx, SIZEOF_EXECUTION_CONTEXT]
459-
end
460-
461451
# Pop an inline frame and resume the caller without bouncing through C++.
462452
# The asm-managed JS-to-JS call fast path currently only inlines Call, never
463453
# CallConstruct, so caller_is_construct is always false for asm-managed inline
@@ -2008,11 +1998,233 @@ handler SetGlobal
20081998
end
20091999

20102000
handler Call
2011-
# Try to inline the call
2001+
# Inline JS-to-JS Call in asm when the callee is an ECMAScriptFunctionObject
2002+
# with bytecode ready. Cases that need function-environment allocation or
2003+
# sloppy primitive this boxing bounce through a C++ helper that still
2004+
# prepares an inline frame instead of taking the full slow path.
2005+
#
2006+
# High-level flow:
2007+
# 1. Validate the callee and load its shared function metadata.
2008+
# 2. Bind `this` inline when we can do so without allocations.
2009+
# 3. Reserve an InterpreterStack frame and populate ExecutionContext.
2010+
# 4. Materialize [registers | locals | constants | arguments].
2011+
# 5. Swap VM state over to the callee frame and dispatch at pc = 0.
2012+
#
2013+
# Register usage within this handler:
2014+
# t3 = callee ECMAScriptFunctionObject*
2015+
# t2 = SharedFunctionInstanceData* / later callee ExecutionContext*
2016+
# t8 = boxed `this` value carried into the callee
2017+
load_operand t0, m_callee
2018+
extract_tag t1, t0
2019+
branch_ne t1, OBJECT_TAG, .call_slow
2020+
unbox_object t0, t0
2021+
mov t3, t0
2022+
2023+
# Reject everything except ordinary ECMAScript function objects with
2024+
# already-prepared bytecode and cached inline-call eligibility.
2025+
load8 t1, [t3, OBJECT_FLAGS]
2026+
branch_bits_clear t1, OBJECT_FLAG_IS_ECMASCRIPT_FUNCTION_OBJECT, .call_slow
2027+
2028+
load64 t2, [t3, ECMASCRIPT_FUNCTION_OBJECT_SHARED_DATA]
2029+
load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_CAN_INLINE_CALL]
2030+
branch_zero t1, .call_slow
2031+
# NewFunctionEnvironment() allocates and has to stay out of the pure asm
2032+
# path, but we still preserve inline-call semantics via .call_interp_inline.
2033+
load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_FUNCTION_ENVIRONMENT_NEEDED]
2034+
branch_nonzero t1, .call_interp_inline
2035+
2036+
# Bind this without allocations. Sloppy primitive this-values still need
2037+
# ToObject(), so they use the C++ inline-frame helper.
2038+
#
2039+
# t8 starts as "empty" to match the normal interpreter behavior for
2040+
# callees that never observe `this`.
2041+
mov t8, EMPTY_TAG_SHIFTED
2042+
load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_USES_THIS]
2043+
branch_zero t1, .this_ready
2044+
load_operand t8, m_this_value
2045+
load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_STRICT]
2046+
branch_nonzero t1, .this_ready
2047+
2048+
# Sloppy null/undefined binds the callee realm's global object.
2049+
# Sloppy primitive receivers need ToObject(), which may allocate wrappers,
2050+
# so they go through the helper instead of the full Call slow path.
2051+
extract_tag t1, t8
2052+
mov t0, t1
2053+
and t0, 0xFFFE
2054+
branch_eq t0, UNDEFINED_TAG, .sloppy_global_this
2055+
branch_eq t1, OBJECT_TAG, .this_ready
2056+
jmp .call_interp_inline
2057+
2058+
.sloppy_global_this:
2059+
load64 t1, [t3, OBJECT_SHAPE]
2060+
load64 t1, [t1, SHAPE_REALM]
2061+
load64 t1, [t1, REALM_GLOBAL_ENVIRONMENT]
2062+
load64 t1, [t1, GLOBAL_ENVIRONMENT_GLOBAL_THIS_VALUE]
2063+
# Match Value(Object*): keep only the low 48 pointer bits before boxing.
2064+
shl t1, 16
2065+
shr t1, 16
2066+
mov t8, OBJECT_TAG_SHIFTED
2067+
or t8, t1
2068+
2069+
.this_ready:
2070+
# The pure asm path only runs once bytecode is already compiled.
2071+
load64 t0, [t2, SHARED_FUNCTION_INSTANCE_DATA_EXECUTABLE]
2072+
branch_zero t0, .call_slow
2073+
2074+
load32 t7, [pb, pc, m_argument_count]
2075+
load32 t4, [t2, SHARED_FUNCTION_INSTANCE_DATA_FORMAL_PARAMETER_COUNT]
2076+
branch_ge_unsigned t4, t7, .arg_count_ready
2077+
mov t4, t7
2078+
.arg_count_ready:
2079+
load32 t5, [t0, EXECUTABLE_REGISTERS_AND_LOCALS_COUNT]
2080+
load64 t6, [t0, EXECUTABLE_CONSTANTS_SIZE]
2081+
2082+
# Inline InterpreterStack::allocate().
2083+
# After this sequence: t1 = total Value slots, t2 = new stack top, and
# t6 = the old stack top, which becomes the base of the callee frame.
2084+
mov t1, t5
2085+
add t1, t6
2086+
add t1, t4
2087+
mov t2, t1
2088+
shl t2, 3
2089+
add t2, SIZEOF_EXECUTION_CONTEXT
2090+
2091+
load_vm t0
2092+
lea t0, [t0, VM_INTERPRETER_STACK]
2093+
load64 t6, [t0, INTERPRETER_STACK_TOP]
2094+
add t2, t6
2095+
load64 t0, [t0, INTERPRETER_STACK_LIMIT]
2096+
branch_ge_unsigned t0, t2, .stack_ok
2097+
jmp .call_slow
2098+
2099+
.stack_ok:
2100+
load_vm t0
2101+
store64 [t0, VM_INTERPRETER_STACK_TOP], t2
2102+
2103+
# Set up the callee ExecutionContext header exactly the way
2104+
# VM::push_inline_frame() / run_executable() would see it.
2105+
store64 [t6, EXECUTION_CONTEXT_FUNCTION], t3
2106+
load64 t0, [t3, OBJECT_SHAPE]
2107+
load64 t0, [t0, SHAPE_REALM]
2108+
store64 [t6, EXECUTION_CONTEXT_REALM], t0
2109+
2110+
load64 t0, [t3, ECMASCRIPT_FUNCTION_OBJECT_ENVIRONMENT]
2111+
store64 [t6, EXECUTION_CONTEXT_LEXICAL_ENVIRONMENT], t0
2112+
store64 [t6, EXECUTION_CONTEXT_VARIABLE_ENVIRONMENT], t0
2113+
load64 t0, [t3, ECMASCRIPT_FUNCTION_OBJECT_PRIVATE_ENVIRONMENT]
2114+
store64 [t6, EXECUTION_CONTEXT_PRIVATE_ENVIRONMENT], t0
2115+
load64 t0, [t3, ECMASCRIPT_FUNCTION_OBJECT_SHARED_DATA]
2116+
load64 t0, [t0, SHARED_FUNCTION_INSTANCE_DATA_EXECUTABLE]
2117+
store64 [t6, EXECUTION_CONTEXT_EXECUTABLE], t0
2118+
2119+
# ScriptOrModule is a two-word Variant in ExecutionContext, so copy both
2120+
# machine words explicitly.
2121+
lea t0, [t6, EXECUTION_CONTEXT_SCRIPT_OR_MODULE]
2122+
lea t2, [t3, ECMASCRIPT_FUNCTION_OBJECT_SCRIPT_OR_MODULE]
2123+
load64 t3, [t2, 0]
2124+
store64 [t0, 0], t3
2125+
load64 t3, [t2, 8]
2126+
store64 [t0, 8], t3
2127+
2128+
store32 [t6, EXECUTION_CONTEXT_PROGRAM_COUNTER], 0
2129+
store32 [t6, EXECUTION_CONTEXT_SKIP_WHEN_DETERMINING_INCUMBENT_COUNTER], 0
2130+
mov t0, EXECUTION_CONTEXT_NO_YIELD_CONTINUATION
2131+
store32 [t6, EXECUTION_CONTEXT_YIELD_CONTINUATION], t0
2132+
store8 [t6, EXECUTION_CONTEXT_YIELD_IS_AWAIT], 0
2133+
store8 [t6, EXECUTION_CONTEXT_CALLER_IS_CONSTRUCT], 0
2134+
store64 [t6, EXECUTION_CONTEXT_THIS_VALUE], t8
2135+
store64 [t6, EXECUTION_CONTEXT_CALLER_FRAME], exec_ctx
2136+
store32 [t6, EXECUTION_CONTEXT_REGISTERS_AND_CONSTANTS_AND_LOCALS_AND_ARGUMENTS_COUNT], t1
2137+
store32 [t6, EXECUTION_CONTEXT_ARGUMENT_COUNT], t4
2138+
store32 [t6, EXECUTION_CONTEXT_PASSED_ARGUMENT_COUNT], t7
2139+
load32 t0, [pb, pc, m_length]
2140+
lea t2, [pb, pc]
2141+
sub t2, pb
2142+
add t0, t2
2143+
store32 [t6, EXECUTION_CONTEXT_CALLER_RETURN_PC], t0
2144+
load32 t0, [pb, pc, m_dst]
2145+
store32 [t6, EXECUTION_CONTEXT_CALLER_DST_RAW], t0
2146+
2147+
# values = [registers | locals | constants | arguments]
2148+
# Keep t2 at the ExecutionContext base while t6 walks the Value tail.
2149+
mov t2, t6
2150+
lea t6, [t6, SIZEOF_EXECUTION_CONTEXT]
2151+
mov t0, EMPTY_TAG_SHIFTED
2152+
xor t3, t3
2153+
.clear_registers_and_locals:
2154+
branch_ge_unsigned t3, t5, .copy_constants
2155+
store64 [t6, t3, 8], t0
2156+
add t3, 1
2157+
jmp .clear_registers_and_locals
2158+
2159+
.copy_constants:
2160+
load64 t0, [t2, EXECUTION_CONTEXT_EXECUTABLE]
2161+
load64 t3, [t0, EXECUTABLE_CONSTANTS_SIZE]
2162+
load64 t0, [t0, EXECUTABLE_CONSTANTS_DATA]
2163+
mov t1, t5
2164+
xor t8, t8
2165+
.copy_constants_loop:
2166+
branch_ge_unsigned t8, t3, .copy_arguments
2167+
load64 t7, [t0, t8, 8]
2168+
store64 [t6, t1, 8], t7
2169+
add t8, 1
2170+
add t1, 1
2171+
jmp .copy_constants_loop
2172+
2173+
.copy_arguments:
2174+
load32 t7, [t2, EXECUTION_CONTEXT_PASSED_ARGUMENT_COUNT]
2175+
mov t1, t5
2176+
add t1, t3
2177+
lea t0, [exec_ctx, SIZEOF_EXECUTION_CONTEXT]
2178+
lea t8, [pb, pc]
2179+
add t8, m_expression_string
2180+
add t8, 4
2181+
xor t3, t3
2182+
.copy_arguments_loop:
2183+
# The operand array in the bytecode stores caller register indices.
2184+
branch_ge_unsigned t3, t7, .fill_missing_arguments
2185+
load32 t5, [t8, t3, 4]
2186+
load64 t5, [t0, t5, 8]
2187+
store64 [t6, t1, 8], t5
2188+
add t3, 1
2189+
add t1, 1
2190+
jmp .copy_arguments_loop
2191+
2192+
.fill_missing_arguments:
2193+
mov t3, t1
2194+
add t3, t4
2195+
sub t3, t7
2196+
mov t0, UNDEFINED_SHIFTED
2197+
.fill_missing_arguments_loop:
2198+
branch_ge_unsigned t1, t3, .enter_callee
2199+
store64 [t6, t1, 8], t0
2200+
add t1, 1
2201+
jmp .fill_missing_arguments_loop
2202+
2203+
.enter_callee:
2204+
# Mirror the normal interpreter entry sequence: cache `this` in the
2205+
# dedicated register slot, then reload pb/values/exec_ctx for the callee.
2206+
load64 t0, [t2, EXECUTION_CONTEXT_THIS_VALUE]
2207+
store64 [t6, THIS_VALUE_REG_OFFSET], t0
2208+
2209+
load64 t0, [t2, EXECUTION_CONTEXT_EXECUTABLE]
2210+
load64 pb, [t0, EXECUTABLE_BYTECODE_DATA]
2211+
load_vm t0
2212+
store64 [t0, VM_RUNNING_EXECUTION_CONTEXT], t2
2213+
mov exec_ctx, t2
2214+
lea values, [exec_ctx, SIZEOF_EXECUTION_CONTEXT]
2215+
xor pc, pc
2216+
dispatch_current
2217+
.call_interp_inline:
2218+
# Shared escape hatch for the cases that need C++ help to build the inline
2219+
# frame correctly but must not take the full Call slow path, since that
2220+
# would insert a run_executable() boundary and observable microtask drain.
20122221
call_interp asm_try_inline_call
20132222
branch_nonzero t0, .call_slow
2014-
# Success: reload pb/values from new execution context, pc=0
2015-
reload_state_from_exec_ctx
2223+
load_vm t0
2224+
load64 exec_ctx, [t0, VM_RUNNING_EXECUTION_CONTEXT]
2225+
lea values, [exec_ctx, SIZEOF_EXECUTION_CONTEXT]
2226+
load64 t0, [exec_ctx, EXECUTION_CONTEXT_EXECUTABLE]
2227+
load64 pb, [t0, EXECUTABLE_BYTECODE_DATA]
20162228
xor pc, pc
20172229
dispatch_current
20182230
.call_slow:

Libraries/LibJS/Bytecode/AsmInterpreter/gen_asm_offsets.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,15 @@ int main()
134134

135135
// ExecutionContext layout
136136
outln("\n# ExecutionContext layout");
137+
VERIFY(alignof(ExecutionContext) == sizeof(Value));
137138
EMIT_OFFSET(EXECUTION_CONTEXT_EXECUTABLE, ExecutionContext, executable);
138139
EMIT_OFFSET(EXECUTION_CONTEXT_FUNCTION, ExecutionContext, function);
139140
EMIT_OFFSET(EXECUTION_CONTEXT_REALM, ExecutionContext, realm);
140141
EMIT_OFFSET(EXECUTION_CONTEXT_SCRIPT_OR_MODULE, ExecutionContext, script_or_module);
141142
EMIT_OFFSET(EXECUTION_CONTEXT_LEXICAL_ENVIRONMENT, ExecutionContext, lexical_environment);
142143
EMIT_OFFSET(EXECUTION_CONTEXT_VARIABLE_ENVIRONMENT, ExecutionContext, variable_environment);
143144
EMIT_OFFSET(EXECUTION_CONTEXT_PRIVATE_ENVIRONMENT, ExecutionContext, private_environment);
145+
EMIT_OFFSET(EXECUTION_CONTEXT_SKIP_WHEN_DETERMINING_INCUMBENT_COUNTER, ExecutionContext, skip_when_determining_incumbent_counter);
144146
EMIT_OFFSET(EXECUTION_CONTEXT_YIELD_CONTINUATION, ExecutionContext, yield_continuation);
145147
EMIT_OFFSET(EXECUTION_CONTEXT_YIELD_IS_AWAIT, ExecutionContext, yield_is_await);
146148
EMIT_OFFSET(EXECUTION_CONTEXT_CALLER_IS_CONSTRUCT, ExecutionContext, caller_is_construct);
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Copyright (c) 2026-present, the Ladybird developers.
3+
*
4+
* SPDX-License-Identifier: BSD-2-Clause
5+
*/
6+
7+
// Checks that a promise reaction queued before outer() runs is recorded only
// after both the callee ("inner") and the rest of the caller ("outer").
test("closure-capturing callees do not drain promise jobs before the caller resumes", () => {
8+
let order = [];
9+
10+
// inner() returns an arrow function that captures its local `x`.
// NOTE(review): presumably this capture is what makes the callee need a
// function environment - confirm against the bytecode generator.
function inner() {
11+
let x = 1;
12+
order.push("inner");
13+
return () => x;
14+
}
15+
16+
// Call inner() once up front, then discard what it logged.
// NOTE(review): presumably a warm-up so inner() already has compiled bytecode
// when outer() calls it - confirm.
inner();
17+
order.length = 0;
18+
19+
function outer() {
20+
inner();
21+
order.push("outer");
22+
}
23+
24+
// Queue the promise reaction before making the call under test.
Promise.resolve().then(() => order.push("micro"));
25+
26+
outer();
27+
expect(order).toEqual(["inner", "outer", "micro"]);
28+
});
29+
30+
// Same ordering check as the closure-capturing test, but the callee is a
// method invoked on the number primitive `1`.
test("sloppy primitive receivers do not drain promise jobs before the caller resumes", () => {
31+
let order = [];
32+
33+
// NOTE(review): this property is added to Number.prototype and never removed
// within the test; consider deleting it afterwards if test isolation matters.
Number.prototype.__asmInlineCallMicrotaskOrder = function () {
34+
// Bare reference to `this`.
// NOTE(review): presumably present so the callee is marked as using `this` - confirm.
this;
35+
order.push("inner");
36+
};
37+
38+
// Call the method once up front, then discard what it logged.
(1).__asmInlineCallMicrotaskOrder();
39+
order.length = 0;
40+
41+
function outer() {
42+
(1).__asmInlineCallMicrotaskOrder();
43+
order.push("outer");
44+
}
45+
46+
// Queue the promise reaction before making the call under test.
Promise.resolve().then(() => order.push("micro"));
47+
48+
outer();
49+
expect(order).toEqual(["inner", "outer", "micro"]);
50+
});
51+
52+
// Checks that a plain call `inner()` made from inside another function still
// observes `this === globalThis` in the callee.
test("sloppy undefined receivers still bind the global object in inline calls", () => {
53+
function inner() {
54+
return this;
55+
}
56+
57+
// Call inner() once before the call under test; the result is ignored.
inner();
58+
59+
function outer() {
60+
return inner();
61+
}
62+
63+
expect(outer()).toBe(globalThis);
64+
});

0 commit comments

Comments
 (0)