|
12 | 12 | # dispatch = dispatch table base pointer (256 entries, 8 bytes each) |
13 | 13 | # |
14 | 14 | # Temporary registers (caller-saved, clobbered by C++ calls): |
15 | | -# t0-t9 = general-purpose scratch |
| 15 | +# t0-t8 = general-purpose scratch |
16 | 16 | # ft0-ft3 = floating-point scratch (scalar double) |
17 | 17 | # |
18 | 18 | # NaN-boxing encoding: |
@@ -448,16 +448,6 @@ macro walk_env_chain(m_cache_field, fail_label) |
448 | 448 | branch_nonzero t0, fail_label |
449 | 449 | end |
450 | 450 |
|
451 | | -# Reload pb and values from the current running execution context. |
452 | | -# Used after inline call/return to switch to the new frame's bytecode. |
453 | | -# Clobbers t0, t1. |
454 | | -macro reload_state_from_exec_ctx() |
455 | | - reload_exec_ctx |
456 | | - load64 t1, [exec_ctx, EXECUTION_CONTEXT_EXECUTABLE] |
457 | | - load64 pb, [t1, EXECUTABLE_BYTECODE_DATA] |
458 | | - lea values, [exec_ctx, SIZEOF_EXECUTION_CONTEXT] |
459 | | -end |
460 | | - |
461 | 451 | # Pop an inline frame and resume the caller without bouncing through C++. |
462 | 452 | # The asm-managed JS-to-JS call fast path currently only inlines Call, never |
463 | 453 | # CallConstruct, so caller_is_construct is always false for asm-managed inline |
@@ -2008,11 +1998,233 @@ handler SetGlobal |
2008 | 1998 | end |
2009 | 1999 |
|
2010 | 2000 | handler Call |
2011 | | - # Try to inline the call |
| 2001 | + # Inline JS-to-JS Call in asm when the callee is an ECMAScriptFunctionObject |
| 2002 | + # with bytecode ready. Cases that need function-environment allocation or |
| 2003 | + # sloppy primitive this boxing bounce through a C++ helper that still |
| 2004 | + # prepares an inline frame instead of taking the full slow path. |
| 2005 | + # |
| 2006 | + # High-level flow: |
| 2007 | + # 1. Validate the callee and load its shared function metadata. |
| 2008 | + # 2. Bind `this` inline when we can do so without allocations. |
| 2009 | + # 3. Reserve an InterpreterStack frame and populate ExecutionContext. |
| 2010 | + # 4. Materialize [registers | locals | constants | arguments]. |
| 2011 | + # 5. Swap VM state over to the callee frame and dispatch at pc = 0. |
| 2012 | + # |
| 2013 | + # Register usage within this handler: |
| 2014 | + # t3 = callee ECMAScriptFunctionObject* |
| 2015 | + # t2 = SharedFunctionInstanceData* / later callee ExecutionContext* |
| 2016 | + #   t8 = boxed `this` value until it is stored into the frame; scratch after |
| 2017 | + load_operand t0, m_callee |
| 2018 | + extract_tag t1, t0 |
| 2019 | + branch_ne t1, OBJECT_TAG, .call_slow |
| 2020 | + unbox_object t0, t0 |
| 2021 | + mov t3, t0 |
| 2022 | + |
| 2023 | + # Reject everything except ordinary ECMAScript function objects with |
| 2024 | + # already-prepared bytecode and cached inline-call eligibility. |
| 2025 | + load8 t1, [t3, OBJECT_FLAGS] |
| 2026 | + branch_bits_clear t1, OBJECT_FLAG_IS_ECMASCRIPT_FUNCTION_OBJECT, .call_slow |
| 2027 | + |
| 2028 | + load64 t2, [t3, ECMASCRIPT_FUNCTION_OBJECT_SHARED_DATA] |
| 2029 | + load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_CAN_INLINE_CALL] |
| 2030 | + branch_zero t1, .call_slow |
| 2031 | + # NewFunctionEnvironment() allocates and has to stay out of the pure asm |
| 2032 | + # path, but we still preserve inline-call semantics via .call_interp_inline. |
| 2033 | + load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_FUNCTION_ENVIRONMENT_NEEDED] |
| 2034 | + branch_nonzero t1, .call_interp_inline |
| 2035 | + |
| 2036 | + # Bind this without allocations. Sloppy primitive this-values still need |
| 2037 | + # ToObject(), so they use the C++ inline-frame helper. |
| 2038 | + # |
| 2039 | + # t8 starts as "empty" to match the normal interpreter behavior for |
| 2040 | + # callees that never observe `this`. |
| 2041 | + mov t8, EMPTY_TAG_SHIFTED |
| 2042 | + load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_USES_THIS] |
| 2043 | + branch_zero t1, .this_ready |
| 2044 | + load_operand t8, m_this_value |
| 2045 | + load8 t1, [t2, SHARED_FUNCTION_INSTANCE_DATA_STRICT] |
| 2046 | + branch_nonzero t1, .this_ready |
| 2047 | + |
| 2048 | + # Sloppy null/undefined binds the callee realm's global object. |
| 2049 | + # Sloppy primitive receivers need ToObject(), which may allocate wrappers, |
| 2050 | + # so they go through the helper instead of the full Call slow path. |
| 2051 | + extract_tag t1, t8 |
| 2052 | + mov t0, t1 |
| 2053 | + and t0, 0xFFFE |
| 2054 | + branch_eq t0, UNDEFINED_TAG, .sloppy_global_this |
| 2055 | + branch_eq t1, OBJECT_TAG, .this_ready |
| 2056 | + jmp .call_interp_inline |
| 2057 | + |
| 2058 | +.sloppy_global_this: |
| 2059 | + load64 t1, [t3, OBJECT_SHAPE] |
| 2060 | + load64 t1, [t1, SHAPE_REALM] |
| 2061 | + load64 t1, [t1, REALM_GLOBAL_ENVIRONMENT] |
| 2062 | + load64 t1, [t1, GLOBAL_ENVIRONMENT_GLOBAL_THIS_VALUE] |
| 2063 | + # Match Value(Object*): keep only the low 48 pointer bits before boxing. |
| 2064 | + shl t1, 16 |
| 2065 | + shr t1, 16 |
| 2066 | + mov t8, OBJECT_TAG_SHIFTED |
| 2067 | + or t8, t1 |
| 2068 | + |
| 2069 | +.this_ready: |
| 2070 | + # The pure asm path only runs once bytecode is already compiled. |
| 2071 | + load64 t0, [t2, SHARED_FUNCTION_INSTANCE_DATA_EXECUTABLE] |
| 2072 | + branch_zero t0, .call_slow |
| 2073 | + |
| 2074 | + load32 t7, [pb, pc, m_argument_count] |
| 2075 | + load32 t4, [t2, SHARED_FUNCTION_INSTANCE_DATA_FORMAL_PARAMETER_COUNT] |
| 2076 | + branch_ge_unsigned t4, t7, .arg_count_ready |
| 2077 | + mov t4, t7 |
| 2078 | +.arg_count_ready: |
| 2079 | + load32 t5, [t0, EXECUTABLE_REGISTERS_AND_LOCALS_COUNT] |
| 2080 | + load64 t6, [t0, EXECUTABLE_CONSTANTS_SIZE] |
| 2081 | + |
| 2082 | + # Inline InterpreterStack::allocate(). |
| 2083 | + # t1 = total Value slots, t2 = frame byte size then new stack top; t6 holds |
| | + # the constants size here and later the old stack top (callee frame base). |
| 2084 | + mov t1, t5 |
| 2085 | + add t1, t6 |
| 2086 | + add t1, t4 |
| 2087 | + mov t2, t1 |
| 2088 | + shl t2, 3 |
| 2089 | + add t2, SIZEOF_EXECUTION_CONTEXT |
| 2090 | + |
| 2091 | + load_vm t0 |
| 2092 | + lea t0, [t0, VM_INTERPRETER_STACK] |
| 2093 | + load64 t6, [t0, INTERPRETER_STACK_TOP] |
| 2094 | + add t2, t6 |
| 2095 | + load64 t0, [t0, INTERPRETER_STACK_LIMIT] |
| 2096 | + branch_ge_unsigned t0, t2, .stack_ok |
| 2097 | + jmp .call_slow |
| 2098 | + |
| 2099 | +.stack_ok: |
| 2100 | + load_vm t0 |
| 2101 | + store64 [t0, VM_INTERPRETER_STACK_TOP], t2 |
| 2102 | + |
| 2103 | + # Set up the callee ExecutionContext header exactly the way |
| 2104 | + # VM::push_inline_frame() / run_executable() would see it. |
| 2105 | + store64 [t6, EXECUTION_CONTEXT_FUNCTION], t3 |
| 2106 | + load64 t0, [t3, OBJECT_SHAPE] |
| 2107 | + load64 t0, [t0, SHAPE_REALM] |
| 2108 | + store64 [t6, EXECUTION_CONTEXT_REALM], t0 |
| 2109 | + |
| 2110 | + load64 t0, [t3, ECMASCRIPT_FUNCTION_OBJECT_ENVIRONMENT] |
| 2111 | + store64 [t6, EXECUTION_CONTEXT_LEXICAL_ENVIRONMENT], t0 |
| 2112 | + store64 [t6, EXECUTION_CONTEXT_VARIABLE_ENVIRONMENT], t0 |
| 2113 | + load64 t0, [t3, ECMASCRIPT_FUNCTION_OBJECT_PRIVATE_ENVIRONMENT] |
| 2114 | + store64 [t6, EXECUTION_CONTEXT_PRIVATE_ENVIRONMENT], t0 |
| 2115 | + load64 t0, [t3, ECMASCRIPT_FUNCTION_OBJECT_SHARED_DATA] |
| 2116 | + load64 t0, [t0, SHARED_FUNCTION_INSTANCE_DATA_EXECUTABLE] |
| 2117 | + store64 [t6, EXECUTION_CONTEXT_EXECUTABLE], t0 |
| 2118 | + |
| 2119 | + # ScriptOrModule is a two-word Variant in ExecutionContext, so copy both |
| 2120 | + # machine words explicitly. |
| 2121 | + lea t0, [t6, EXECUTION_CONTEXT_SCRIPT_OR_MODULE] |
| 2122 | + lea t2, [t3, ECMASCRIPT_FUNCTION_OBJECT_SCRIPT_OR_MODULE] |
| 2123 | + load64 t3, [t2, 0] |
| 2124 | + store64 [t0, 0], t3 |
| 2125 | + load64 t3, [t2, 8] |
| 2126 | + store64 [t0, 8], t3 |
| 2127 | + |
| 2128 | + store32 [t6, EXECUTION_CONTEXT_PROGRAM_COUNTER], 0 |
| 2129 | + store32 [t6, EXECUTION_CONTEXT_SKIP_WHEN_DETERMINING_INCUMBENT_COUNTER], 0 |
| 2130 | + mov t0, EXECUTION_CONTEXT_NO_YIELD_CONTINUATION |
| 2131 | + store32 [t6, EXECUTION_CONTEXT_YIELD_CONTINUATION], t0 |
| 2132 | + store8 [t6, EXECUTION_CONTEXT_YIELD_IS_AWAIT], 0 |
| 2133 | + store8 [t6, EXECUTION_CONTEXT_CALLER_IS_CONSTRUCT], 0 |
| 2134 | + store64 [t6, EXECUTION_CONTEXT_THIS_VALUE], t8 |
| 2135 | + store64 [t6, EXECUTION_CONTEXT_CALLER_FRAME], exec_ctx |
| 2136 | + store32 [t6, EXECUTION_CONTEXT_REGISTERS_AND_CONSTANTS_AND_LOCALS_AND_ARGUMENTS_COUNT], t1 |
| 2137 | + store32 [t6, EXECUTION_CONTEXT_ARGUMENT_COUNT], t4 |
| 2138 | + store32 [t6, EXECUTION_CONTEXT_PASSED_ARGUMENT_COUNT], t7 |
| 2139 | + load32 t0, [pb, pc, m_length] |
| 2140 | + lea t2, [pb, pc] |
| 2141 | + sub t2, pb |
| 2142 | + add t0, t2 |
| 2143 | + store32 [t6, EXECUTION_CONTEXT_CALLER_RETURN_PC], t0 |
| 2144 | + load32 t0, [pb, pc, m_dst] |
| 2145 | + store32 [t6, EXECUTION_CONTEXT_CALLER_DST_RAW], t0 |
| 2146 | + |
| 2147 | + # values = [registers | locals | constants | arguments] |
| 2148 | + # Keep t2 at the ExecutionContext base while t6 walks the Value tail. |
| 2149 | + mov t2, t6 |
| 2150 | + lea t6, [t6, SIZEOF_EXECUTION_CONTEXT] |
| 2151 | + mov t0, EMPTY_TAG_SHIFTED |
| 2152 | + xor t3, t3 |
| 2153 | +.clear_registers_and_locals: |
| 2154 | + branch_ge_unsigned t3, t5, .copy_constants |
| 2155 | + store64 [t6, t3, 8], t0 |
| 2156 | + add t3, 1 |
| 2157 | + jmp .clear_registers_and_locals |
| 2158 | + |
| 2159 | +.copy_constants: |
| 2160 | + load64 t0, [t2, EXECUTION_CONTEXT_EXECUTABLE] |
| 2161 | + load64 t3, [t0, EXECUTABLE_CONSTANTS_SIZE] |
| 2162 | + load64 t0, [t0, EXECUTABLE_CONSTANTS_DATA] |
| 2163 | + mov t1, t5 |
| 2164 | + xor t8, t8 |
| 2165 | +.copy_constants_loop: |
| 2166 | + branch_ge_unsigned t8, t3, .copy_arguments |
| 2167 | + load64 t7, [t0, t8, 8] |
| 2168 | + store64 [t6, t1, 8], t7 |
| 2169 | + add t8, 1 |
| 2170 | + add t1, 1 |
| 2171 | + jmp .copy_constants_loop |
| 2172 | + |
| 2173 | +.copy_arguments: |
| 2174 | + load32 t7, [t2, EXECUTION_CONTEXT_PASSED_ARGUMENT_COUNT] |
| 2175 | + mov t1, t5 |
| 2176 | + add t1, t3 |
| 2177 | + lea t0, [exec_ctx, SIZEOF_EXECUTION_CONTEXT] |
| 2178 | + lea t8, [pb, pc] |
| 2179 | + add t8, m_expression_string |
| 2180 | + add t8, 4 |
| 2181 | + xor t3, t3 |
| 2182 | +.copy_arguments_loop: |
| 2183 | + # The operand array in the bytecode stores caller register indices. |
| 2184 | + branch_ge_unsigned t3, t7, .fill_missing_arguments |
| 2185 | + load32 t5, [t8, t3, 4] |
| 2186 | + load64 t5, [t0, t5, 8] |
| 2187 | + store64 [t6, t1, 8], t5 |
| 2188 | + add t3, 1 |
| 2189 | + add t1, 1 |
| 2190 | + jmp .copy_arguments_loop |
| 2191 | + |
| 2192 | +.fill_missing_arguments: |
| 2193 | + mov t3, t1 |
| 2194 | + add t3, t4 |
| 2195 | + sub t3, t7 |
| 2196 | + mov t0, UNDEFINED_SHIFTED |
| 2197 | +.fill_missing_arguments_loop: |
| 2198 | + branch_ge_unsigned t1, t3, .enter_callee |
| 2199 | + store64 [t6, t1, 8], t0 |
| 2200 | + add t1, 1 |
| 2201 | + jmp .fill_missing_arguments_loop |
| 2202 | + |
| 2203 | +.enter_callee: |
| 2204 | + # Mirror the normal interpreter entry sequence: cache `this` in the |
| 2205 | + # dedicated register slot, then reload pb/values/exec_ctx for the callee. |
| 2206 | + load64 t0, [t2, EXECUTION_CONTEXT_THIS_VALUE] |
| 2207 | + store64 [t6, THIS_VALUE_REG_OFFSET], t0 |
| 2208 | + |
| 2209 | + load64 t0, [t2, EXECUTION_CONTEXT_EXECUTABLE] |
| 2210 | + load64 pb, [t0, EXECUTABLE_BYTECODE_DATA] |
| 2211 | + load_vm t0 |
| 2212 | + store64 [t0, VM_RUNNING_EXECUTION_CONTEXT], t2 |
| 2213 | + mov exec_ctx, t2 |
| 2214 | + lea values, [exec_ctx, SIZEOF_EXECUTION_CONTEXT] |
| 2215 | + xor pc, pc |
| 2216 | + dispatch_current |
| 2217 | +.call_interp_inline: |
| 2218 | + # Shared escape hatch for the cases that need C++ help to build the inline |
| 2219 | + # frame correctly but must not take the full Call slow path, since that |
| 2220 | + # would insert a run_executable() boundary and observable microtask drain. |
2012 | 2221 | call_interp asm_try_inline_call |
2013 | 2222 | branch_nonzero t0, .call_slow |
2014 | | - # Success: reload pb/values from new execution context, pc=0 |
2015 | | - reload_state_from_exec_ctx |
| 2223 | + load_vm t0 |
| 2224 | + load64 exec_ctx, [t0, VM_RUNNING_EXECUTION_CONTEXT] |
| 2225 | + lea values, [exec_ctx, SIZEOF_EXECUTION_CONTEXT] |
| 2226 | + load64 t0, [exec_ctx, EXECUTION_CONTEXT_EXECUTABLE] |
| 2227 | + load64 pb, [t0, EXECUTABLE_BYTECODE_DATA] |
2016 | 2228 | xor pc, pc |
2017 | 2229 | dispatch_current |
2018 | 2230 | .call_slow: |
|
0 commit comments