diff --git a/PERFORMANCE_REPORT.md b/PERFORMANCE_REPORT.md index 6641acb5..bcabd0c4 100644 --- a/PERFORMANCE_REPORT.md +++ b/PERFORMANCE_REPORT.md @@ -1,142 +1,189 @@ -# luars 性能报告 +# LuaRS 性能报告 -**测试日期**: 2024年12月4日 -**对比版本**: luars vs Lua 5.4.6 -**总运行时间**: luars 16.55s vs Lua 5.4 10.02s (65%) +## 概述 ---- +本报告比较 luars(Rust 实现)与官方 Lua 5.4.6 的性能表现。 + +**测试环境:** +- OS: Windows +- Lua 5.4.6: 从源码编译 (CMake + MSVC) +- luars: `cargo build --release` -## 🔴 紧急优化项目(性能差距 > 50%) - -### 1. 闭包迭代器 - **5% of Lua 5.4** ⚠️ 最高优先级 -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| Closure iterator (100) | 16.67 K/s | 303.03 K/s | **5%** | - -**分析**: 闭包迭代器在 Lua 中极其常见(io.lines, 自定义迭代器等),这是最严重的性能问题。 - -### 2. 元方法调用 - **17-51%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| __index (function) | 5.22 M/s | 29.41 M/s | **18%** | -| __len metamethod | 5.50 M/s | 33.33 M/s | **17%** | -| __call metamethod | 17.10 M/s | 33.33 M/s | **51%** | -| __add metamethod | 2.91 M/s | 4.55 M/s | **64%** | - -**分析**: 元方法查找和调用开销过大。 - -### 3. pcall 成功路径 - **15%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| pcall (success) | 2.49 M/s | 16.67 M/s | **15%** | -| assert (success) | 11.10 M/s | 50.00 M/s | **22%** | - -**分析**: pcall 成功时不应该有显著开销,当前实现有问题。 - -### 4. vararg 处理 - **18-40%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| select('#', ...) | 3.48 M/s | 19.23 M/s | **18%** | -| select(3, ...) | 3.35 M/s | 17.54 M/s | **19%** | -| table.unpack (5 values) | 4.91 M/s | 20.00 M/s | **25%** | -| Vararg passthrough | 12.63 M/s | 31.25 M/s | **40%** | - -**分析**: vararg 的创建、访问和传递都很慢。 - -### 5. 迭代器通用问题 - **37-39%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| Multi-value iterator | 7.60 K/s | 20.62 K/s | **37%** | -| Custom stateless iter | 12.46 K/s | 31.65 K/s | **39%** | - -### 6. 
函数调用开销 - **42%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| Simple function call | 22.13 M/s | 52.63 M/s | **42%** | -| Returns as func args | 10.87 M/s | 25.00 M/s | **43%** | - -### 7. OOP 方法调用 - **32-33%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| Method call (colon) | 3.88 M/s | 12.20 M/s | **32%** | -| Method call (dot) | 4.28 M/s | 13.51 M/s | **32%** | -| Inherited method call | 3.49 M/s | 11.11 M/s | **31%** | - -### 8. 循环控制 - **51-53%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| While loop | 74.49 M/s | 140.85 M/s | **53%** | -| Repeat-until | 80.35 M/s | 158.73 M/s | **51%** | -| Nested loops | 134.23 M/s | 250.00 M/s | **54%** | - -### 9. 变量访问 - **53-60%** -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| Global var access | 38.34 M/s | 72.99 M/s | **53%** | -| Upvalue access | 81.64 M/s | 136.99 M/s | **60%** | +**总体结果:** +| 运行时 | 总时间 | 相对性能 | +|--------|--------|----------| +| Lua 5.4.6 | ~10.1 秒 | 100% | +| luars | ~15.3 秒 | 66% | + +luars 在基准测试套件中达到 Lua 5.4 约 **66%** 的性能。 --- -## 🟡 中等差距项目(性能差距 30-50%) +## 详细对比 -| 测试 | luars | Lua 5.4 | 比率 | -|------|-------|---------|------| -| Local var access | 165 M/s | 238 M/s | 69% | -| Integer addition | 159 M/s | 238 M/s | 67% | -| Float multiplication | 158 M/s | 217 M/s | 73% | -| Table access | 97.93 M/s | 142.86 M/s | 69% | -| ipairs iteration | 24.56 K/s | 45.45 K/s | 54% | -| Upvalue read/write | 22.37 M/s | 40.00 M/s | 56% | -| Multiple upvalues | 18.58 M/s | 34.48 M/s | 54% | -| Nested closures | 22.27 M/s | 35.71 M/s | 62% | +### 算术运算 ---- +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| 整数加法 | 139 M/s | 233 M/s | 60% | +| 浮点乘法 | 127 M/s | 208 M/s | 61% | +| 混合运算 | 74 M/s | 123 M/s | 60% | + +### 控制流 + +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| If-else | 56 M/s | 52 M/s | 108% ✓ | +| While 循环 | 80 M/s | 118 M/s | 68% | +| Repeat-until | 83 M/s | 147 M/s 
| 56% | +| 嵌套循环 | 145 M/s | 250 M/s | 58% | + +### 变量访问 + +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| 全局变量 | 32 M/s | 74 M/s | 43% | +| 局部变量 | 121 M/s | 233 M/s | 52% | +| Upvalue | 53 M/s | 133 M/s | 40% | + +### 函数调用 + +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| 简单调用 | 19 M/s | 53 M/s | 36% | +| 变长参数 | 1.7 M/s | 2.5 M/s | 68% | + +### 闭包 + +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| 闭包创建 | 11.6 M/s | 7.1 M/s | 163% ✓ | +| Upvalue 读写 | 18.5 M/s | 43.5 M/s | 43% | + +### 表操作 + +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| 表插入 | 43 M/s | 34 M/s | 126% ✓ | +| 表访问 | 84 M/s | 125 M/s | 67% | -## 🟢 luars 表现更好的项目 +### 元表 (fasttm 优化后) -| 测试 | luars | Lua 5.4 | 倍数 | -|------|-------|---------|------| -| table.sort (sorted) | 194 K/s | 8 K/s | **24x faster** | -| table.sort (reversed) | 35.6 K/s | 5.78 K/s | **6x faster** | -| Repeated yield | 3.06 M/s | 0.79 M/s | **4x faster** | -| pcall (error path) | 2.18 M/s | 0.55 M/s | **4x faster** | -| table.insert (end) | 21.27 M/s | 14.29 M/s | 1.5x faster | -| table.remove (end) | 26.81 M/s | 16.67 M/s | 1.6x faster | +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| __index (函数) | 4.9 M/s | 22 M/s | 22% | +| __index (表) | 23 M/s | 26 M/s | 88% ✓ | +| __newindex | 6.1 M/s | 17 M/s | 36% | +| __call | 15.7 M/s | 25 M/s | 63% | +| __len | 5.3 M/s | 28 M/s | 19% | +| rawget | 21 M/s | 21 M/s | 100% ✓ | + +### fasttm 优化效果 + +当表有元表但**没有特定元方法**时的访问性能: + +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| 缺失键 (有元表, 无__index) | 100-113 M/s | 100 M/s | 100-113% ✓ | +| #table (有元表, 无__len) | 83-105 M/s | 100 M/s | 83-105% ✓ | + +### 协程 + +| 测试项 | luars | Lua 5.4 | 比率 | +|--------|-------|---------|------| +| 创建/恢复/让出 | 319 K/s | 521 K/s | 61% | +| 重复 yield | 2.7 M/s | 735 K/s | 367% ✓ | +| coroutine.wrap | 405 K/s | 2.5 M/s | 16% | + +### 错误处理 + +| 测试项 | luars | Lua 5.4 | 比率 | 
+|--------|-------|---------|------| +| pcall (成功) | 3.2 M/s | 17 M/s | 19% | +| pcall (失败) | 3.4 M/s | 493 K/s | 690% ✓ | +| xpcall (失败) | 1.2 M/s | 459 K/s | 261% ✓ | --- -## 优化计划(按优先级) +## 优化实现 + +### fasttm 优化 (Lua 5.4 风格) + +参考 Lua 5.4 的 `ltm.h` 中的 `fasttm` 宏,实现了元方法缺失的快速检测: + +```rust +// lua_table.rs +pub mod TmFlags { + pub const TM_INDEX: u8 = 1 << 0; + pub const TM_NEWINDEX: u8 = 1 << 1; + pub const TM_GC: u8 = 1 << 2; + pub const TM_MODE: u8 = 1 << 3; + pub const TM_LEN: u8 = 1 << 4; + pub const TM_EQ: u8 = 1 << 5; + pub const TM_CALL: u8 = 1 << 6; +} + +pub struct LuaTable { + pub tm_flags: u8, // 缓存元方法不存在的位标志 + // ... +} + +impl LuaTable { + /// 检查元方法是否已知不存在 + pub fn tm_absent(&self, flag: u8) -> bool { + (self.tm_flags & flag) != 0 + } + + /// 标记元方法不存在 + pub fn set_tm_absent(&mut self, flag: u8) { + self.tm_flags |= flag; + } +} +``` + +**工作原理:** +1. 每个表的元表有一个 `tm_flags` 位域 +2. 当查找元方法(如 `__index`)失败时,设置对应位 +3. 下次访问时先检查位,如果已设置则跳过哈希查找 +4. 当元表被修改时,清除所有标志 + +**效果:** +- "缺失键 (有元表, 无__index)" 场景:从 55 M/s 提升到 100+ M/s +- 达到 Lua 5.4 同等水平 -### Phase 1: 核心调用路径优化 -1. [ ] 闭包迭代器优化(5% → 目标 60%+) -2. [ ] pcall 成功路径优化(15% → 目标 80%+) -3. [ ] 元方法调用优化(17% → 目标 60%+) +--- + +## 性能优势领域 + +luars 在以下方面**超越** Lua 5.4: -### Phase 2: vararg 和函数调用 -4. [ ] select() 和 vararg 处理优化 -5. [ ] 简单函数调用开销减少 -6. [ ] 多返回值传递优化 +1. **闭包创建**: 163% (11.6 M/s vs 7.1 M/s) +2. **表插入**: 126% (43 M/s vs 34 M/s) +3. **重复 yield**: 367% (2.7 M/s vs 735 K/s) +4. **pcall 失败**: 690% (luars 的错误处理更快) +5. **If-else 分支**: 108% +6. **fasttm 优化场景**: 100-113% -### Phase 3: 循环和变量访问 -7. [ ] while/repeat-until 循环优化 -8. [ ] 全局变量访问优化 -9. [ ] upvalue 访问优化 +--- + +## 需要改进的领域 -### Phase 4: 其他优化 -10. [ ] OOP 方法调用优化 -11. [ ] 迭代器通用优化 -12. [ ] 表访问优化 +1. **函数调用开销**: 36% - 需要优化调用栈管理 +2. **全局变量访问**: 43% - `_ENV` 查找较慢 +3. **元方法调用**: 19-36% - 元方法调用本身的开销较大 +4. **pcall 成功路径**: 19% - 每次 pcall 有较大开销 +5. 
**coroutine.wrap**: 16% - 包装函数创建较慢 --- -## 技术笔记 +## 结论 + +luars 作为 Rust 实现的 Lua 解释器,在整体性能上达到官方 Lua 5.4 的 66%。部分操作(闭包创建、表插入、协程 yield、错误处理)已经超越 Lua 5.4。 -### 已完成的优化 -- [x] Return 语句字节码优化(消除多余 MOVE 指令) -- [x] TAILCALL 字节码修复 -- [x] `get_result_reg` 辅助函数统一寄存器分配 +实现的 fasttm 优化有效解决了"有元表但无元方法"场景的性能问题,使该场景达到 Lua 5.4 水平。 -### 待分析的问题 -- 闭包迭代器为何如此慢?需要 profiling -- pcall 的成功路径在做什么额外工作? -- 元方法查找是否有缓存? +未来优化方向: +1. 优化函数调用栈管理 +2. 改进全局变量 `_ENV` 查找路径 +3. 优化元方法调用本身的开销 +4. 改进 pcall 成功路径的性能 diff --git a/bench_log/lua54_results.txt b/bench_log/lua54_results.txt new file mode 100644 index 00000000..9e47f950 --- /dev/null +++ b/bench_log/lua54_results.txt @@ -0,0 +1,194 @@ +====================================== + LUA-RS PERFORMANCE BENCHMARKS +====================================== + + +--- Running: bench_arithmetic.lua --- +=== Arithmetic Benchmark === +Iterations: 10000000 +Integer addition: sum: 50000005000000 0.047 seconds (212.77 M ops/sec) +Float multiplication: result: 2.718282 0.050 seconds (200.00 M ops/sec) +Mixed operations: z: 20000007 0.089 seconds (112.36 M ops/sec) + +--- Running: bench_control_flow.lua --- +=== Control Flow Benchmark === +Iterations: 10000000 +If-else: 0.178 seconds (56.18 M ops/sec) +While loop: 0.081 seconds (123.46 M ops/sec) +Repeat-until: 0.070 seconds (142.86 M ops/sec) +Nested loops (1000x1000): 0.005 seconds (200.00 M ops/sec) + +--- Running: bench_locals.lua --- +=== Local vs Global Benchmark === +Iterations: 10000000 +Global var access: 0.159 seconds (62.89 M ops/sec) +Local var access: 0.044 seconds (227.27 M ops/sec) +Upvalue access: 0.082 seconds (121.95 M ops/sec) +Global table field: 0.227 seconds (44.05 M ops/sec) +Local table field: 0.138 seconds (72.46 M ops/sec) +_ENV lookup (math.pi): 0.140 seconds (71.43 M ops/sec) +Cached global func: 0.265 seconds (37.74 M ops/sec) + +--- Running: bench_functions.lua --- +=== Function Call Benchmark === +Iterations: 1000000 +Simple function call: 0.023 seconds (43.48 M calls/sec) +Recursive fib(25): 
0.006 seconds, result=75025 +Vararg function: 0.463 seconds (2.16 M calls/sec) + +--- Running: bench_closures.lua --- +=== Closures & Upvalues Benchmark === +Iterations: 1000000 +Closure creation: 0.133 seconds (7.52 M ops/sec) +Upvalue read/write: 0.025 seconds (40.00 M ops/sec) +Multiple upvalues: 0.031 seconds (32.26 M ops/sec) +Nested closures: 0.027 seconds (37.04 M ops/sec) + +--- Running: bench_multiret.lua --- +=== Multiple Returns & Select Benchmark === +Iterations: 1000000 +Single return: 0.018 seconds (55.56 M ops/sec) +Triple return: 0.026 seconds (38.46 M ops/sec) +10 returns: 0.040 seconds (25.00 M ops/sec) +Discard extra returns: 0.036 seconds (27.78 M ops/sec) +select('#', ...): 0.066 seconds (15.15 M ops/sec) +select(3, ...): 0.069 seconds (14.49 M ops/sec) +Vararg passthrough: 0.038 seconds (26.32 M ops/sec) +Vararg to table: 0.207 seconds (4.83 M ops/sec) +table.pack: 0.327 seconds (3.06 M ops/sec) +table.unpack: 0.080 seconds (12.50 M ops/sec) +Returns in table ctor: 0.173 seconds (5.78 M ops/sec) +Returns as func args: 0.045 seconds (22.22 M ops/sec) + +--- Running: bench_tables.lua --- +=== Table Operations Benchmark === +Iterations: 1000000 +Array creation & access: 0.002 seconds (5.00 M ops/sec) +Table insertion: 0.021 seconds (47.62 M inserts/sec) +Table access: 0.008 seconds (125.00 M accesses/sec) +Hash table insertion (100k): 0.065 seconds +ipairs iteration (10x1000000): 0.310 seconds + +--- Running: bench_table_lib.lua --- +=== Table Library Benchmark === +Iterations: 100000 +table.insert (end): 0.007 seconds (14285.71 K ops/sec) +table.insert (middle): 0.002 seconds (5000.00 K ops/sec) +table.remove (end): 0.011 seconds (9090.91 K ops/sec) +table.concat (1000 items): 0.275 seconds (36.36 K ops/sec) +table.sort (sorted): 0.156 seconds (6.41 K ops/sec) +table.sort (reversed): 0.235 seconds (4.26 K ops/sec) +table.sort (random): 0.281 seconds (3.56 K ops/sec) +table.sort (custom cmp): 0.522 seconds (1.92 K ops/sec) +table.move (1000 
items): 0.093 seconds (107.53 K ops/sec) +table.unpack (5 values): 0.007 seconds (14285.71 K ops/sec) +table.pack (5 values): 0.033 seconds (3030.30 K ops/sec) +# operator (10k array): 0.000 seconds (inf M ops/sec) + +--- Running: bench_iterators.lua --- +=== Iterators Benchmark === +Iterations: 10000 +ipairs (1000 items): 0.302 seconds (33.11 K iters/sec) +pairs on array (1000): 0.313 seconds (31.95 K iters/sec) +pairs on hash (1000): 0.449 seconds (22.27 K iters/sec) +pairs on mixed (1000): 0.509 seconds (19.65 K iters/sec) +next() iteration (1000): 0.507 seconds (19.72 K iters/sec) +Numeric for (1000): 0.096 seconds (104.17 K iters/sec) +Custom stateless iter: 0.427 seconds (23.42 K iters/sec) +Closure iterator (100): 0.040 seconds (250.00 K iters/sec) +Multi-value iterator: 0.609 seconds (16.42 K iters/sec) + +--- Running: bench_strings.lua --- +=== String Operations Benchmark === +Iterations: 100000 +String concatenation: 0.039 seconds (2564.10 K ops/sec) +String length: 0.001 seconds (100.00 M ops/sec) +string.sub: 0.006 seconds (16666.67 K ops/sec) +string.find: 0.008 seconds (12500.00 K ops/sec) +string.gsub (10k): 0.195 seconds + +--- Running: bench_string_lib.lua --- +=== String Library Extended Benchmark === +Iterations: 100000 +string.upper: 0.023 seconds (4347.83 K ops/sec) +string.lower: 0.022 seconds (4545.45 K ops/sec) +string.reverse: 0.014 seconds (7142.86 K ops/sec) +string.rep (100 chars): 0.044 seconds (2272.73 K ops/sec) +string.byte: 0.005 seconds (20000.00 K ops/sec) +string.char (5 chars): 0.009 seconds (11111.11 K ops/sec) +string.format (%d): 0.027 seconds (3703.70 K ops/sec) +string.format (complex): 0.071 seconds (1408.45 K ops/sec) +string.match (simple): 0.012 seconds (8333.33 K ops/sec) +string.match (3 captures): 0.035 seconds (2857.14 K ops/sec) +string.gmatch: 0.001 seconds (1000.00 K ops/sec) +string.gsub (simple): 0.005 seconds (2000.00 K ops/sec) +string.gsub (pattern): 0.008 seconds (1250.00 K ops/sec) +string.sub (long str): 
0.001 seconds (1000.00 K ops/sec) +string.find (long str): 0.000 seconds (inf K ops/sec) +String equality: 0.001 seconds (100000.00 K ops/sec) +Concat (4 parts): 0.005 seconds (20000.00 K ops/sec) + +--- Running: bench_math.lua --- +=== Math Operations Benchmark === +Iterations: 5000000 +Integer mul/add/mod: 0.086 seconds (58.14 M ops/sec) +Float mul/add/div: 0.049 seconds (102.04 M ops/sec) +math.sqrt: 0.119 seconds (42.02 M ops/sec) +math.sin: 0.159 seconds (31.45 M ops/sec) +math.floor/ceil: 0.293 seconds (17.06 M ops/sec) +math.min/max: 0.277 seconds (18.05 M ops/sec) +math.abs: 0.119 seconds (42.02 M ops/sec) +math.random: 0.196 seconds (25.51 M ops/sec) +Bitwise AND/OR/SHR: 0.052 seconds (96.15 M ops/sec) +Integer division (//): 0.065 seconds (76.92 M ops/sec) +Power (^2): 0.094 seconds (53.19 M ops/sec) + +--- Running: bench_metatables.lua --- +=== Metatables & Metamethods Benchmark === +Iterations: 500000 +__index (function): 0.022 seconds (22.73 M ops/sec) +__index (table): 0.016 seconds (31.25 M ops/sec) +__newindex: 0.006 seconds (16.67 M ops/sec) +__add metamethod: 0.025 seconds (4.00 M ops/sec) +__call metamethod: 0.003 seconds (33.33 M ops/sec) +__len metamethod: 0.020 seconds (25.00 M ops/sec) +rawget (no metamethod): 0.026 seconds (19.23 M ops/sec) + +--- Running: bench_oop.lua --- +=== OOP Patterns Benchmark === +Iterations: 100000 +Object creation: 0.028 seconds (3571.43 K ops/sec) +Method call (colon): 0.047 seconds (10638.30 K ops/sec) +Method call (dot): 0.041 seconds (12195.12 K ops/sec) +Inherited object creation: 0.011 seconds (1818.18 K ops/sec) +Inherited method call: 0.050 seconds (10000.00 K ops/sec) +Property access: 0.008 seconds (62500.00 K ops/sec) +Property modification: 0.006 seconds (83333.33 K ops/sec) +Closure object creation: 0.006 seconds (1666.67 K ops/sec) +Closure method call: 0.013 seconds (38461.54 K ops/sec) +Prototype chain (3 levels): 0.023 seconds (21739.13 K ops/sec) + +--- Running: bench_coroutines.lua --- +=== 
Coroutines Benchmark === +Iterations: 100000 +Create/resume/yield: 0.212 seconds (471.70 K cycles/sec) +Repeated yield: 0.147 seconds (680.27 K yields/sec) +Producer-consumer: 0.155 seconds (645.16 K msgs/sec) +coroutine.wrap: 0.004 seconds (2500.00 K ops/sec) +coroutine.status: 0.005 seconds (20.00 M ops/sec) + +--- Running: bench_errors.lua --- +=== Error Handling Benchmark === +Iterations: 100000 +pcall (success): 0.007 seconds (14285.71 K ops/sec) +pcall (error): 0.198 seconds (505.05 K ops/sec) +xpcall (error): 0.239 seconds (418.41 K ops/sec) +Direct call (baseline): 0.002 seconds (50000.00 K ops/sec) +pcall (multi-return): 0.008 seconds (12500.00 K ops/sec) +assert (success): 0.003 seconds (33333.33 K ops/sec) +pcall (type check): 0.014 seconds (7142.86 K ops/sec) + +====================================== + BENCHMARKS COMPLETE + Total time: 12.08 seconds +====================================== diff --git a/bench_log/luars_results.txt b/bench_log/luars_results.txt new file mode 100644 index 00000000..42cfb2d1 --- /dev/null +++ b/bench_log/luars_results.txt @@ -0,0 +1,194 @@ +====================================== + LUA-RS PERFORMANCE BENCHMARKS +====================================== + + +--- Running: bench_arithmetic.lua --- +=== Arithmetic Benchmark === +Iterations: 10000000 +Integer addition: sum: 50000005000000 0.061 seconds (163.23 M ops/sec) +Float multiplication: result: 2.718282 0.064 seconds (157.12 M ops/sec) +Mixed operations: z: 20000007 0.112 seconds (89.30 M ops/sec) + +--- Running: bench_control_flow.lua --- +=== Control Flow Benchmark === +Iterations: 10000000 +If-else: 0.161 seconds (62.13 M ops/sec) +While loop: 0.112 seconds (89.52 M ops/sec) +Repeat-until: 0.109 seconds (91.77 M ops/sec) +Nested loops (1000x1000): 0.006 seconds (166.52 M ops/sec) + +--- Running: bench_locals.lua --- +=== Local vs Global Benchmark === +Iterations: 10000000 +Global var access: 0.285 seconds (35.12 M ops/sec) +Local var access: 0.060 seconds (167.57 M 
ops/sec) +Upvalue access: 0.166 seconds (60.12 M ops/sec) +Global table field: 0.389 seconds (25.72 M ops/sec) +Local table field: 0.224 seconds (44.63 M ops/sec) +_ENV lookup (math.pi): 0.169 seconds (59.24 M ops/sec) +Cached global func: 0.289 seconds (34.59 M ops/sec) + +--- Running: bench_functions.lua --- +=== Function Call Benchmark === +Iterations: 1000000 +Simple function call: 0.047 seconds (21.35 M calls/sec) +Recursive fib(25): 0.011 seconds, result=75025 +Vararg function: 0.479 seconds (2.09 M calls/sec) + +--- Running: bench_closures.lua --- +=== Closures & Upvalues Benchmark === +Iterations: 1000000 +Closure creation: 0.074 seconds (13.47 M ops/sec) +Upvalue read/write: 0.046 seconds (21.61 M ops/sec) +Multiple upvalues: 0.065 seconds (15.46 M ops/sec) +Nested closures: 0.055 seconds (18.12 M ops/sec) + +--- Running: bench_multiret.lua --- +=== Multiple Returns & Select Benchmark === +Iterations: 1000000 +Single return: 0.033 seconds (30.31 M ops/sec) +Triple return: 0.059 seconds (16.92 M ops/sec) +10 returns: 0.076 seconds (13.12 M ops/sec) +Discard extra returns: 0.072 seconds (13.84 M ops/sec) +select('#', ...): 0.264 seconds (3.79 M ops/sec) +select(3, ...): 0.254 seconds (3.93 M ops/sec) +Vararg passthrough: 0.076 seconds (13.13 M ops/sec) +Vararg to table: 0.225 seconds (4.44 M ops/sec) +table.pack: 0.337 seconds (2.96 M ops/sec) +table.unpack: 0.202 seconds (4.95 M ops/sec) +Returns in table ctor: 0.145 seconds (6.91 M ops/sec) +Returns as func args: 0.100 seconds (10.03 M ops/sec) + +--- Running: bench_tables.lua --- +=== Table Operations Benchmark === +Iterations: 1000000 +Array creation & access: 0.002 seconds (6.51 M ops/sec) +Table insertion: 0.021 seconds (48.75 M inserts/sec) +Table access: 0.011 seconds (91.13 M accesses/sec) +Hash table insertion (100k): 0.035 seconds +ipairs iteration (10x1000000): 0.211 seconds + +--- Running: bench_table_lib.lua --- +=== Table Library Benchmark === +Iterations: 100000 +table.insert (end): 0.004 
seconds (22681.39 K ops/sec) +table.insert (middle): 0.001 seconds (11567.38 K ops/sec) +table.remove (end): 0.004 seconds (24396.79 K ops/sec) +table.concat (1000 items): 0.718 seconds (13.93 K ops/sec) +table.sort (sorted): 0.005 seconds (212.69 K ops/sec) +table.sort (reversed): 0.023 seconds (42.73 K ops/sec) +table.sort (random): 0.169 seconds (5.92 K ops/sec) +table.sort (custom cmp): 0.358 seconds (2.79 K ops/sec) +table.move (1000 items): 0.063 seconds (157.77 K ops/sec) +table.unpack (5 values): 0.021 seconds (4701.44 K ops/sec) +table.pack (5 values): 0.034 seconds (2984.01 K ops/sec) +# operator (10k array): 0.001 seconds (153.59 M ops/sec) + +--- Running: bench_iterators.lua --- +=== Iterators Benchmark === +Iterations: 10000 +ipairs (1000 items): 0.210 seconds (47.51 K iters/sec) +pairs on array (1000): 0.279 seconds (35.79 K iters/sec) +pairs on hash (1000): 0.314 seconds (31.89 K iters/sec) +pairs on mixed (1000): 0.305 seconds (32.83 K iters/sec) +next() iteration (1000): 0.504 seconds (19.83 K iters/sec) +Numeric for (1000): 0.109 seconds (91.96 K iters/sec) +Custom stateless iter: 0.675 seconds (14.82 K iters/sec) +Closure iterator (100): 0.070 seconds (141.93 K iters/sec) +Multi-value iterator: 1.198 seconds (8.35 K iters/sec) + +--- Running: bench_strings.lua --- +=== String Operations Benchmark === +Iterations: 100000 +String concatenation: 0.049 seconds (2040.22 K ops/sec) +String length: 0.001 seconds (168.83 M ops/sec) +string.sub: 0.007 seconds (14169.52 K ops/sec) +string.find: 0.007 seconds (13399.98 K ops/sec) +string.gsub (10k): 0.136 seconds + +--- Running: bench_string_lib.lua --- +=== String Library Extended Benchmark === +Iterations: 100000 +string.upper: 0.016 seconds (6250.43 K ops/sec) +string.lower: 0.017 seconds (5973.36 K ops/sec) +string.reverse: 0.038 seconds (2653.72 K ops/sec) +string.rep (100 chars): 0.092 seconds (1081.71 K ops/sec) +string.byte: 0.004 seconds (22944.73 K ops/sec) +string.char (5 chars): 0.020 seconds 
(4940.98 K ops/sec) +string.format (%d): 0.044 seconds (2282.87 K ops/sec) +string.format (complex): 0.163 seconds (614.41 K ops/sec) +string.match (simple): 0.110 seconds (907.60 K ops/sec) +string.match (3 captures): 0.221 seconds (453.40 K ops/sec) +string.gmatch: 0.007 seconds (140.46 K ops/sec) +string.gsub (simple): 0.016 seconds (630.88 K ops/sec) +string.gsub (pattern): 0.016 seconds (623.18 K ops/sec) +string.sub (long str): 0.001 seconds (736.11 K ops/sec) +string.find (long str): 0.000 seconds (12004.80 K ops/sec) +String equality: 0.001 seconds (66777.96 K ops/sec) +Concat (4 parts): 0.012 seconds (8451.01 K ops/sec) + +--- Running: bench_math.lua --- +=== Math Operations Benchmark === +Iterations: 5000000 +Integer mul/add/mod: 0.069 seconds (72.18 M ops/sec) +Float mul/add/div: 0.083 seconds (60.24 M ops/sec) +math.sqrt: 0.143 seconds (34.98 M ops/sec) +math.sin: 0.200 seconds (25.02 M ops/sec) +math.floor/ceil: 0.325 seconds (15.40 M ops/sec) +math.min/max: 0.319 seconds (15.68 M ops/sec) +math.abs: 0.158 seconds (31.58 M ops/sec) +math.random: 0.410 seconds (12.19 M ops/sec) +Bitwise AND/OR/SHR: 0.081 seconds (61.36 M ops/sec) +Integer division (//): 0.039 seconds (129.79 M ops/sec) +Power (^2): 0.245 seconds (20.41 M ops/sec) + +--- Running: bench_metatables.lua --- +=== Metatables & Metamethods Benchmark === +Iterations: 500000 +__index (function): 0.095 seconds (5.25 M ops/sec) +__index (table): 0.021 seconds (24.17 M ops/sec) +__newindex: 0.015 seconds (6.69 M ops/sec) +__add metamethod: 0.045 seconds (2.24 M ops/sec) +__call metamethod: 0.007 seconds (15.34 M ops/sec) +__len metamethod: 0.090 seconds (5.58 M ops/sec) +rawget (no metamethod): 0.024 seconds (21.24 M ops/sec) + +--- Running: bench_oop.lua --- +=== OOP Patterns Benchmark === +Iterations: 100000 +Object creation: 0.042 seconds (2372.82 K ops/sec) +Method call (colon): 0.130 seconds (3846.56 K ops/sec) +Method call (dot): 0.116 seconds (4297.40 K ops/sec) +Inherited object creation: 
0.012 seconds (1682.38 K ops/sec) +Inherited method call: 0.136 seconds (3674.02 K ops/sec) +Property access: 0.011 seconds (47317.12 K ops/sec) +Property modification: 0.024 seconds (20951.62 K ops/sec) +Closure object creation: 0.016 seconds (634.04 K ops/sec) +Closure method call: 0.024 seconds (20826.91 K ops/sec) +Prototype chain (3 levels): 0.046 seconds (10954.75 K ops/sec) + +--- Running: bench_coroutines.lua --- +=== Coroutines Benchmark === +Iterations: 100000 +Create/resume/yield: 0.259 seconds (386.17 K cycles/sec) +Repeated yield: 0.032 seconds (3090.32 K yields/sec) +Producer-consumer: 0.000 seconds (15151515.15 K msgs/sec) +coroutine.wrap: 0.022 seconds (464.90 K ops/sec) +coroutine.status: 0.006 seconds (17.04 M ops/sec) + +--- Running: bench_errors.lua --- +=== Error Handling Benchmark === +Iterations: 100000 +pcall (success): 0.025 seconds (3933.20 K ops/sec) +pcall (error): 0.027 seconds (3752.75 K ops/sec) +xpcall (error): 0.080 seconds (1255.33 K ops/sec) +Direct call (baseline): 0.003 seconds (28595.12 K ops/sec) +pcall (multi-return): 0.028 seconds (3524.99 K ops/sec) +assert (success): 0.009 seconds (11470.92 K ops/sec) +pcall (type check): 0.039 seconds (2589.69 K ops/sec) + +====================================== + BENCHMARKS COMPLETE + Total time: 14.63 seconds +====================================== diff --git a/crates/luars/src/compiler/stmt.rs b/crates/luars/src/compiler/stmt.rs index 93cd9bc5..7cfc2838 100644 --- a/crates/luars/src/compiler/stmt.rs +++ b/crates/luars/src/compiler/stmt.rs @@ -718,15 +718,15 @@ fn compile_return_stat(c: &mut Compiler, stat: &LuaReturnStat) -> Result<(), Str // First, compile all expressions except the last directly to target registers for (i, expr) in exprs.iter().take(num_exprs - 1).enumerate() { let target_reg = base_reg + i as u32; - + // Try to compile expression directly to target register let src_reg = compile_expr_to(c, expr, Some(target_reg))?; - + // If expression couldn't be placed in target, 
emit a MOVE if src_reg != target_reg { emit_move(c, target_reg, src_reg); } - + if target_reg >= c.freereg { c.freereg = target_reg + 1; } @@ -767,16 +767,16 @@ fn compile_return_stat(c: &mut Compiler, stat: &LuaReturnStat) -> Result<(), Str // Compile expressions directly to target registers when possible for i in 0..num_exprs { let target_reg = base_reg + i as u32; - + // Try to compile expression directly to target register // compile_expr_to will use get_result_reg which ensures max_stack_size is updated let src_reg = compile_expr_to(c, &exprs[i], Some(target_reg))?; - + // If expression couldn't be placed in target, emit a MOVE if src_reg != target_reg { emit_move(c, target_reg, src_reg); } - + // Update freereg to account for this register if target_reg >= c.freereg { c.freereg = target_reg + 1; @@ -1138,7 +1138,7 @@ fn compile_for_stat(c: &mut Compiler, stat: &LuaForStat) -> Result<(), String> { } /// Compile generic for loop using TFORPREP/TFORCALL/TFORLOOP instructions -/// +/// /// Lua 5.4 for-in register layout: /// R[A] = iter_func (f) /// R[A+1] = state (s) @@ -1176,7 +1176,7 @@ fn compile_for_range_stat(c: &mut Compiler, stat: &LuaForRangeStat) -> Result<() // FIRST: Compile iterator expressions BEFORE allocating the for-in block // This prevents the call results from overlapping with loop variables let base = c.freereg; - + // Compile iterator expressions to get (iter_func, state, control_var, to-be-closed) at base // Lua 5.4 for-in needs 4 control slots: iterator, state, control, closing value if iter_exprs.len() == 1 { @@ -1285,7 +1285,8 @@ fn compile_for_range_stat(c: &mut Compiler, stat: &LuaForRangeStat) -> Result<() // Patch TFORPREP to jump to TFORCALL // TFORPREP jumps forward by Bx let tforprep_jump = tforcall_pc - tforprep_pc - 1; - c.chunk.code[tforprep_pc] = Instruction::encode_abx(OpCode::TForPrep, base, tforprep_jump as u32); + c.chunk.code[tforprep_pc] = + Instruction::encode_abx(OpCode::TForPrep, base, tforprep_jump as u32); 
end_loop(c); end_scope(c); diff --git a/crates/luars/src/gc/gc_id.rs b/crates/luars/src/gc/gc_id.rs new file mode 100644 index 00000000..ee1de5ac --- /dev/null +++ b/crates/luars/src/gc/gc_id.rs @@ -0,0 +1,77 @@ +// ============ Object IDs ============ +// All IDs are simple u32 indices - compact and efficient + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +#[repr(transparent)] +pub struct StringId(pub u32); + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +#[repr(transparent)] +pub struct TableId(pub u32); + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +#[repr(transparent)] +pub struct FunctionId(pub u32); + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +#[repr(transparent)] +pub struct UpvalueId(pub u32); + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +#[repr(transparent)] +pub struct UserdataId(pub u32); + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +#[repr(transparent)] +pub struct ThreadId(pub u32); + +/// Object type tags (3 bits, supports up to 8 types) +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GcObjectType { + String = 0, + Table = 1, + Function = 2, + Upvalue = 3, + Thread = 4, + Userdata = 5, +} + +/// Unified GC object identifier +/// Layout: the enum variant is the object type, the payload is that type's u32 index +/// Not bit-packed: every object type keeps its own full 32-bit index space +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub enum GcId { + StringId(StringId), + TableId(TableId), + FunctionId(FunctionId), + UpvalueId(UpvalueId), + ThreadId(ThreadId), + UserdataId(UserdataId), +} + +impl GcId { + #[inline(always)] + pub fn gc_type(self) -> GcObjectType { + match self { + GcId::StringId(_) => GcObjectType::String, + GcId::TableId(_) => GcObjectType::Table, + GcId::FunctionId(_) => GcObjectType::Function, + GcId::UpvalueId(_) => GcObjectType::Upvalue, + GcId::ThreadId(_) => GcObjectType::Thread, + GcId::UserdataId(_) => GcObjectType::Userdata, + } + } + + #[inline(always)] + pub fn 
index(self) -> u32 { + match self { + GcId::StringId(StringId(id)) => id, + GcId::TableId(TableId(id)) => id, + GcId::FunctionId(FunctionId(id)) => id, + GcId::UpvalueId(UpvalueId(id)) => id, + GcId::ThreadId(ThreadId(id)) => id, + GcId::UserdataId(UserdataId(id)) => id, + } + } +} diff --git a/crates/luars/src/gc/gc_object.rs b/crates/luars/src/gc/gc_object.rs new file mode 100644 index 00000000..40977af0 --- /dev/null +++ b/crates/luars/src/gc/gc_object.rs @@ -0,0 +1,255 @@ +// ============ GC Header ============ + +use std::rc::Rc; + +use crate::{Chunk, LuaString, LuaTable, LuaValue, UpvalueId, lua_value::LuaThread}; + +// Object ages for generational GC (like Lua 5.4) +// Uses 3 bits (0-7) +pub const G_NEW: u8 = 0; // Created in current cycle +pub const G_SURVIVAL: u8 = 1; // Created in previous cycle (survived one minor) +pub const G_OLD0: u8 = 2; // Marked old by forward barrier in this cycle +pub const G_OLD1: u8 = 3; // First full cycle as old +pub const G_OLD: u8 = 4; // Really old object (not to be visited in minor) +pub const G_TOUCHED1: u8 = 5; // Old object touched this cycle +pub const G_TOUCHED2: u8 = 6; // Old object touched in previous cycle + +// Color bits +pub const WHITE0BIT: u8 = 3; // Object is white (type 0) +pub const WHITE1BIT: u8 = 4; // Object is white (type 1) +pub const BLACKBIT: u8 = 5; // Object is black +pub const FIXEDBIT: u8 = 6; // Object is fixed (never collected) + +pub const WHITEBITS: u8 = (1 << WHITE0BIT) | (1 << WHITE1BIT); +pub const AGEBITS: u8 = 0x07; // Bits 0-2 for age + +/// GC object header - embedded in every GC-managed object +/// Based on Lua 5.4's CommonHeader design +/// +/// Bit layout of `marked` field: +/// - Bits 0-2: Age (G_NEW, G_SURVIVAL, G_OLD0, G_OLD1, G_OLD, G_TOUCHED1, G_TOUCHED2) +/// - Bit 3: WHITE0 (current white in even cycles) +/// - Bit 4: WHITE1 (current white in odd cycles) +/// - Bit 5: BLACK (fully marked) +/// - Bit 6: FIXED (never collected) +/// - Bit 7: Reserved +#[derive(Clone, Copy)] 
+#[repr(C)] +pub struct GcHeader { + pub marked: u8, // Color and age bits combined +} + +impl Default for GcHeader { + fn default() -> Self { + // New objects start as BLACK with age G_NEW + // This ensures they survive the current GC cycle + // They will be properly marked or turned white at the start of next cycle + GcHeader { + marked: (1 << BLACKBIT) | G_NEW, + } + } +} + +impl GcHeader { + /// Create a new header with given white bit and age + #[inline(always)] + pub fn new(current_white: u8) -> Self { + GcHeader { + marked: (1 << (WHITE0BIT + current_white)) | G_NEW, + } + } + + /// Get object age + #[inline(always)] + pub fn age(&self) -> u8 { + self.marked & AGEBITS + } + + /// Set object age + #[inline(always)] + pub fn set_age(&mut self, age: u8) { + self.marked = (self.marked & !AGEBITS) | (age & AGEBITS); + } + + /// Check if object is white (either white0 or white1) + #[inline(always)] + pub fn is_white(&self) -> bool { + (self.marked & WHITEBITS) != 0 + } + + /// Check if object is black + #[inline(always)] + pub fn is_black(&self) -> bool { + (self.marked & (1 << BLACKBIT)) != 0 + } + + /// Check if object is gray (neither white nor black) + #[inline(always)] + pub fn is_gray(&self) -> bool { + (self.marked & (WHITEBITS | (1 << BLACKBIT))) == 0 + } + + /// Check if object is fixed (never collected) + #[inline(always)] + pub fn is_fixed(&self) -> bool { + (self.marked & (1 << FIXEDBIT)) != 0 + } + + /// Set object as fixed + #[inline(always)] + pub fn set_fixed(&mut self) { + self.marked |= 1 << FIXEDBIT; + } + + /// Check if object is old (age > G_SURVIVAL) + #[inline(always)] + pub fn is_old(&self) -> bool { + self.age() > G_SURVIVAL + } + + /// Make object white with given current_white (0 or 1) + #[inline(always)] + pub fn make_white(&mut self, current_white: u8) { + // Clear color bits, set appropriate white bit, keep age + let age = self.age(); + self.marked = (1 << (WHITE0BIT + current_white)) | age; + } + + /// Make object gray (clear all 
color bits) + #[inline(always)] + pub fn make_gray(&mut self) { + self.marked &= !(WHITEBITS | (1 << BLACKBIT)); + } + + /// Make object black (from non-white state) + #[inline(always)] + pub fn make_black(&mut self) { + self.marked = (self.marked & !WHITEBITS) | (1 << BLACKBIT); + } + + /// Check if object is dead (has the "other" white) + #[inline(always)] + pub fn is_dead(&self, other_white: u8) -> bool { + (self.marked & (1 << (WHITE0BIT + other_white))) != 0 + } + + // Legacy compatibility + #[inline(always)] + pub fn is_marked(&self) -> bool { + !self.is_white() + } + + #[inline(always)] + pub fn set_marked(&mut self, marked: bool) { + if marked { + self.make_black(); + } else { + self.make_white(0); + } + } +} + +// Legacy field accessors for compatibility +impl GcHeader { + #[inline(always)] + pub fn get_fixed(&self) -> bool { + self.is_fixed() + } +} + +// ============ GC-managed Objects ============ + +/// Table with embedded GC header +pub struct GcTable { + pub header: GcHeader, + pub data: LuaTable, +} + +/// Lua function with embedded GC header +pub struct GcFunction { + pub header: GcHeader, + pub chunk: Rc, + pub upvalues: Vec, // Upvalue IDs, not Rc +} + +/// Upvalue state - uses absolute stack index like Lua C implementation +#[derive(Debug, Clone)] +pub enum UpvalueState { + Open { stack_index: usize }, + Closed(LuaValue), +} + +/// Upvalue with embedded GC header +pub struct GcUpvalue { + pub header: GcHeader, + pub state: UpvalueState, +} + +impl GcUpvalue { + /// Check if this upvalue points to the given absolute stack index + #[inline] + pub fn points_to_index(&self, index: usize) -> bool { + matches!(&self.state, UpvalueState::Open { stack_index } if *stack_index == index) + } + + /// Check if this upvalue is open (still points to stack) + #[inline] + pub fn is_open(&self) -> bool { + matches!(&self.state, UpvalueState::Open { .. 
}) + } + + /// Close this upvalue with the given value + #[inline] + pub fn close(&mut self, value: LuaValue) { + self.state = UpvalueState::Closed(value); + } + + /// Get the value of a closed upvalue (returns None if still open) + #[inline] + pub fn get_closed_value(&self) -> Option { + match &self.state { + UpvalueState::Closed(v) => Some(v.clone()), + _ => None, + } + } + + /// Get the absolute stack index if this upvalue is open + #[inline] + pub fn get_stack_index(&self) -> Option { + match &self.state { + UpvalueState::Open { stack_index } => Some(*stack_index), + _ => None, + } + } + + /// Set closed upvalue value directly without checking state + /// SAFETY: Must only be called when upvalue is in Closed state + #[inline(always)] + pub unsafe fn set_closed_value_unchecked(&mut self, value: LuaValue) { + if let UpvalueState::Closed(ref mut v) = self.state { + *v = value; + } + } + + /// Get closed value reference directly without Option + /// SAFETY: Must only be called when upvalue is in Closed state + #[inline(always)] + pub unsafe fn get_closed_value_ref_unchecked(&self) -> &LuaValue { + match &self.state { + UpvalueState::Closed(v) => v, + _ => unsafe { std::hint::unreachable_unchecked() }, + } + } +} + +/// String with embedded GC header +pub struct GcString { + pub header: GcHeader, + pub data: LuaString, +} + +/// Thread (coroutine) with embedded GC header +pub struct GcThread { + pub header: GcHeader, + pub data: LuaThread, +} diff --git a/crates/luars/src/gc/mod.rs b/crates/luars/src/gc/mod.rs index 708668d4..c10bb156 100644 --- a/crates/luars/src/gc/mod.rs +++ b/crates/luars/src/gc/mod.rs @@ -1,39 +1,130 @@ -// Simplified Garbage Collector for Lua VM +// Garbage Collector for Lua VM // -// Key insight: Objects are already stored in Arena with GcHeader. -// We don't need a separate HashMap to track them! 
+// Design based on Lua 5.4 with full Generational GC support: +// - GcId: Unified object identifier (type tag + pool index) +// - Dual-mode: Incremental (KGC_INC) or Generational (KGC_GEN) +// - Tri-color marking: white, gray, black +// - Generational: objects have ages (NEW, SURVIVAL, OLD0, OLD1, OLD, TOUCHED1, TOUCHED2) +// - Minor collection: Only collect young generation +// - Major collection: Full collection when memory grows too much // -// Design: -// - Arena, Arena, etc. store all objects -// - GcHeader.marked is used for mark-sweep -// - GC directly iterates over Arena, no extra tracking needed -// - Lua 5.4 style debt mechanism for triggering GC +// Key difference from Lua C: We use Vec instead of linked list +// and iterate pools directly for sweeping (allocation is O(1) via free list) +mod gc_id; +mod gc_object; mod object_pool; -use crate::lua_value::LuaValue; -pub use object_pool::{ - Arena, BoxPool, FunctionId, GcFunction, GcHeader, GcString, GcTable, GcThread, GcUpvalue, - ObjectPool, Pool, StringId, TableId, ThreadId, UpvalueId, UpvalueState, UserdataId, -}; - -// Re-export for compatibility -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum GcObjectType { - String, - Table, - Function, +use crate::lua_value::{LuaValue, LuaValueKind}; +pub use gc_id::*; +pub use gc_object::*; +pub use object_pool::*; + +/// GC mode: Incremental or Generational +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GcKind { + Incremental = 0, // Traditional incremental mark-sweep + Generational = 1, // Generational with minor/major collections +} + +/// Object age for generational GC (like Lua 5.4) +/// Age transitions: +/// - NEW → SURVIVAL (after surviving a minor collection) +/// - SURVIVAL → OLD1 (after surviving another minor) +/// - OLD0 → OLD1 (barrier promoted objects) +/// - OLD1 → OLD (after another collection) +/// - TOUCHED1 → TOUCHED2 → OLD (old objects that got a back barrier) +#[repr(u8)] +#[derive(Debug, Clone, Copy, 
PartialEq, Eq, PartialOrd, Ord)] +pub enum GcAge { + New = 0, // Created in current cycle + Survival = 1, // Survived one minor collection + Old0 = 2, // Marked old by forward barrier in this cycle + Old1 = 3, // First full cycle as old + Old = 4, // Really old (not visited in minor GC) + Touched1 = 5, // Old object touched this cycle (back barrier) + Touched2 = 6, // Old object touched in previous cycle } -/// Simplified GC state - no HashMap tracking! +impl GcAge { + /// Get the next age after a collection cycle + #[inline] + pub fn next_age(self) -> GcAge { + match self { + GcAge::New => GcAge::Survival, + GcAge::Survival => GcAge::Old1, + GcAge::Old0 => GcAge::Old1, + GcAge::Old1 => GcAge::Old, + GcAge::Old => GcAge::Old, + GcAge::Touched1 => GcAge::Touched1, // handled specially + GcAge::Touched2 => GcAge::Touched2, // handled specially + } + } + + /// Check if this age is considered "old" + #[inline] + pub fn is_old(self) -> bool { + self as u8 >= GcAge::Old0 as u8 + } +} + +/// GC color for tri-color marking +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GcColor { + White0 = 0, // Unmarked (current white) + White1 = 1, // Unmarked (other white, for flip) + Gray = 2, // Marked, refs not yet scanned + Black = 3, // Fully marked +} + +/// GC state machine phases +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GcState { + Pause = 0, // Between cycles + Propagate = 1, // Marking phase + Atomic = 2, // Atomic finish of marking + Sweep = 3, // Sweeping dead objects +} + +/// Main GC structure +/// Supports both incremental and generational modes (like Lua 5.4) pub struct GC { + // Gray lists for marking + gray: Vec, + grayagain: Vec, + // Lua 5.4 GC debt mechanism pub(crate) gc_debt: isize, pub(crate) total_bytes: usize, - // GC parameters - gc_pause: usize, // Pause parameter (default 200 = 200%) - // gc_step_mul: usize, // Step multiplier + // GC state machine + state: GcState, + current_white: u8, // 0 or 1, flips each 
cycle + + // GC mode (incremental or generational) + gc_kind: GcKind, + + // Incremental sweep state + sweep_index: usize, // Current position in sweep phase + propagate_work: usize, // Work done in propagate phase + + // GC parameters (like Lua's gcparam) + gc_pause: usize, // Pause parameter (default 200 = 200%) + gen_minor_mul: usize, // Minor collection multiplier (default 25 = 25%) + gen_major_mul: usize, // Major collection threshold (default 100 = 100%) + + // Generational mode state + last_atomic: usize, // Objects traversed in last atomic (0 = good collection) + gc_estimate: usize, // Estimate of memory in use after major collection + + // Generation boundaries (indices into allgc-style tracking) + // In our design, we track these via object ages in headers + // These are used for the minor collection optimization + young_list: Vec, // NEW and SURVIVAL objects + touched_list: Vec, // TOUCHED1 and TOUCHED2 old objects // Collection throttling check_counter: u32, @@ -59,22 +150,71 @@ pub struct GCStats { impl GC { pub fn new() -> Self { GC { - gc_debt: -(200 * 1024), // Start with 200KB credit + gray: Vec::with_capacity(256), + grayagain: Vec::with_capacity(64), + // Start with negative debt like Lua + // gc_debt < 0 means "credit" before next collection + gc_debt: -(8 * 1024), // 8KB credit before first GC total_bytes: 0, - gc_pause: 200, + state: GcState::Pause, + current_white: 0, + gc_kind: GcKind::Generational, // Default to generational mode like Lua 5.4 + sweep_index: 0, + propagate_work: 0, + gc_pause: 200, // Like Lua: 200 = wait until memory doubles + gen_minor_mul: 25, // Minor GC when memory grows 25% + gen_major_mul: 100, // Major GC when memory grows 100% since last major + last_atomic: 0, + gc_estimate: 0, + young_list: Vec::with_capacity(1024), + touched_list: Vec::with_capacity(256), check_counter: 0, - check_interval: 10000, + check_interval: 1, stats: GCStats::default(), } } + + /// Create GC in incremental mode (for compatibility/testing) 
+ pub fn new_incremental() -> Self { + let mut gc = Self::new(); + gc.gc_kind = GcKind::Incremental; + gc + } + + /// Get current GC mode + #[inline] + pub fn gc_kind(&self) -> GcKind { + self.gc_kind + } + + /// Set GC mode + pub fn set_gc_kind(&mut self, kind: GcKind) { + self.gc_kind = kind; + } - /// Record allocation - just update debt, no HashMap insertion! + /// Register a new object for GC tracking + /// In generational mode, new objects are added to young_list + #[inline(always)] + pub fn track_object(&mut self, gc_id: GcId, size: usize) { + self.total_bytes += size; + self.gc_debt += size as isize; + + // In generational mode, track new objects in young_list + if self.gc_kind == GcKind::Generational { + self.young_list.push(gc_id); + } + } + + /// Record allocation - compatibility with old API #[inline(always)] pub fn register_object(&mut self, _obj_id: u32, obj_type: GcObjectType) { let size = match obj_type { GcObjectType::String => 64, GcObjectType::Table => 256, GcObjectType::Function => 128, + GcObjectType::Upvalue => 64, + GcObjectType::Thread => 512, + GcObjectType::Userdata => 32, }; self.total_bytes += size; self.gc_debt += size as isize; @@ -115,67 +255,855 @@ impl GC { self.check_counter >= self.check_interval } - /// Perform GC step + /// Perform GC step - like Lua's luaC_step + /// Dispatches to incremental or generational mode based on gc_kind pub fn step(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) { - if !self.should_collect() { + // Like Lua: run GC when debt > 0 + if self.gc_debt <= 0 { return; } - self.check_counter = 0; - self.collect(roots, pool); + match self.gc_kind { + GcKind::Generational => self.gen_step(roots, pool), + GcKind::Incremental => self.inc_step(roots, pool), + } + } + + /// Generational GC step - like Lua's genstep + fn gen_step(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) { + if self.last_atomic != 0 { + // Last collection was bad, do a full step + self.step_gen_full(roots, pool); + } else { + // 
Check if we need a major collection + let major_base = self.gc_estimate; + let major_inc = (major_base / 100) * self.gen_major_mul; + + if self.gc_debt > 0 && self.total_bytes > major_base + major_inc { + // Memory grew too much, do a major collection + let num_objs = self.full_gen(roots, pool); + + if self.total_bytes < major_base + (major_inc / 2) { + // Good collection - collected at least half of growth + self.last_atomic = 0; + } else { + // Bad collection + self.last_atomic = num_objs; + self.set_pause(); + } + } else { + // Regular case: do a minor collection + self.young_collection(roots, pool); + self.set_minor_debt(); + } + } + } + + /// Incremental GC step - original incremental mode + fn inc_step(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) { + const WORK_PER_STEP: usize = 4096; + let mut work = 0; + + // State machine for incremental GC + loop { + match self.state { + GcState::Pause => { + // Start new cycle: mark roots and transition to propagate + self.start_cycle(roots, pool); + self.state = GcState::Propagate; + work += 100; // Small fixed cost + } + + GcState::Propagate => { + // Incremental marking: process some gray objects + let marked = self.propagate_step(pool, WORK_PER_STEP - work); + work += marked; + + if self.gray.is_empty() && self.grayagain.is_empty() { + // All marking done, go to atomic phase + self.state = GcState::Atomic; + } + } + + GcState::Atomic => { + // Atomic phase - must finish marking (like Lua's atomic) + // Process any grayagain objects + while let Some(gc_id) = self.grayagain.pop() { + self.mark_one(gc_id, pool); + } + // Start sweep + self.sweep_index = 0; + self.state = GcState::Sweep; + work += 50; + } + + GcState::Sweep => { + // Complete sweep in one step (pools are iterated directly) + let swept = self.sweep_step(pool, WORK_PER_STEP - work); + work += swept; + // sweep_step handles state transition and finish_cycle + break; + } + } + + // Check if we've done enough work for this step + if work >= 
WORK_PER_STEP { + break; + } + } + + // Reduce debt by work done (convert work to "bytes paid off") + self.gc_debt -= (work as isize) * 2; + } + + /// Start a new GC cycle - mark roots and build gray list + fn start_cycle(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) { + self.stats.collection_count += 1; + self.gray.clear(); + self.grayagain.clear(); + self.propagate_work = 0; + + // Make all objects white by iterating pools directly + for (_id, table) in pool.tables.iter_mut() { + if !table.header.is_fixed() { + table.header.make_white(self.current_white); + } + } + for (_id, func) in pool.functions.iter_mut() { + if !func.header.is_fixed() { + func.header.make_white(self.current_white); + } + } + for (_id, upval) in pool.upvalues.iter_mut() { + if !upval.header.is_fixed() { + upval.header.make_white(self.current_white); + } + } + for (_id, thread) in pool.threads.iter_mut() { + if !thread.header.is_fixed() { + thread.header.make_white(self.current_white); + } + } + for (_id, string) in pool.strings.iter_mut() { + if !string.header.is_fixed() { + string.header.make_white(self.current_white); + } + } + + // Mark roots and add to gray list + for value in roots { + self.mark_value(value, pool); + } + } + + /// Make an object white (for start of cycle) + #[inline] + fn make_white(&self, gc_id: GcId, pool: &mut ObjectPool) { + match gc_id.gc_type() { + GcObjectType::Table => { + if let Some(t) = pool.tables.get_mut(gc_id.index()) { + if !t.header.is_fixed() { + t.header.make_white(self.current_white); + } + } + } + GcObjectType::Function => { + if let Some(f) = pool.functions.get_mut(gc_id.index()) { + if !f.header.is_fixed() { + f.header.make_white(self.current_white); + } + } + } + GcObjectType::Upvalue => { + if let Some(u) = pool.upvalues.get_mut(gc_id.index()) { + if !u.header.is_fixed() { + u.header.make_white(self.current_white); + } + } + } + GcObjectType::Thread => { + if let Some(t) = pool.threads.get_mut(gc_id.index()) { + if !t.header.is_fixed() { + 
t.header.make_white(self.current_white); + } + } + } + GcObjectType::String => { + if let Some(s) = pool.strings.get_mut(gc_id.index()) { + if !s.header.is_fixed() { + s.header.make_white(self.current_white); + } + } + } + GcObjectType::Userdata => {} + } + } + + /// Mark a value and add to gray list if needed + fn mark_value(&mut self, value: &LuaValue, pool: &mut ObjectPool) { + match value.kind() { + LuaValueKind::Table => { + if let Some(id) = value.as_table_id() { + if let Some(t) = pool.tables.get_mut(id.0) { + if t.header.is_white() { + t.header.make_gray(); + self.gray.push(GcId::TableId(id)); + } + } + } + } + LuaValueKind::Function => { + if let Some(id) = value.as_function_id() { + if let Some(f) = pool.functions.get_mut(id.0) { + if f.header.is_white() { + f.header.make_gray(); + self.gray.push(GcId::FunctionId(id)); + } + } + } + } + LuaValueKind::Thread => { + if let Some(id) = value.as_thread_id() { + if let Some(t) = pool.threads.get_mut(id.0) { + if t.header.is_white() { + t.header.make_gray(); + self.gray.push(GcId::ThreadId(id)); + } + } + } + } + LuaValueKind::String => { + if let Some(id) = value.as_string_id() { + if let Some(s) = pool.strings.get_mut(id.0) { + // Strings are leaves - mark black directly + s.header.make_black(); + } + } + } + _ => {} + } + } + + /// Do one step of propagation - process some gray objects + fn propagate_step(&mut self, pool: &mut ObjectPool, max_work: usize) -> usize { + let mut work = 0; + + while work < max_work { + if let Some(gc_id) = self.gray.pop() { + work += self.mark_one(gc_id, pool); + } else { + break; + } + } + + work + } + + /// Mark one gray object and its references + fn mark_one(&mut self, gc_id: GcId, pool: &mut ObjectPool) -> usize { + let mut work = 1; + + match gc_id.gc_type() { + GcObjectType::Table => { + if let Some(table) = pool.tables.get_mut(gc_id.index()) { + if table.header.is_gray() { + table.header.make_black(); + work += table.data.len(); + + // Collect references to mark + let 
refs: Vec = table + .data + .iter_all() + .into_iter() + .flat_map(|(k, v)| [k, v]) + .collect(); + let mt = table.data.get_metatable(); + + // Mark references + for v in refs { + self.mark_value(&v, pool); + } + if let Some(mt) = mt { + self.mark_value(&mt, pool); + } + } + } + } + GcObjectType::Function => { + if let Some(func) = pool.functions.get(gc_id.index()) { + let upvalue_ids = func.upvalues.clone(); + let constants = func.chunk.constants.clone(); + + if let Some(f) = pool.functions.get_mut(gc_id.index()) { + if f.header.is_gray() { + f.header.make_black(); + work += upvalue_ids.len() + constants.len(); + } + } + + // Mark upvalues + for upval_id in upvalue_ids { + if let Some(upval) = pool.upvalues.get_mut(upval_id.0) { + if upval.header.is_white() { + upval.header.make_gray(); + self.gray.push(GcId::UpvalueId(upval_id)); + } + } + } + + // Mark constants + for c in constants { + self.mark_value(&c, pool); + } + } + } + GcObjectType::Upvalue => { + if let Some(upval) = pool.upvalues.get_mut(gc_id.index()) { + if upval.header.is_gray() { + upval.header.make_black(); + if let UpvalueState::Closed(v) = upval.state { + self.mark_value(&v, pool); + } + } + } + } + GcObjectType::Thread => { + if let Some(thread) = pool.threads.get(gc_id.index()) { + let stack = thread.data.register_stack.clone(); + + if let Some(t) = pool.threads.get_mut(gc_id.index()) { + if t.header.is_gray() { + t.header.make_black(); + work += stack.len(); + } + } + + for v in stack { + self.mark_value(&v, pool); + } + } + } + GcObjectType::String => { + // Strings are leaves, just make black + if let Some(s) = pool.strings.get_mut(gc_id.index()) { + s.header.make_black(); + } + } + GcObjectType::Userdata => {} + } + + work + } + + /// Do one step of sweeping - sweep all pools in one step + /// This is acceptable because sweep is much faster than marking + fn sweep_step(&mut self, pool: &mut ObjectPool, _max_work: usize) -> usize { + // Sweep all pools in one step (much faster than 
incremental) + let collected = self.sweep_pools(pool); + self.stats.objects_collected += collected; + + // Sweeping done - transition to finished + self.state = GcState::Pause; + self.finish_cycle(); + + collected + } + + /// Sweep all pools directly + fn sweep_pools(&mut self, pool: &mut ObjectPool) -> usize { + let mut collected = 0; + + // Sweep tables + let mut dead_tables: Vec = Vec::with_capacity(64); + for (id, table) in pool.tables.iter() { + if !table.header.is_fixed() && table.header.is_white() { + dead_tables.push(id); + } + } + for id in dead_tables { + pool.tables.free(id); + self.record_deallocation(256); + collected += 1; + } + + // Sweep functions + let mut dead_funcs: Vec = Vec::with_capacity(64); + for (id, func) in pool.functions.iter() { + if !func.header.is_fixed() && func.header.is_white() { + dead_funcs.push(id); + } + } + for id in dead_funcs { + pool.functions.free(id); + self.record_deallocation(128); + collected += 1; + } + + // Sweep upvalues + let mut dead_upvals: Vec = Vec::with_capacity(64); + for (id, upval) in pool.upvalues.iter() { + if !upval.header.is_fixed() && upval.header.is_white() { + dead_upvals.push(id); + } + } + for id in dead_upvals { + pool.upvalues.free(id); + self.record_deallocation(64); + collected += 1; + } + + // Sweep strings + let mut dead_strings: Vec = Vec::with_capacity(64); + for (id, string) in pool.strings.iter() { + if !string.header.is_fixed() && string.header.is_white() { + dead_strings.push(id); + } + } + for id in dead_strings { + pool.strings.free(id); + self.record_deallocation(64); + collected += 1; + } + + // Sweep threads + let mut dead_threads: Vec = Vec::with_capacity(8); + for (id, thread) in pool.threads.iter() { + if !thread.header.is_fixed() && thread.header.is_white() { + dead_threads.push(id); + } + } + for id in dead_threads { + pool.threads.free(id); + self.record_deallocation(512); + collected += 1; + } + + collected } - /// Main collection - mark and sweep directly on Arena + /// 
Finish the GC cycle + fn finish_cycle(&mut self) { + // Flip white bit for next cycle + self.current_white ^= 1; + + // Set debt based on memory and pause factor + let estimate = self.total_bytes; + let threshold = (estimate as isize * self.gc_pause as isize) / 100; + self.gc_debt = self.total_bytes as isize - threshold; + } + + // ============ Generational GC Methods ============ + + /// Set debt for next minor collection + /// Minor GC happens when memory grows by gen_minor_mul% + fn set_minor_debt(&mut self) { + let debt = -((self.total_bytes / 100) as isize * self.gen_minor_mul as isize); + self.gc_debt = debt; + } + + /// Set pause for major collection (like Lua's setpause) + fn set_pause(&mut self) { + let estimate = self.gc_estimate.max(self.total_bytes); + let threshold = (estimate as isize * self.gc_pause as isize) / 100; + self.gc_debt = self.total_bytes as isize - threshold; + } + + /// Minor collection - only collect young generation + /// Like Lua's youngcollection + fn young_collection(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) { + self.stats.collection_count += 1; + self.stats.minor_collections += 1; + + // Clear gray lists + self.gray.clear(); + self.grayagain.clear(); + + // Mark roots + for value in roots { + self.mark_value(value, pool); + } + + // Mark touched old objects (they may point to young objects) + for gc_id in std::mem::take(&mut self.touched_list) { + self.mark_object_gen(gc_id, pool); + } + + // Propagate marks + while let Some(gc_id) = self.gray.pop() { + self.mark_one(gc_id, pool); + } + + // Process grayagain + while let Some(gc_id) = self.grayagain.pop() { + self.mark_one(gc_id, pool); + } + + // Sweep young objects and age them + let collected = self.sweep_young(pool); + self.stats.objects_collected += collected; + + // Flip white for next cycle + self.current_white ^= 1; + } + + /// Mark an object for generational GC + fn mark_object_gen(&mut self, gc_id: GcId, pool: &mut ObjectPool) { + match gc_id.gc_type() { + 
GcObjectType::Table => { + if let Some(t) = pool.tables.get_mut(gc_id.index()) { + if t.header.is_white() { + t.header.make_gray(); + self.gray.push(gc_id); + } + } + } + GcObjectType::Function => { + if let Some(f) = pool.functions.get_mut(gc_id.index()) { + if f.header.is_white() { + f.header.make_gray(); + self.gray.push(gc_id); + } + } + } + GcObjectType::Upvalue => { + if let Some(u) = pool.upvalues.get_mut(gc_id.index()) { + if u.header.is_white() { + u.header.make_gray(); + self.gray.push(gc_id); + } + } + } + GcObjectType::Thread => { + if let Some(t) = pool.threads.get_mut(gc_id.index()) { + if t.header.is_white() { + t.header.make_gray(); + self.gray.push(gc_id); + } + } + } + GcObjectType::String => { + if let Some(s) = pool.strings.get_mut(gc_id.index()) { + s.header.make_black(); + } + } + GcObjectType::Userdata => {} + } + } + + /// Sweep young objects: delete dead, age survivors + fn sweep_young(&mut self, pool: &mut ObjectPool) -> usize { + let mut collected = 0; + let mut new_young = Vec::with_capacity(self.young_list.len()); + + for gc_id in std::mem::take(&mut self.young_list) { + let (is_alive, age) = self.get_object_age(gc_id, pool); + + if !is_alive { + // Dead object - free it + self.free_object(gc_id, pool); + collected += 1; + } else { + // Alive - advance age + let new_age = match age { + G_NEW => G_SURVIVAL, + G_SURVIVAL => G_OLD1, + _ => age, + }; + + self.set_object_age(gc_id, new_age, pool); + + // Keep in young list if still young, otherwise it graduates + if new_age <= G_SURVIVAL { + new_young.push(gc_id); + } else { + self.stats.promoted_objects += 1; + } + + // Make white for next cycle + self.make_white(gc_id, pool); + } + } + + self.young_list = new_young; + collected + } + + /// Get object's age + fn get_object_age(&self, gc_id: GcId, pool: &ObjectPool) -> (bool, u8) { + match gc_id.gc_type() { + GcObjectType::Table => { + if let Some(t) = pool.tables.get(gc_id.index()) { + (!t.header.is_white(), t.header.age()) + } else { + 
(false, G_NEW) + } + } + GcObjectType::Function => { + if let Some(f) = pool.functions.get(gc_id.index()) { + (!f.header.is_white(), f.header.age()) + } else { + (false, G_NEW) + } + } + GcObjectType::Upvalue => { + if let Some(u) = pool.upvalues.get(gc_id.index()) { + (!u.header.is_white(), u.header.age()) + } else { + (false, G_NEW) + } + } + GcObjectType::Thread => { + if let Some(t) = pool.threads.get(gc_id.index()) { + (!t.header.is_white(), t.header.age()) + } else { + (false, G_NEW) + } + } + GcObjectType::String => { + if let Some(s) = pool.strings.get(gc_id.index()) { + (!s.header.is_white(), s.header.age()) + } else { + (false, G_NEW) + } + } + GcObjectType::Userdata => (true, G_OLD), + } + } + + /// Set object's age + fn set_object_age(&self, gc_id: GcId, age: u8, pool: &mut ObjectPool) { + match gc_id.gc_type() { + GcObjectType::Table => { + if let Some(t) = pool.tables.get_mut(gc_id.index()) { + t.header.set_age(age); + } + } + GcObjectType::Function => { + if let Some(f) = pool.functions.get_mut(gc_id.index()) { + f.header.set_age(age); + } + } + GcObjectType::Upvalue => { + if let Some(u) = pool.upvalues.get_mut(gc_id.index()) { + u.header.set_age(age); + } + } + GcObjectType::Thread => { + if let Some(t) = pool.threads.get_mut(gc_id.index()) { + t.header.set_age(age); + } + } + GcObjectType::String => { + if let Some(s) = pool.strings.get_mut(gc_id.index()) { + s.header.set_age(age); + } + } + GcObjectType::Userdata => {} + } + } + + /// Full generational collection - like Lua's fullgen + fn full_gen(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) -> usize { + self.stats.major_collections += 1; + + // Do a full mark-sweep + self.clear_marks(pool); + self.mark_roots(roots, pool); + let collected = self.sweep(pool); + + // Reset generational state + self.gc_estimate = self.total_bytes; + self.young_list.clear(); + self.touched_list.clear(); + + // Make all surviving objects old + self.make_all_old(pool); + + self.stats.objects_collected += 
collected; + collected + } + + /// Make all surviving objects old (for entering generational mode) + fn make_all_old(&self, pool: &mut ObjectPool) { + for (_id, t) in pool.tables.iter_mut() { + if !t.header.is_fixed() { + t.header.set_age(G_OLD); + t.header.make_black(); + } + } + for (_id, f) in pool.functions.iter_mut() { + if !f.header.is_fixed() { + f.header.set_age(G_OLD); + f.header.make_black(); + } + } + for (_id, u) in pool.upvalues.iter_mut() { + if !u.header.is_fixed() { + u.header.set_age(G_OLD); + u.header.make_black(); + } + } + for (_id, t) in pool.threads.iter_mut() { + if !t.header.is_fixed() { + t.header.set_age(G_OLD); + t.header.make_black(); + } + } + for (_id, s) in pool.strings.iter_mut() { + if !s.header.is_fixed() { + s.header.set_age(G_OLD); + s.header.make_black(); + } + } + } + + /// Handle bad collection - step through full GC + fn step_gen_full(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) { + let last_atomic = self.last_atomic; + + // Do a full collection + let new_atomic = self.full_gen(roots, pool); + + // Check if this was a good collection + if new_atomic < last_atomic + (last_atomic / 8) { + // Good - return to generational mode + self.last_atomic = 0; + self.set_minor_debt(); + } else { + // Still bad + self.last_atomic = new_atomic; + self.set_pause(); + } + } + + // ============ Write Barriers for Generational GC ============ + + /// Forward barrier: when black object 'from' points to white object 'to' + /// Mark 'to' and possibly make it old + pub fn barrier_forward_gen(&mut self, from_id: GcId, to_id: GcId, pool: &mut ObjectPool) { + if self.gc_kind != GcKind::Generational { + return; + } + + // Check if 'from' is old + let from_is_old = self.is_object_old(from_id, pool); + + if from_is_old { + // Mark the target object and make it OLD0 + // This ensures it won't be collected and will age properly + self.mark_object_gen(to_id, pool); + self.set_object_age(to_id, G_OLD0, pool); + } + } + + /// Back barrier: when old 
object 'obj' is modified to point to young object + /// Mark 'obj' as touched so it will be revisited in minor collection + pub fn barrier_back_gen(&mut self, obj_id: GcId, pool: &mut ObjectPool) { + if self.gc_kind != GcKind::Generational { + return; + } + + let age = match obj_id.gc_type() { + GcObjectType::Table => pool.tables.get(obj_id.index()).map(|t| t.header.age()), + GcObjectType::Function => pool.functions.get(obj_id.index()).map(|f| f.header.age()), + GcObjectType::Thread => pool.threads.get(obj_id.index()).map(|t| t.header.age()), + _ => None, + }; + + if let Some(age) = age { + if age >= G_OLD0 && age != G_TOUCHED1 { + // Mark as touched and add to touched list + self.set_object_age(obj_id, G_TOUCHED1, pool); + self.touched_list.push(obj_id); + } + } + } + + /// Check if an object is old + fn is_object_old(&self, gc_id: GcId, pool: &ObjectPool) -> bool { + match gc_id.gc_type() { + GcObjectType::Table => { + pool.tables.get(gc_id.index()).map(|t| t.header.age() >= G_OLD0).unwrap_or(false) + } + GcObjectType::Function => { + pool.functions.get(gc_id.index()).map(|f| f.header.age() >= G_OLD0).unwrap_or(false) + } + GcObjectType::Upvalue => { + pool.upvalues.get(gc_id.index()).map(|u| u.header.age() >= G_OLD0).unwrap_or(false) + } + GcObjectType::Thread => { + pool.threads.get(gc_id.index()).map(|t| t.header.age() >= G_OLD0).unwrap_or(false) + } + GcObjectType::String => { + pool.strings.get(gc_id.index()).map(|s| s.header.age() >= G_OLD0).unwrap_or(false) + } + GcObjectType::Userdata => true, + } + } + + /// Main collection - mark and sweep using allgc list + /// Like Lua's full GC cycle pub fn collect(&mut self, roots: &[LuaValue], pool: &mut ObjectPool) -> usize { self.stats.collection_count += 1; self.stats.major_collections += 1; - // Phase 1: Clear all marks + // Phase 1: Clear all marks (only for tracked objects) self.clear_marks(pool); // Phase 2: Mark from roots self.mark_roots(roots, pool); - // Phase 3: Sweep (free unmarked objects) + // Phase 
3: Sweep (only traverse allgc, not entire pools!) let collected = self.sweep(pool); - // Update debt - let alive_estimate = - pool.tables.len() * 256 + pool.functions.len() * 128 + pool.strings.len() * 64; - self.gc_debt = -((alive_estimate * self.gc_pause / 100) as isize); + // Like Lua's setpause: set debt based on memory and pause factor + // gc_pause = 200 means wait until memory doubles (200% of current) + // debt = current_memory - (estimate * pause / 100) + // Since estimate ≈ current_memory after GC, debt becomes negative + let estimate = self.total_bytes; + let threshold = (estimate as isize * self.gc_pause as isize) / 100; + self.gc_debt = self.total_bytes as isize - threshold; self.stats.objects_collected += collected; collected } - /// Clear all marks in all arenas (skip fixed objects - they stay marked) + /// Clear marks by iterating pools directly (no allgc needed) fn clear_marks(&self, pool: &mut ObjectPool) { - for (_, table) in pool.tables.iter_mut() { - if !table.header.fixed { - table.header.marked = false; + // Clear tables + for (_id, table) in pool.tables.iter_mut() { + if !table.header.is_fixed() { + table.header.make_white(0); } } - for (_, func) in pool.functions.iter_mut() { - if !func.header.fixed { - func.header.marked = false; + + // Clear functions + for (_id, func) in pool.functions.iter_mut() { + if !func.header.is_fixed() { + func.header.make_white(0); } } - for (_, upval) in pool.upvalues.iter_mut() { - if !upval.header.fixed { - upval.header.marked = false; + + // Clear upvalues + for (_id, upval) in pool.upvalues.iter_mut() { + if !upval.header.is_fixed() { + upval.header.make_white(0); } } - for (_, thread) in pool.threads.iter_mut() { - if !thread.header.fixed { - thread.header.marked = false; + + // Clear threads + for (_id, thread) in pool.threads.iter_mut() { + if !thread.header.is_fixed() { + thread.header.make_white(0); } } - for (_, string) in pool.strings.iter_mut() { - if !string.header.fixed { - string.header.marked = 
false; + + // Clear strings (but leave interned strings fixed) + for (_id, string) in pool.strings.iter_mut() { + if !string.header.is_fixed() { + string.header.make_white(0); } } - // Note: userdata uses Rc internally, no GcHeader } /// Mark phase - traverse from roots @@ -188,8 +1116,8 @@ impl GC { crate::lua_value::LuaValueKind::Table => { if let Some(id) = value.as_table_id() { if let Some(table) = pool.tables.get_mut(id.0) { - if !table.header.marked { - table.header.marked = true; + if table.header.is_white() { + table.header.make_black(); // Add table contents to worklist for (k, v) in table.data.iter_all() { worklist.push(k); @@ -207,7 +1135,7 @@ impl GC { // First, collect data we need without holding mutable borrow let (should_mark, upvalue_ids, constants) = { if let Some(func) = pool.functions.get(id.0) { - if !func.header.marked { + if func.header.is_white() { (true, func.upvalues.clone(), func.chunk.constants.clone()) } else { (false, vec![], vec![]) @@ -220,14 +1148,14 @@ impl GC { if should_mark { // Now we can safely mark if let Some(func) = pool.functions.get_mut(id.0) { - func.header.marked = true; + func.header.make_black(); } // Mark upvalues separately for upval_id in upvalue_ids { if let Some(upval) = pool.upvalues.get_mut(upval_id.0) { - if !upval.header.marked { - upval.header.marked = true; + if upval.header.is_white() { + upval.header.make_black(); if let UpvalueState::Closed(v) = &upval.state { worklist.push(*v); } @@ -245,7 +1173,7 @@ impl GC { // Collect stack values first let stack_values = { if let Some(thread) = pool.threads.get(id.0) { - if !thread.header.marked { + if thread.header.is_white() { Some(thread.data.register_stack.clone()) } else { None @@ -257,7 +1185,7 @@ impl GC { if let Some(values) = stack_values { if let Some(thread) = pool.threads.get_mut(id.0) { - thread.header.marked = true; + thread.header.make_black(); } worklist.extend(values); } @@ -270,7 +1198,7 @@ impl GC { // Mark strings (they can be collected if not 
fixed) if let Some(id) = value.as_string_id() { if let Some(string) = pool.strings.get_mut(id.0) { - string.header.marked = true; + string.header.make_black(); } } } @@ -279,79 +1207,151 @@ impl GC { } } - /// Sweep phase - free unmarked objects (skip fixed objects) + /// Sweep phase - iterate pools directly instead of allgc + /// This is much faster for allocation (no allgc.push) at cost of sweep traversal fn sweep(&mut self, pool: &mut ObjectPool) -> usize { let mut collected = 0; - // Collect unmarked tables (skip fixed ones) - let tables_to_free: Vec = pool - .tables - .iter() - .filter(|(_, t)| !t.header.marked && !t.header.fixed) - .map(|(id, _)| id) - .collect(); - for id in tables_to_free { + // Sweep tables + let mut dead_tables: Vec = Vec::with_capacity(64); + for (id, table) in pool.tables.iter() { + if !table.header.is_fixed() && table.header.is_white() { + dead_tables.push(id); + } + } + for id in dead_tables { pool.tables.free(id); - collected += 1; self.record_deallocation(256); + collected += 1; } - // Collect unmarked functions (skip fixed ones) - let funcs_to_free: Vec = pool - .functions - .iter() - .filter(|(_, f)| !f.header.marked && !f.header.fixed) - .map(|(id, _)| id) - .collect(); - for id in funcs_to_free { + // Sweep functions + let mut dead_funcs: Vec = Vec::with_capacity(64); + for (id, func) in pool.functions.iter() { + if !func.header.is_fixed() && func.header.is_white() { + dead_funcs.push(id); + } + } + for id in dead_funcs { pool.functions.free(id); - collected += 1; self.record_deallocation(128); + collected += 1; } - // Collect unmarked upvalues (skip fixed ones) - let upvals_to_free: Vec = pool - .upvalues - .iter() - .filter(|(_, u)| !u.header.marked && !u.header.fixed) - .map(|(id, _)| id) - .collect(); - for id in upvals_to_free { + // Sweep upvalues + let mut dead_upvals: Vec = Vec::with_capacity(64); + for (id, upval) in pool.upvalues.iter() { + if !upval.header.is_fixed() && upval.header.is_white() { + 
dead_upvals.push(id); + } + } + for id in dead_upvals { pool.upvalues.free(id); + self.record_deallocation(64); collected += 1; } - // Collect unmarked threads (skip fixed ones) - let threads_to_free: Vec = pool - .threads - .iter() - .filter(|(_, t)| !t.header.marked && !t.header.fixed) - .map(|(id, _)| id) - .collect(); - for id in threads_to_free { - pool.threads.free(id); - collected += 1; + // Sweep strings - but leave interned strings (short strings are usually fixed) + let mut dead_strings: Vec = Vec::with_capacity(64); + for (id, string) in pool.strings.iter() { + if !string.header.is_fixed() && string.header.is_white() { + dead_strings.push(id); + } } - - // Collect unmarked strings (skip fixed ones) - // Note: interned strings are usually kept, but this handles non-interned long strings - let strings_to_free: Vec = pool - .strings - .iter() - .filter(|(_, s)| !s.header.marked && !s.header.fixed) - .map(|(id, _)| id) - .collect(); - for id in strings_to_free { + for id in dead_strings { pool.strings.free(id); - collected += 1; self.record_deallocation(64); + collected += 1; } - // Note: userdata uses Rc internally, no sweep needed + // Sweep threads + let mut dead_threads: Vec = Vec::with_capacity(8); + for (id, thread) in pool.threads.iter() { + if !thread.header.is_fixed() && thread.header.is_white() { + dead_threads.push(id); + } + } + for id in dead_threads { + pool.threads.free(id); + self.record_deallocation(512); + collected += 1; + } collected } + /// Get marked (not white) and fixed state for an object + #[allow(unused)] + #[inline] + fn get_object_state(&self, gc_id: GcId, pool: &ObjectPool) -> (bool, bool) { + match gc_id.gc_type() { + GcObjectType::Table => { + if let Some(t) = pool.tables.get(gc_id.index()) { + (!t.header.is_white(), t.header.is_fixed()) + } else { + (false, false) + } + } + GcObjectType::Function => { + if let Some(f) = pool.functions.get(gc_id.index()) { + (!f.header.is_white(), f.header.is_fixed()) + } else { + (false, 
false) + } + } + GcObjectType::Upvalue => { + if let Some(u) = pool.upvalues.get(gc_id.index()) { + (!u.header.is_white(), u.header.is_fixed()) + } else { + (false, false) + } + } + GcObjectType::Thread => { + if let Some(t) = pool.threads.get(gc_id.index()) { + (!t.header.is_white(), t.header.is_fixed()) + } else { + (false, false) + } + } + GcObjectType::String => { + if let Some(s) = pool.strings.get(gc_id.index()) { + (!s.header.is_white(), s.header.is_fixed()) + } else { + (false, false) + } + } + GcObjectType::Userdata => (true, true), // Userdata uses Rc, always "alive" + } + } + + /// Free an object from its pool + #[inline] + fn free_object(&mut self, gc_id: GcId, pool: &mut ObjectPool) { + match gc_id.gc_type() { + GcObjectType::Table => { + pool.tables.free(gc_id.index()); + self.record_deallocation(256); + } + GcObjectType::Function => { + pool.functions.free(gc_id.index()); + self.record_deallocation(128); + } + GcObjectType::Upvalue => { + pool.upvalues.free(gc_id.index()); + self.record_deallocation(64); + } + GcObjectType::Thread => { + pool.threads.free(gc_id.index()); + self.record_deallocation(512); + } + GcObjectType::String => { + pool.strings.free(gc_id.index()); + self.record_deallocation(64); + } + GcObjectType::Userdata => {} // Rc handles this + } + } + /// Write barrier - no-op in simple mark-sweep #[inline(always)] pub fn barrier_forward(&mut self, _obj_type: GcObjectType, _obj_id: u32) { @@ -373,6 +1373,9 @@ impl GC { GcObjectType::String => 64, GcObjectType::Table => 256, GcObjectType::Function => 128, + GcObjectType::Upvalue => 64, + GcObjectType::Thread => 512, + GcObjectType::Userdata => 32, }; self.record_deallocation(size); } diff --git a/crates/luars/src/gc/object_pool.rs b/crates/luars/src/gc/object_pool.rs index b89ee179..58de4226 100644 --- a/crates/luars/src/gc/object_pool.rs +++ b/crates/luars/src/gc/object_pool.rs @@ -9,147 +9,12 @@ // 6. 
GC headers embedded in objects for mark-sweep use crate::lua_value::{Chunk, LuaThread, LuaUserdata}; -use crate::{LuaString, LuaTable, LuaValue}; -use std::hash::Hash; +use crate::{ + FunctionId, GcFunction, GcHeader, GcString, GcTable, GcThread, GcUpvalue, LuaString, LuaTable, + LuaValue, StringId, TableId, ThreadId, UpvalueId, UpvalueState, UserdataId, +}; use std::rc::Rc; -// ============ GC Header ============ - -/// GC object header - embedded in every GC-managed object -/// Based on Lua 5.4's CommonHeader design -/// Kept minimal to reduce memory overhead -#[derive(Clone, Copy, Default)] -#[repr(C)] -pub struct GcHeader { - pub marked: bool, - pub age: u8, // For generational GC (like Lua's G_NEW, G_SURVIVAL, G_OLD, etc.) - pub fixed: bool, // If true, object is never collected (like Lua's fixedgc list) -} - -// ============ Object IDs ============ -// All IDs are simple u32 indices - compact and efficient - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] -#[repr(transparent)] -pub struct StringId(pub u32); - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] -#[repr(transparent)] -pub struct TableId(pub u32); - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] -#[repr(transparent)] -pub struct FunctionId(pub u32); - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] -#[repr(transparent)] -pub struct UpvalueId(pub u32); - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] -#[repr(transparent)] -pub struct UserdataId(pub u32); - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] -#[repr(transparent)] -pub struct ThreadId(pub u32); - -// ============ GC-managed Objects ============ - -/// Table with embedded GC header -pub struct GcTable { - pub header: GcHeader, - pub data: LuaTable, -} - -/// Lua function with embedded GC header -pub struct GcFunction { - pub header: GcHeader, - pub chunk: Rc, - pub upvalues: Vec, // Upvalue IDs, not Rc -} - -/// Upvalue state - uses absolute stack index like Lua 
C implementation -#[derive(Debug, Clone)] -pub enum UpvalueState { - Open { stack_index: usize }, - Closed(LuaValue), -} - -/// Upvalue with embedded GC header -pub struct GcUpvalue { - pub header: GcHeader, - pub state: UpvalueState, -} - -impl GcUpvalue { - /// Check if this upvalue points to the given absolute stack index - #[inline] - pub fn points_to_index(&self, index: usize) -> bool { - matches!(&self.state, UpvalueState::Open { stack_index } if *stack_index == index) - } - - /// Check if this upvalue is open (still points to stack) - #[inline] - pub fn is_open(&self) -> bool { - matches!(&self.state, UpvalueState::Open { .. }) - } - - /// Close this upvalue with the given value - #[inline] - pub fn close(&mut self, value: LuaValue) { - self.state = UpvalueState::Closed(value); - } - - /// Get the value of a closed upvalue (returns None if still open) - #[inline] - pub fn get_closed_value(&self) -> Option { - match &self.state { - UpvalueState::Closed(v) => Some(v.clone()), - _ => None, - } - } - - /// Get the absolute stack index if this upvalue is open - #[inline] - pub fn get_stack_index(&self) -> Option { - match &self.state { - UpvalueState::Open { stack_index } => Some(*stack_index), - _ => None, - } - } - - /// Set closed upvalue value directly without checking state - /// SAFETY: Must only be called when upvalue is in Closed state - #[inline(always)] - pub unsafe fn set_closed_value_unchecked(&mut self, value: LuaValue) { - if let UpvalueState::Closed(ref mut v) = self.state { - *v = value; - } - } - - /// Get closed value reference directly without Option - /// SAFETY: Must only be called when upvalue is in Closed state - #[inline(always)] - pub unsafe fn get_closed_value_ref_unchecked(&self) -> &LuaValue { - match &self.state { - UpvalueState::Closed(v) => v, - _ => unsafe { std::hint::unreachable_unchecked() }, - } - } -} - -/// String with embedded GC header -pub struct GcString { - pub header: GcHeader, - pub data: LuaString, -} - -/// Thread 
(coroutine) with embedded GC header -pub struct GcThread { - pub header: GcHeader, - pub data: LuaThread, -} - // ============ Pool Storage ============ /// Simple Vec-based pool for small objects @@ -244,6 +109,24 @@ impl Pool { } } + /// Get number of free slots in the free list + #[inline] + pub fn free_slots_count(&self) -> usize { + self.free_list.len() + } + + /// Trim trailing None values from the pool to reduce iteration overhead + /// This removes None values from the end of the data vec + pub fn trim_tail(&mut self) { + // Remove trailing None values + while self.data.last().map_or(false, |v| v.is_none()) { + self.data.pop(); + } + // Remove free list entries that are now out of bounds + let max_valid = self.data.len() as u32; + self.free_list.retain(|&id| id < max_valid); + } + /// Check if a slot is occupied #[inline(always)] pub fn is_valid(&self, id: u32) -> bool { @@ -392,6 +275,21 @@ impl BoxPool { } } + /// Get number of free slots in the free list + #[inline] + pub fn free_slots_count(&self) -> usize { + self.free_list.len() + } + + /// Trim trailing None values from the pool to reduce iteration overhead + pub fn trim_tail(&mut self) { + while self.data.last().map_or(false, |v| v.is_none()) { + self.data.pop(); + } + let max_valid = self.data.len() as u32; + self.free_list.retain(|&id| id < max_valid); + } + /// Check if a slot is occupied #[inline(always)] pub fn is_valid(&self, id: u32) -> bool { @@ -1037,8 +935,8 @@ impl ObjectPool { #[inline] pub fn fix_string(&mut self, id: StringId) { if let Some(gs) = self.strings.get_mut(id.0) { - gs.header.fixed = true; - gs.header.marked = true; // Always considered marked + gs.header.set_fixed(); + gs.header.make_black(); // Always considered marked } } @@ -1046,8 +944,8 @@ impl ObjectPool { #[inline] pub fn fix_table(&mut self, id: TableId) { if let Some(gt) = self.tables.get_mut(id.0) { - gt.header.fixed = true; - gt.header.marked = true; + gt.header.set_fixed(); + gt.header.make_black(); } } @@ 
-1228,56 +1126,57 @@ impl ObjectPool { // ==================== GC Support ==================== - /// Clear all mark bits before GC mark phase + /// Clear all mark bits before GC mark phase (make all objects white) pub fn clear_marks(&mut self) { for (_, gs) in self.strings.iter_mut() { - gs.header.marked = false; + gs.header.make_white(0); } for (_, gt) in self.tables.iter_mut() { - gt.header.marked = false; + gt.header.make_white(0); } for (_, gf) in self.functions.iter_mut() { - gf.header.marked = false; + gf.header.make_white(0); } for (_, gu) in self.upvalues.iter_mut() { - gu.header.marked = false; + gu.header.make_white(0); } for (_, gth) in self.threads.iter_mut() { - gth.header.marked = false; + gth.header.make_white(0); } } - /// Sweep phase: free all unmarked objects + /// Sweep phase: free all unmarked (white) objects pub fn sweep(&mut self) { // Collect IDs to free (can't free while iterating) + // White objects are unmarked and should be collected let strings_to_free: Vec = self .strings .iter() - .filter(|(_, gs)| !gs.header.marked) + .filter(|(_, gs)| gs.header.is_white()) .map(|(id, _)| id) .collect(); let tables_to_free: Vec = self .tables .iter() - .filter(|(_, gt)| !gt.header.marked) + .filter(|(_, gt)| gt.header.is_white()) .map(|(id, _)| id) .collect(); let functions_to_free: Vec = self .functions .iter() - .filter(|(_, gf)| !gf.header.marked) + .filter(|(_, gf)| gf.header.is_white()) .map(|(id, _)| id) .collect(); let upvalues_to_free: Vec = self .upvalues .iter() - .filter(|(_, gu)| !gu.header.marked) + .filter(|(_, gu)| gu.header.is_white()) .map(|(id, _)| id) .collect(); let threads_to_free: Vec = self .threads .iter() - .filter(|(_, gth)| !gth.header.marked) + .filter(|(_, gth)| gth.header.is_white()) .map(|(id, _)| id) .collect(); diff --git a/crates/luars/src/lib_registry.rs b/crates/luars/src/lib_registry.rs index eb66ecbd..70567078 100644 --- a/crates/luars/src/lib_registry.rs +++ b/crates/luars/src/lib_registry.rs @@ -219,7 +219,7 @@ 
pub fn args_iter(vm: &LuaVM) -> impl Iterator + '_ { let frame = vm.current_frame(); let base_ptr = frame.base_ptr as usize; let top = frame.top as usize; - + (1..top).map(move |i| (i, vm.register_stack[base_ptr + i])) } diff --git a/crates/luars/src/lua_value/lua_table.rs b/crates/luars/src/lua_value/lua_table.rs index 84450ff7..d23b6ec9 100644 --- a/crates/luars/src/lua_value/lua_table.rs +++ b/crates/luars/src/lua_value/lua_table.rs @@ -28,6 +28,20 @@ impl Node { } } +/// Metamethod flags for fast lookup (like Lua 5.4's flags field) +/// A bit set to 1 means the metamethod is NOT present (absence cache) +/// Only the first 6 metamethods use this optimization (TM_INDEX..TM_EQ) +pub mod tm_flags { + pub const TM_INDEX: u8 = 1 << 0; // __index + pub const TM_NEWINDEX: u8 = 1 << 1; // __newindex + pub const TM_GC: u8 = 1 << 2; // __gc + pub const TM_MODE: u8 = 1 << 3; // __mode + pub const TM_LEN: u8 = 1 << 4; // __len + pub const TM_EQ: u8 = 1 << 5; // __eq + pub const TM_CALL: u8 = 1 << 6; // __call (bonus: very common) + pub const MASK_ALL: u8 = 0x7F; // All 7 bits +} + /// Lua table implementation /// - Array part for integer keys [1..n] /// - Hash part using open addressing with chaining (same as Lua 5.4) @@ -47,6 +61,11 @@ pub struct LuaTable { /// Metatable - optional table that defines special behaviors /// Store as LuaValue (table ID) instead of Rc for ID-based architecture metatable: Option, + + /// Metamethod absence flags (like Lua 5.4) + /// A bit set to 1 means the metamethod is NOT present (cached absence) + /// This allows O(1) check for common metamethods instead of hash lookup + pub tm_flags: u8, } impl LuaTable { @@ -60,6 +79,7 @@ impl LuaTable { nodes: Vec::new(), hash_size: 0, metatable: None, + tm_flags: 0, // All metamethods unknown initially }; } @@ -83,6 +103,7 @@ impl LuaTable { }, hash_size: 0, metatable: None, + tm_flags: 0, // All metamethods unknown initially } } @@ -235,8 +256,32 @@ impl LuaTable { } /// Set the metatable of this table 
+ /// Resets tm_flags since the new metatable may have different metamethods pub fn set_metatable(&mut self, mt: Option) { self.metatable = mt; + // Reset all flags - metamethods need to be re-checked + self.tm_flags = 0; + } + + /// Fast metamethod absence check (like Lua 5.4's fasttm macro) + /// Returns true if the metamethod is known to be absent (flag is set) + /// This is O(1) vs O(n) hash lookup + #[inline(always)] + pub fn tm_absent(&self, flag: u8) -> bool { + (self.tm_flags & flag) != 0 + } + + /// Mark a metamethod as absent (cache the lookup result) + /// Called after a failed lookup to speed up future checks + #[inline(always)] + pub fn set_tm_absent(&mut self, flag: u8) { + self.tm_flags |= flag; + } + + /// Clear a specific tm flag (called when metamethod is set) + #[inline(always)] + pub fn clear_tm_absent(&mut self, flag: u8) { + self.tm_flags &= !flag; } /// Fast integer key access - O(1) for array part diff --git a/crates/luars/src/lua_value/mod.rs b/crates/luars/src/lua_value/mod.rs index 36235eee..9c3c3e16 100644 --- a/crates/luars/src/lua_value/mod.rs +++ b/crates/luars/src/lua_value/mod.rs @@ -15,6 +15,7 @@ use std::rc::Rc; // Re-export the optimized LuaValue and type enum for pattern matching pub use lua_table::LuaTable; +pub use lua_table::tm_flags; // Export TmFlags for metamethod absence caching pub use lua_thread::*; pub use lua_value::{ ID_MASK, diff --git a/crates/luars/src/lua_vm/execute/arithmetic_instructions.rs b/crates/luars/src/lua_vm/execute/arithmetic_instructions.rs index d4b467b0..996998cf 100644 --- a/crates/luars/src/lua_vm/execute/arithmetic_instructions.rs +++ b/crates/luars/src/lua_vm/execute/arithmetic_instructions.rs @@ -1056,6 +1056,7 @@ pub fn exec_not(vm: &mut LuaVM, instr: u32, base_ptr: usize) { } /// LEN: R[A] = #R[B] +/// OPTIMIZED: Fast path for tables without __len metamethod #[inline(always)] pub fn exec_len(vm: &mut LuaVM, instr: u32, base_ptr: usize) -> LuaResult<()> { let a = Instruction::get_a(instr) as 
usize; @@ -1063,43 +1064,85 @@ pub fn exec_len(vm: &mut LuaVM, instr: u32, base_ptr: usize) -> LuaResult<()> { let value = vm.register_stack[base_ptr + b]; - // Check for __len metamethod first (for tables) - if value.is_table() { - // Use pre-cached __len StringId - let mm_key = LuaValue::string(vm.object_pool.tm_len); - if let Some(mt) = vm.table_get_metatable(&value) { - if let Some(metamethod) = vm.table_get_with_meta(&mt, &mm_key) { - if !metamethod.is_nil() { - let result = vm - .call_metamethod(&metamethod, &[value])? - .unwrap_or(LuaValue::nil()); - vm.register_stack[base_ptr + a] = result; - return Ok(()); - } - } + // Fast path: string length - no metamethod + if let Some(string_id) = value.as_string_id() { + if let Some(s) = vm.object_pool.get_string(string_id) { + vm.register_stack[base_ptr + a] = LuaValue::integer(s.as_str().len() as i64); + return Ok(()); } } - // Use ObjectPool for table/string length - let len = if let Some(table_id) = value.as_table_id() { - if let Some(table) = vm.object_pool.get_table(table_id) { - table.len() as i64 - } else { - 0 - } - } else if let Some(string_id) = value.as_string_id() { - if let Some(s) = vm.object_pool.get_string(string_id) { - s.as_str().len() as i64 + // Table length with metamethod support + if let Some(table_id) = value.as_table_id() { + use crate::lua_value::tm_flags; + + // Single table access to get both length and metatable info + let table = match vm.object_pool.get_table(table_id) { + Some(t) => t, + None => return Err(vm.error("invalid table")), + }; + + // FAST PATH: No metatable + let mt_val = match table.get_metatable() { + None => { + let len = table.len() as i64; + vm.register_stack[base_ptr + a] = LuaValue::integer(len); + return Ok(()); + } + Some(mt) => mt, + }; + + // Has metatable - check for __len + let mt_id = match mt_val.as_table_id() { + Some(id) => id, + None => { + let len = table.len() as i64; + vm.register_stack[base_ptr + a] = LuaValue::integer(len); + return Ok(()); + } + }; 
+ + // Get both fasttm flag and __len value in one lookup + let (len, len_mm) = { + let mt_table = match vm.object_pool.get_table(mt_id) { + Some(t) => t, + None => { + let len = table.len() as i64; + return Ok({ + vm.register_stack[base_ptr + a] = LuaValue::integer(len); + }); + } + }; + + // FAST PATH: fasttm check - __len is known to be absent + if mt_table.tm_absent(tm_flags::TM_LEN) { + (table.len() as i64, None) + } else { + let mm_key = LuaValue::string(vm.object_pool.tm_len); + match mt_table.raw_get(&mm_key) { + Some(mm) if !mm.is_nil() => (0, Some(mm)), + _ => (table.len() as i64, None), + } + } + }; + + if let Some(metamethod) = len_mm { + // Call __len metamethod + let result = vm + .call_metamethod(&metamethod, &[value])? + .unwrap_or(LuaValue::nil()); + vm.register_stack[base_ptr + a] = result; } else { - 0 + // Cache __len absence for future lookups + if let Some(mt_table) = vm.object_pool.get_table_mut(mt_id) { + mt_table.set_tm_absent(tm_flags::TM_LEN); + } + vm.register_stack[base_ptr + a] = LuaValue::integer(len); } - } else { - return Err(vm.error(format!("attempt to get length of {}", value.type_name()))); - }; + return Ok(()); + } - let result = LuaValue::integer(len); - vm.register_stack[base_ptr + a] = result; - Ok(()) + Err(vm.error(format!("attempt to get length of {}", value.type_name()))) } /// MmBin: Metamethod binary operation (register, register) diff --git a/crates/luars/src/lua_vm/execute/control_instructions.rs b/crates/luars/src/lua_vm/execute/control_instructions.rs index 809d7edd..0103b58a 100644 --- a/crates/luars/src/lua_vm/execute/control_instructions.rs +++ b/crates/luars/src/lua_vm/execute/control_instructions.rs @@ -77,11 +77,7 @@ pub fn exec_return( // Return all values - use memcpy if return_count > 0 { let src_start = base_ptr + a; - std::ptr::copy( - reg_ptr.add(src_start), - reg_ptr.add(dest_base), - return_count, - ); + std::ptr::copy(reg_ptr.add(src_start), reg_ptr.add(dest_base), return_count); } (*caller_ptr).top 
= (result_reg + return_count) as u32; } else if num_results == 0 { @@ -745,6 +741,33 @@ pub fn exec_gei(vm: &mut LuaVM, instr: u32, pc: &mut usize, base_ptr: usize) -> // ============ Call Instructions ============ +/// Cold path: Relocate parent vararg when it would be overwritten +#[cold] +#[inline(never)] +fn relocate_parent_vararg( + vm: &mut LuaVM, + frame_ptr_ptr: &mut *mut LuaCallFrame, + new_frame_end: usize, + parent_vararg_start: usize, + parent_vararg_count: usize, +) { + let new_vararg_start = new_frame_end; + let required_capacity = new_vararg_start + parent_vararg_count; + vm.ensure_stack_capacity(required_capacity); + if vm.register_stack.len() < required_capacity { + vm.register_stack.resize(required_capacity, LuaValue::nil()); + } + + // Copy vararg values to new location + for i in 0..parent_vararg_count { + vm.register_stack[new_vararg_start + i] = vm.register_stack[parent_vararg_start + i]; + } + + // Update parent frame's vararg position + let parent_frame = unsafe { &mut **frame_ptr_ptr }; + parent_frame.set_vararg(new_vararg_start, parent_vararg_count); +} + /// CALL A B C /// R[A], ... ,R[A+C-2] := R[A](R[A+1], ... 
,R[A+B-1]) /// ULTRA-OPTIMIZED: Inline fast path for Lua function calls @@ -790,31 +813,23 @@ pub fn exec_call( // New frame base = R[A+1] let new_base = base + a + 1; - // Check if this call would overwrite parent frame's vararg - // This is critical for for-in loops with vararg functions - let parent_frame = unsafe { &mut **frame_ptr_ptr }; - let parent_vararg_start = parent_frame.get_vararg_start(); + // FAST PATH: No vararg in parent frame (most common case) + // Only check parent vararg if function uses significant stack + let parent_frame = unsafe { &**frame_ptr_ptr }; let parent_vararg_count = parent_frame.get_vararg_count(); - + + // Cold path: parent has vararg that might be overwritten if parent_vararg_count > 0 { + let parent_vararg_start = parent_frame.get_vararg_start(); let new_frame_end = new_base + max_stack_size; if new_frame_end > parent_vararg_start { - // Vararg would be overwritten! Relocate it to after new frame's stack - let new_vararg_start = new_frame_end; - let required_capacity = new_vararg_start + parent_vararg_count; - vm.ensure_stack_capacity(required_capacity); - if vm.register_stack.len() < required_capacity { - vm.register_stack.resize(required_capacity, LuaValue::nil()); - } - - // Copy vararg values to new location - for i in 0..parent_vararg_count { - vm.register_stack[new_vararg_start + i] = vm.register_stack[parent_vararg_start + i]; - } - - // Update parent frame's vararg position - let parent_frame = unsafe { &mut **frame_ptr_ptr }; - parent_frame.set_vararg(new_vararg_start, parent_vararg_count); + relocate_parent_vararg( + vm, + frame_ptr_ptr, + new_frame_end, + parent_vararg_start, + parent_vararg_count, + ); } } @@ -1049,7 +1064,7 @@ fn exec_call_lua_function( let parent_frame = unsafe { &mut **frame_ptr_ptr }; let parent_vararg_start = parent_frame.get_vararg_start(); let parent_vararg_count = parent_frame.get_vararg_count(); - + if parent_vararg_count > 0 { let new_frame_end = new_base + max_stack_size; if new_frame_end 
> parent_vararg_start { @@ -1060,12 +1075,13 @@ fn exec_call_lua_function( if vm.register_stack.len() < required_capacity { vm.register_stack.resize(required_capacity, LuaValue::nil()); } - + // Copy vararg values to new location (copy forward, no overlap issue since new > old) for i in 0..parent_vararg_count { - vm.register_stack[new_vararg_start + i] = vm.register_stack[parent_vararg_start + i]; + vm.register_stack[new_vararg_start + i] = + vm.register_stack[parent_vararg_start + i]; } - + // Update parent frame's vararg position // Need to get mutable reference again after potential reallocation let parent_frame = unsafe { &mut **frame_ptr_ptr }; @@ -1498,20 +1514,23 @@ pub fn exec_return0( if vm.frame_count > 1 { // FAST PATH: Calculate caller frame pointer BEFORE pop let caller_ptr = unsafe { vm.frames.as_mut_ptr().add(vm.frame_count - 2) }; - + // Pop frame - just decrement counter (like Lua C: L->ci = ci->previous) vm.frame_count -= 1; - + // Check if caller is Lua function if unsafe { (*caller_ptr).is_lua() } { // Get info we need let (result_reg, num_results) = unsafe { - ((**frame_ptr_ptr).get_result_reg(), (**frame_ptr_ptr).get_num_results()) + ( + (**frame_ptr_ptr).get_result_reg(), + (**frame_ptr_ptr).get_num_results(), + ) }; - + // Update frame_ptr to caller *frame_ptr_ptr = caller_ptr; - + // Only fill nil if caller expects results // Like Lua C: for (nres = ci->nresults; l_unlikely(nres > 0); nres--) if num_results > 0 && num_results != usize::MAX { @@ -1525,7 +1544,7 @@ pub fn exec_return0( } } } - + return Ok(()); } else { // C function caller @@ -1534,7 +1553,7 @@ pub fn exec_return0( return Err(LuaError::Exit); } } - + // No caller - exit VM vm.frame_count -= 1; vm.return_values.clear(); @@ -1562,23 +1581,24 @@ pub fn exec_return1( if vm.frame_count > 1 { // FAST PATH: Calculate caller frame pointer BEFORE pop let caller_ptr = unsafe { vm.frames.as_mut_ptr().add(vm.frame_count - 2) }; - + // Pop frame - just decrement counter vm.frame_count -= 
1; - + // Check if caller is Lua function if unsafe { (*caller_ptr).is_lua() } { let result_reg = unsafe { (**frame_ptr_ptr).get_result_reg() }; - + // Update frame_ptr to caller *frame_ptr_ptr = caller_ptr; - + // Write return value directly to caller's register let caller_base = unsafe { (*caller_ptr).base_ptr } as usize; unsafe { - *vm.register_stack.get_unchecked_mut(caller_base + result_reg) = return_value; + *vm.register_stack + .get_unchecked_mut(caller_base + result_reg) = return_value; } - + return Ok(()); } else { // C function caller @@ -1588,7 +1608,7 @@ pub fn exec_return1( return Err(LuaError::Exit); } } - + // No caller - exit VM vm.frame_count -= 1; vm.return_values.clear(); diff --git a/crates/luars/src/lua_vm/execute/load_instructions.rs b/crates/luars/src/lua_vm/execute/load_instructions.rs index c0be0c2f..4c28fefd 100644 --- a/crates/luars/src/lua_vm/execute/load_instructions.rs +++ b/crates/luars/src/lua_vm/execute/load_instructions.rs @@ -10,6 +10,7 @@ use crate::lua_vm::{Instruction, LuaCallFrame, LuaVM}; /// /// This instruction moves vararg arguments to a safe location after max_stack_size, /// so they won't be overwritten by local variable operations. 
+#[inline(always)] pub fn exec_varargprep( vm: &mut LuaVM, instr: u32, @@ -46,9 +47,24 @@ pub fn exec_varargprep( vm.ensure_stack_capacity(required_size); // Move varargs from frame_base + a to frame_base + max_stack_size - // Copy in reverse order in case source and destination overlap - for i in (0..vararg_count).rev() { - vm.register_stack[vararg_dest + i] = vm.register_stack[frame_base + a + i]; + // OPTIMIZED: Use ptr::copy_nonoverlapping when safe, otherwise copy in reverse + unsafe { + let reg_ptr = vm.register_stack.as_mut_ptr(); + let src = frame_base + a; + + if vararg_dest >= src + vararg_count { + // No overlap - use fast copy + std::ptr::copy_nonoverlapping( + reg_ptr.add(src), + reg_ptr.add(vararg_dest), + vararg_count, + ); + } else { + // Overlapping - copy in reverse + for i in (0..vararg_count).rev() { + *reg_ptr.add(vararg_dest + i) = *reg_ptr.add(src + i); + } + } } // Set vararg info in frame @@ -59,11 +75,17 @@ pub fn exec_varargprep( .set_vararg(frame_base + max_stack_size, 0); } - // Initialize local variables (registers from 0 to max_stack_size) with nil - // But preserve fixed parameters (0..a) - for i in a..max_stack_size { - if frame_base + i < vm.register_stack.len() { - vm.register_stack[frame_base + i] = LuaValue::nil(); + // Initialize local variables (registers from a to max_stack_size) with nil + // OPTIMIZED: Use bulk fill + let nil_start = frame_base + a; + let nil_end = (frame_base + max_stack_size).min(vm.register_stack.len()); + if nil_start < nil_end { + let nil_val = LuaValue::nil(); + unsafe { + let reg_ptr = vm.register_stack.as_mut_ptr(); + for i in nil_start..nil_end { + *reg_ptr.add(i) = nil_val; + } } } diff --git a/crates/luars/src/lua_vm/execute/loop_instructions.rs b/crates/luars/src/lua_vm/execute/loop_instructions.rs index b1caf3f9..9bc2a3c7 100644 --- a/crates/luars/src/lua_vm/execute/loop_instructions.rs +++ b/crates/luars/src/lua_vm/execute/loop_instructions.rs @@ -246,7 +246,7 @@ pub fn exec_tforprep(vm: &mut 
LuaVM, instr: u32, pc: &mut usize, base_ptr: usize /// TFORCALL A C /// R[A+4], ... ,R[A+3+C] := R[A](R[A+1], R[A+2]); -/// +/// /// Lua 5.4 for-in loop layout: /// R[A] = iter_func /// R[A+1] = state @@ -267,49 +267,82 @@ pub fn exec_tforcall( let c = Instruction::get_c(instr) as usize; // Get iterator function and state - let func = vm.register_stack[base_ptr + a]; - let state = vm.register_stack[base_ptr + a + 1]; - let control = vm.register_stack[base_ptr + a + 2]; - - // Call func(state, control) - // This is similar to CALL instruction but with fixed arguments - match func.kind() { - LuaValueKind::CFunction => { - let Some(cfunc) = func.as_cfunction() else { - return Err(vm.error("Invalid CFunction".to_string())); - }; - - // Use a temporary position for the call setup (beyond result area) - let call_base = base_ptr + a + 4 + c + 1; - vm.ensure_stack_capacity(call_base + 3); - vm.register_stack[call_base] = func; - vm.register_stack[call_base + 1] = state; - vm.register_stack[call_base + 2] = control; - - // Create temporary frame for the call - let temp_frame = LuaCallFrame::new_c_function( - call_base, 3, // func + 2 args (top) - ); - - vm.push_frame(temp_frame); - let result = cfunc(vm)?; - vm.pop_frame_discard(); + let func = unsafe { *vm.register_stack.get_unchecked(base_ptr + a) }; + let state = unsafe { *vm.register_stack.get_unchecked(base_ptr + a + 1) }; + let control = unsafe { *vm.register_stack.get_unchecked(base_ptr + a + 2) }; + + // FAST PATH: Check if it's a CFunction (most common for ipairs/pairs) + use crate::lua_value::TAG_CFUNCTION; + if func.primary == TAG_CFUNCTION { + let cfunc = unsafe { func.as_cfunction().unwrap_unchecked() }; + + // Set up temporary call frame position (beyond result area) + let call_base = base_ptr + a + 4 + c + 1; + vm.ensure_stack_capacity(call_base + 3); + + unsafe { + *vm.register_stack.get_unchecked_mut(call_base) = func; + *vm.register_stack.get_unchecked_mut(call_base + 1) = state; + 
*vm.register_stack.get_unchecked_mut(call_base + 2) = control; + } - // Store results starting at R[A+4] - let values = result.all_values(); - for (i, value) in values.iter().enumerate().take(c) { - vm.register_stack[base_ptr + a + 4 + i] = *value; + // Create minimal temporary frame for the call + let temp_frame = LuaCallFrame::new_c_function(call_base, 3); + vm.push_frame(temp_frame); + let result = cfunc(vm)?; + vm.pop_frame_discard(); + + // OPTIMIZED: Direct inline access without Vec allocation + // ipairs/pairs typically return 2 values (index, value) or nil + let result_base = base_ptr + a + 4; + + if result.overflow.is_some() { + // Rare case: more than 2 values + let values = result.overflow.unwrap(); + let count = values.len().min(c); + for i in 0..count { + unsafe { + *vm.register_stack.get_unchecked_mut(result_base + i) = values[i]; + } } - // Fill remaining with nil - for i in values.len()..c { - vm.register_stack[base_ptr + a + 4 + i] = LuaValue::nil(); + for i in count..c { + unsafe { + *vm.register_stack.get_unchecked_mut(result_base + i) = LuaValue::nil(); + } + } + } else { + // Common case: 0-2 inline values + let inline_count = result.inline_count as usize; + unsafe { + if c >= 1 { + *vm.register_stack.get_unchecked_mut(result_base) = if inline_count >= 1 { + result.inline[0] + } else { + LuaValue::nil() + }; + } + if c >= 2 { + *vm.register_stack.get_unchecked_mut(result_base + 1) = if inline_count >= 2 { + result.inline[1] + } else { + LuaValue::nil() + }; + } + // Fill any remaining slots with nil + for i in 2..c { + *vm.register_stack.get_unchecked_mut(result_base + i) = LuaValue::nil(); + } } - Ok(false) // No frame change for C functions } + return Ok(false); + } + + // Lua function path + match func.kind() { LuaValueKind::Function => { // For Lua functions, set up for a normal call // We need to place function and arguments, then results go to R[A+4] - + // Use new ID-based API to get function let Some(func_id) = func.as_function_id() 
else { return Err(vm.error("Not a Lua function".to_string())); @@ -326,7 +359,7 @@ pub fn exec_tforcall( // Arguments: state, control let call_base = base_ptr + a + 4; vm.ensure_stack_capacity(call_base + max_stack_size); - + // Place arguments (overwriting result slots temporarily is OK) vm.register_stack[call_base] = state; vm.register_stack[call_base + 1] = control; @@ -360,7 +393,7 @@ pub fn exec_tforcall( /// TFORLOOP A Bx /// if R[A+4] ~= nil then { R[A+2]=R[A+4]; pc -= Bx } -/// +/// /// Lua 5.4 for-in loop layout: /// R[A] = iter_func /// R[A+1] = state diff --git a/crates/luars/src/lua_vm/execute/mod.rs b/crates/luars/src/lua_vm/execute/mod.rs index 3b6df374..5d68a5d5 100644 --- a/crates/luars/src/lua_vm/execute/mod.rs +++ b/crates/luars/src/lua_vm/execute/mod.rs @@ -493,6 +493,9 @@ pub fn luavm_execute(vm: &mut LuaVM) -> LuaResult { } UpvalueState::Closed(val) => *val = value, }; + + // GC write barrier for upvalue + vm.gc_barrier_upvalue(upvalue_id, &value); } continue 'mainloop; } diff --git a/crates/luars/src/lua_vm/execute/table_instructions.rs b/crates/luars/src/lua_vm/execute/table_instructions.rs index 3891d8aa..b0e44e28 100644 --- a/crates/luars/src/lua_vm/execute/table_instructions.rs +++ b/crates/luars/src/lua_vm/execute/table_instructions.rs @@ -1,4 +1,4 @@ -use crate::lua_value::LuaValue; +use crate::lua_value::{LuaValue, tm_flags}; /// Table operations /// /// These instructions handle table creation, access, and manipulation. 
@@ -62,7 +62,12 @@ pub fn exec_newtable( *vm.register_stack.get_unchecked_mut(base_ptr + a) = table; } - // GC checkpoint disabled for testing + // GC checkpoint - inline fast path to avoid function call overhead + // Only call slow path when debt exceeds 1MB threshold + const GC_THRESHOLD: isize = 1024 * 1024; + if vm.gc_debt_local > GC_THRESHOLD { + vm.check_gc_slow_pub(); + } } /// GETTABLE A B C @@ -110,11 +115,25 @@ pub fn exec_gettable( } } - // Key not found - check if no metatable to skip metamethod handling - if lua_table.get_metatable().is_none() { + // Key not found - check metatable for __index + let metatable = lua_table.get_metatable(); + if metatable.is_none() { + // No metatable - just return nil unsafe { *vm.register_stack.get_unchecked_mut(*base_ptr + a) = LuaValue::nil() }; return Ok(()); } + + // FAST PATH: fasttm optimization - check if __index is known to be absent + if let Some(mt_val) = metatable + && let Some(mt_id) = mt_val.as_table_id() + { + let mt_table = unsafe { vm.object_pool.get_table_unchecked(mt_id) }; + if mt_table.tm_absent(tm_flags::TM_INDEX) { + // __index is known to be absent - skip slow path + unsafe { *vm.register_stack.get_unchecked_mut(*base_ptr + a) = LuaValue::nil() }; + return Ok(()); + } + } } // Slow path: Use metamethod handling @@ -175,7 +194,8 @@ pub fn exec_settable( lua_table.raw_set(key_value, set_value); } - // Note: GC barrier is handled lazily during collection + // GC write barrier: if table is old and value is young, mark table as touched + vm.gc_barrier_back_table(table_id, &set_value); return Ok(()); } } @@ -209,11 +229,25 @@ pub fn exec_geti(vm: &mut LuaVM, instr: u32, base_ptr: usize) -> LuaResult<()> { return Ok(()); } - // Key not found - check if no metatable to skip metamethod handling - if lua_table.get_metatable().is_none() { + // Key not found - check for metatable and fasttm + let metatable = lua_table.get_metatable(); + if metatable.is_none() { + // No metatable - return nil directly unsafe { 
*vm.register_stack.get_unchecked_mut(base_ptr + a) = LuaValue::nil() }; return Ok(()); } + + // FAST PATH: fasttm optimization - check if __index is known to be absent + if let Some(mt_val) = metatable + && let Some(mt_id) = mt_val.as_table_id() + { + let mt_table = unsafe { vm.object_pool.get_table_unchecked(mt_id) }; + if mt_table.tm_absent(tm_flags::TM_INDEX) { + // __index is known to be absent - skip slow path + unsafe { *vm.register_stack.get_unchecked_mut(base_ptr + a) = LuaValue::nil() }; + return Ok(()); + } + } } // Slow path: Use metamethod handling @@ -266,8 +300,8 @@ pub fn exec_seti( let lua_table = unsafe { vm.object_pool.get_table_mut_unchecked(table_id) }; lua_table.set_int(b, set_value); - // Note: GC barrier is handled lazily during collection - // This significantly improves write performance + // GC write barrier + vm.gc_barrier_back_table(table_id, &set_value); return Ok(()); } } @@ -310,11 +344,25 @@ pub fn exec_getfield( } } - // Check if no metatable - can return nil directly - if table_ref.get_metatable().is_none() { + // Check for metatable and fasttm optimization + let metatable = table_ref.get_metatable(); + if metatable.is_none() { + // No metatable - return nil directly unsafe { *vm.register_stack.get_unchecked_mut(*base_ptr + a) = LuaValue::nil() }; return Ok(()); } + + // FAST PATH: fasttm optimization - check if __index is known to be absent + if let Some(mt_val) = metatable + && let Some(mt_id) = mt_val.as_table_id() + { + let mt_table = unsafe { vm.object_pool.get_table_unchecked(mt_id) }; + if mt_table.tm_absent(tm_flags::TM_INDEX) { + // __index is known to be absent - skip slow path + unsafe { *vm.register_stack.get_unchecked_mut(*base_ptr + a) = LuaValue::nil() }; + return Ok(()); + } + } } // Slow path: Use metamethod handling @@ -366,6 +414,8 @@ pub fn exec_setfield( let table_ref = unsafe { vm.object_pool.get_table_mut_unchecked(table_id) }; // Ultra-fast path: direct set without any metamethod checks 
table_ref.raw_set(key_value, set_value); + // GC write barrier + vm.gc_barrier_back_table(table_id, &set_value); return Ok(()); } } @@ -482,7 +532,8 @@ pub fn exec_settabup( // Ultra-fast path: direct set without any metamethod checks table_ref.raw_set(key_value.clone(), set_value.clone()); - // Note: GC barrier is handled lazily during collection + // GC write barrier + vm.gc_barrier_back_table(table_id, &set_value); return Ok(()); } } diff --git a/crates/luars/src/lua_vm/execute/upvalue_instructions.rs b/crates/luars/src/lua_vm/execute/upvalue_instructions.rs index a241aa0c..491f92f0 100644 --- a/crates/luars/src/lua_vm/execute/upvalue_instructions.rs +++ b/crates/luars/src/lua_vm/execute/upvalue_instructions.rs @@ -85,7 +85,7 @@ pub fn exec_closure( // Get prototype and parent upvalues without cloning upvalues early let func_id = unsafe { (*frame_ptr).get_function_id_unchecked() }; let func_ref = unsafe { vm.object_pool.get_function_unchecked(func_id) }; - + let proto = func_ref.chunk.child_protos.get(bx).cloned(); let proto = match proto { Some(p) => p, @@ -127,7 +127,7 @@ pub fn exec_closure( upvalue_ids.push(existing_uv_id); } else { // Create new open upvalue and add to open list directly - let new_uv_id = vm.object_pool.create_upvalue_open(stack_index); + let new_uv_id = vm.create_upvalue_open(stack_index); upvalue_ids.push(new_uv_id); vm.open_upvalues.push(new_uv_id); } @@ -178,6 +178,9 @@ pub fn exec_vararg( ) }; + let dest_base = base_ptr + a; + let reg_ptr = vm.register_stack.as_mut_ptr(); + if c == 0 { // Variable number of results - copy all varargs // Update frame top to accommodate all varargs @@ -186,24 +189,54 @@ pub fn exec_vararg( (*frame_ptr).top = (new_top.max(top)) as u32; } - for i in 0..vararg_count { - let value = if vararg_start + i < vm.register_stack.len() { - vm.register_stack[vararg_start + i] - } else { - LuaValue::nil() - }; - vm.register_stack[base_ptr + a + i] = value; + // OPTIMIZED: Use ptr::copy for bulk transfer when possible 
+ if vararg_count > 0 && vararg_start + vararg_count <= vm.register_stack.len() { + unsafe { + std::ptr::copy( + reg_ptr.add(vararg_start), + reg_ptr.add(dest_base), + vararg_count, + ); + } + } else { + // Fallback: copy with bounds checking + let nil_val = LuaValue::nil(); + for i in 0..vararg_count { + let value = if vararg_start + i < vm.register_stack.len() { + unsafe { *reg_ptr.add(vararg_start + i) } + } else { + nil_val + }; + unsafe { + *reg_ptr.add(dest_base + i) = value; + } + } } } else { // Fixed number of results (c-1 values) let count = c - 1; - for i in 0..count { - let value = if i < vararg_count && vararg_start + i < vm.register_stack.len() { - vm.register_stack[vararg_start + i] - } else { - LuaValue::nil() - }; - vm.register_stack[base_ptr + a + i] = value; + let copy_count = count.min(vararg_count); + let nil_count = count.saturating_sub(vararg_count); + + // OPTIMIZED: Bulk copy available varargs + if copy_count > 0 && vararg_start + copy_count <= vm.register_stack.len() { + unsafe { + std::ptr::copy( + reg_ptr.add(vararg_start), + reg_ptr.add(dest_base), + copy_count, + ); + } + } + + // Fill remaining with nil + if nil_count > 0 { + let nil_val = LuaValue::nil(); + for i in copy_count..count { + unsafe { + *reg_ptr.add(dest_base + i) = nil_val; + } + } } } @@ -267,8 +300,8 @@ pub fn exec_concat(vm: &mut LuaVM, instr: u32, base_ptr: usize) -> LuaResult<()> let result_value = vm.create_string_owned(result); vm.register_stack[base_ptr + a] = result_value; - // No GC check for fast path - rely on debt mechanism - // Only large allocations trigger automatic GC + // GC checkpoint - Lua checks GC after CONCAT + vm.check_gc(); return Ok(()); } @@ -354,7 +387,8 @@ pub fn exec_concat(vm: &mut LuaVM, instr: u32, base_ptr: usize) -> LuaResult<()> vm.register_stack[base_ptr + a] = result_value; - // No GC check - rely on debt mechanism + // GC checkpoint - Lua checks GC after CONCAT + vm.check_gc(); Ok(()) } diff --git a/crates/luars/src/lua_vm/mod.rs 
b/crates/luars/src/lua_vm/mod.rs index 2e8c87fd..0ef9981e 100644 --- a/crates/luars/src/lua_vm/mod.rs +++ b/crates/luars/src/lua_vm/mod.rs @@ -5,7 +5,7 @@ mod lua_call_frame; mod lua_error; mod opcode; -use crate::gc::{GC, GcFunction, ThreadId, UpvalueId}; +use crate::gc::{GC, GcFunction, TableId, ThreadId, UpvalueId}; #[cfg(feature = "async")] use crate::lua_async::AsyncExecutor; use crate::lua_value::{ @@ -37,6 +37,9 @@ pub struct LuaVM { // This is updated on every allocation and checked frequently pub(crate) gc_debt_local: isize, + // GC roots buffer - pre-allocated to avoid allocation during GC + gc_roots_buffer: Vec, + // Call stack - Pre-allocated Vec with fixed capacity // Using Vec directly (no Box indirection) for cache efficiency // Vec is pre-allocated to MAX_CALL_DEPTH and never reallocated @@ -117,6 +120,7 @@ impl LuaVM { global_value: LuaValue::nil(), registry: LuaValue::nil(), // Will be initialized below gc_debt_local: -(200 * 1024), // Start with negative debt (can allocate 200KB before GC) + gc_roots_buffer: Vec::with_capacity(512), // Pre-allocate roots buffer frames, frame_count: 0, register_stack: Vec::with_capacity(256), // Pre-allocate for initial stack @@ -246,7 +250,7 @@ impl LuaVM { // Create upvalue for _ENV (global table) // Main chunks in Lua 5.4 always have _ENV as upvalue[0] - let env_upvalue_id = self.object_pool.create_upvalue_closed(self.global_value); + let env_upvalue_id = self.create_upvalue_closed(self.global_value); let upvalues = vec![env_upvalue_id]; // Create main function in object pool with _ENV upvalue @@ -885,8 +889,14 @@ impl LuaVM { // Use pre-cached __index StringId - avoids hash computation and intern lookup let index_key = LuaValue::string(self.object_pool.tm_index); + // Single table lookup: check tm_flags AND get __index value together let index_value = { let metatable = self.object_pool.get_table(meta_id)?; + // FAST PATH: Check tm_flags first (like Lua 5.4's fasttm) + // If flag is set, __index is known to be 
absent - skip lookup + if metatable.tm_absent(crate::lua_value::tm_flags::TM_INDEX) { + return None; + } metatable.raw_get(&index_key) }; @@ -913,6 +923,11 @@ impl LuaVM { } _ => {} } + } else { + // __index not found - cache this fact for future lookups + if let Some(metatable) = self.object_pool.get_table_mut(meta_id) { + metatable.set_tm_absent(crate::lua_value::tm_flags::TM_INDEX); + } } } @@ -1130,6 +1145,7 @@ impl LuaVM { /// Fast path for calling CFunction metamethods with 2 arguments /// Used by __index, __newindex, etc. Avoids Vec allocation. /// Returns the first return value. + /// OPTIMIZED: Skip expensive get_function lookup by using a fixed offset from current base #[inline(always)] pub fn call_cfunc_metamethod_2( &mut self, @@ -1137,19 +1153,13 @@ impl LuaVM { arg1: LuaValue, arg2: LuaValue, ) -> LuaResult> { - // Calculate new base position - use current frame's top area + // Fast path: use a fixed offset from current base (256 slots is enough for most cases) + // This avoids the expensive object_pool.get_function lookup let new_base = if self.frame_count > 0 { let current_frame = &self.frames[self.frame_count - 1]; - let caller_base = current_frame.base_ptr as usize; - let caller_max_stack = if let Some(func_id) = current_frame.get_function_id() { - self.object_pool - .get_function(func_id) - .map(|f| f.chunk.max_stack_size) - .unwrap_or(256) - } else { - 256 - }; - caller_base + caller_max_stack + // Use top as the base for nested calls, since all args are already there + // Adding 256 ensures we don't overwrite the caller's stack + (current_frame.base_ptr as usize) + 256 } else { 0 }; @@ -1158,9 +1168,12 @@ impl LuaVM { self.ensure_stack_capacity(new_base + stack_size); // Set up arguments directly (no Vec allocation) - self.register_stack[new_base] = LuaValue::cfunction(cfunc); - self.register_stack[new_base + 1] = arg1; - self.register_stack[new_base + 2] = arg2; + unsafe { + let base = self.register_stack.as_mut_ptr().add(new_base); + *base = 
LuaValue::cfunction(cfunc); + *base.add(1) = arg1; + *base.add(2) = arg2; + } // Create C function frame let temp_frame = LuaCallFrame::new_c_function(new_base, stack_size); @@ -1184,6 +1197,7 @@ impl LuaVM { /// Fast path for calling CFunction metamethods with 1 argument /// Used by __len, __unm, __bnot, etc. Avoids Vec allocation. + /// OPTIMIZED: Skip expensive get_function lookup #[inline(always)] pub fn call_cfunc_metamethod_1( &mut self, @@ -1192,16 +1206,7 @@ impl LuaVM { ) -> LuaResult> { let new_base = if self.frame_count > 0 { let current_frame = &self.frames[self.frame_count - 1]; - let caller_base = current_frame.base_ptr as usize; - let caller_max_stack = if let Some(func_id) = current_frame.get_function_id() { - self.object_pool - .get_function(func_id) - .map(|f| f.chunk.max_stack_size) - .unwrap_or(256) - } else { - 256 - }; - caller_base + caller_max_stack + (current_frame.base_ptr as usize) + 256 } else { 0 }; @@ -1209,8 +1214,11 @@ impl LuaVM { let stack_size = 2; // func + 1 arg self.ensure_stack_capacity(new_base + stack_size); - self.register_stack[new_base] = LuaValue::cfunction(cfunc); - self.register_stack[new_base + 1] = arg1; + unsafe { + let base = self.register_stack.as_mut_ptr().add(new_base); + *base = LuaValue::cfunction(cfunc); + *base.add(1) = arg1; + } let temp_frame = LuaCallFrame::new_c_function(new_base, stack_size); self.push_frame(temp_frame); @@ -1232,6 +1240,7 @@ impl LuaVM { /// Fast path for calling CFunction metamethods with 3 arguments /// Used by __newindex. Avoids Vec allocation. 
+ /// OPTIMIZED: Skip expensive get_function lookup #[inline(always)] pub fn call_cfunc_metamethod_3( &mut self, @@ -1242,16 +1251,7 @@ impl LuaVM { ) -> LuaResult> { let new_base = if self.frame_count > 0 { let current_frame = &self.frames[self.frame_count - 1]; - let caller_base = current_frame.base_ptr as usize; - let caller_max_stack = if let Some(func_id) = current_frame.get_function_id() { - self.object_pool - .get_function(func_id) - .map(|f| f.chunk.max_stack_size) - .unwrap_or(256) - } else { - 256 - }; - caller_base + caller_max_stack + (current_frame.base_ptr as usize) + 256 } else { 0 }; @@ -1259,10 +1259,13 @@ impl LuaVM { let stack_size = 4; // func + 3 args self.ensure_stack_capacity(new_base + stack_size); - self.register_stack[new_base] = LuaValue::cfunction(cfunc); - self.register_stack[new_base + 1] = arg1; - self.register_stack[new_base + 2] = arg2; - self.register_stack[new_base + 3] = arg3; + unsafe { + let base = self.register_stack.as_mut_ptr().add(new_base); + *base = LuaValue::cfunction(cfunc); + *base.add(1) = arg1; + *base.add(2) = arg2; + *base.add(3) = arg3; + } let temp_frame = LuaCallFrame::new_c_function(new_base, stack_size); self.push_frame(temp_frame); @@ -1506,15 +1509,7 @@ impl LuaVM { /// - Long string: 1 Box allocation, GC registration, no pooling pub fn create_string(&mut self, s: &str) -> LuaValue { let id = self.object_pool.create_string(s); - - // Estimate memory cost: string data + LuaString struct overhead - // LuaString: ~32 bytes base + string length - let estimated_bytes = 32 + s.len(); - self.gc.record_allocation(estimated_bytes); - - // GC check MUST NOT happen here - object not yet protected! 
- // Caller must call check_gc() AFTER storing value in register - + self.gc_debt_local += (32 + s.len()) as isize; LuaValue::string(id) } @@ -1523,10 +1518,7 @@ impl LuaVM { pub fn create_string_owned(&mut self, s: String) -> LuaValue { let len = s.len(); let id = self.object_pool.create_string_owned(s); - - let estimated_bytes = 32 + len; - self.gc.record_allocation(estimated_bytes); - + self.gc_debt_local += (32 + len) as isize; LuaValue::string(id) } @@ -1539,16 +1531,74 @@ impl LuaVM { } } + // ============ GC Write Barriers ============ + // These are called when modifying old objects to point to young objects + // Critical for correct generational GC behavior + + /// Write barrier for table modification + /// Called when: table[key] = value (fast path) + /// If table is old and value is young/collectable, mark table as touched + #[inline(always)] + pub fn gc_barrier_back_table(&mut self, table_id: TableId, value: &LuaValue) { + // Only process in generational mode and if value is collectable + if self.gc.gc_kind() != crate::gc::GcKind::Generational { + return; + } + + // Check if value is a collectable GC object + let value_gc_id = match value.kind() { + LuaValueKind::Table => value.as_table_id().map(crate::gc::GcId::TableId), + LuaValueKind::Function => value.as_function_id().map(crate::gc::GcId::FunctionId), + LuaValueKind::Thread => value.as_thread_id().map(crate::gc::GcId::ThreadId), + _ => None, + }; + + if value_gc_id.is_some() { + // Call back barrier on the table + let table_gc_id = crate::gc::GcId::TableId(table_id); + self.gc.barrier_back_gen(table_gc_id, &mut self.object_pool); + } + } + + /// Write barrier for upvalue modification + /// Called when: upvalue = value (SETUPVAL) + /// If upvalue is old/closed and value is young, mark upvalue as touched + #[inline(always)] + pub fn gc_barrier_upvalue(&mut self, upvalue_id: UpvalueId, value: &LuaValue) { + // Only process in generational mode + if self.gc.gc_kind() != crate::gc::GcKind::Generational { 
+ return; + } + + // Check if value is a collectable GC object + let is_collectable = matches!( + value.kind(), + LuaValueKind::Table | LuaValueKind::Function | LuaValueKind::Thread | LuaValueKind::String + ); + + if is_collectable { + // Forward barrier: mark the value if upvalue is old + let uv_gc_id = crate::gc::GcId::UpvalueId(upvalue_id); + + // Get value's GcId for forward barrier + if let Some(value_gc_id) = match value.kind() { + LuaValueKind::Table => value.as_table_id().map(crate::gc::GcId::TableId), + LuaValueKind::Function => value.as_function_id().map(crate::gc::GcId::FunctionId), + LuaValueKind::Thread => value.as_thread_id().map(crate::gc::GcId::ThreadId), + LuaValueKind::String => value.as_string_id().map(crate::gc::GcId::StringId), + _ => None, + } { + self.gc.barrier_forward_gen(uv_gc_id, value_gc_id, &mut self.object_pool); + } + } + } + /// Create a new table in object pool - /// OPTIMIZATION: Only update local debt counter, no function calls + /// GC tracks objects via ObjectPool iteration, no allgc list needed #[inline(always)] pub fn create_table(&mut self, array_size: usize, hash_size: usize) -> LuaValue { let id = self.object_pool.create_table(array_size, hash_size); - - // Lightweight GC tracking: just increment debt - // This is a single integer add, should be very fast self.gc_debt_local += 256; - LuaValue::table(id) } @@ -1646,15 +1696,28 @@ impl LuaVM { } /// Create a function in object pool + /// Tracks the object in GC's allgc list for efficient sweep #[inline(always)] pub fn create_function(&mut self, chunk: Rc, upvalue_ids: Vec) -> LuaValue { let id = self.object_pool.create_function(chunk, upvalue_ids); + self.gc_debt_local += 128; + LuaValue::function(id) + } - // Register with GC - ultra-lightweight - self.gc - .register_object(id.0, crate::gc::GcObjectType::Function); + /// Create an open upvalue pointing to a stack index + #[inline(always)] + pub fn create_upvalue_open(&mut self, stack_index: usize) -> UpvalueId { + let id = 
self.object_pool.create_upvalue_open(stack_index); + self.gc_debt_local += 64; + id + } - LuaValue::function(id) + /// Create a closed upvalue with a value + #[inline(always)] + pub fn create_upvalue_closed(&mut self, value: LuaValue) -> UpvalueId { + let id = self.object_pool.create_upvalue_closed(value); + self.gc_debt_local += 64; + id } /// Get function by LuaValue (resolves ID from object pool) @@ -1750,15 +1813,17 @@ impl LuaVM { /// Check GC and run a step if needed (like luaC_checkGC in Lua 5.4) /// This is called after allocating new objects (strings, tables, functions) - /// Uses GC debt mechanism: runs when debt > 0 + /// Uses GC debt mechanism like Lua: runs when debt > threshold /// /// OPTIMIZATION: Fast path is inlined, slow path is separate function #[inline(always)] fn check_gc(&mut self) { - // Ultra-fast path: single integer comparison with local debt counter - // Only check if debt exceeds a significant threshold (1MB) - // This reduces the overhead of frequent checks dramatically - if self.gc_debt_local <= 1024 * 1024 { + // Fast path: check if gc_debt_local > threshold + // Use a larger threshold to reduce GC frequency + // 1MB threshold = about 16000 small object allocations before GC + // The incremental GC will catch up during collection anyway + const GC_THRESHOLD: isize = 1024 * 1024; + if self.gc_debt_local <= GC_THRESHOLD { return; } // Slow path: actual GC work @@ -1779,72 +1844,66 @@ impl LuaVM { // Sync local debt to GC self.gc.gc_debt = self.gc_debt_local; - // Incremental GC: only collect every N checks to reduce overhead - self.gc.increment_check_counter(); - if !self.gc.should_run_collection() { - return; - } - - // Collect roots: all reachable objects from VM state - let mut roots = Vec::new(); + // Collect roots using pre-allocated buffer (avoid allocation) + self.gc_roots_buffer.clear(); // 1. Global table - roots.push(self.global_value); + self.gc_roots_buffer.push(self.global_value); // 2. 
Registry table (persistent objects storage) - roots.push(self.registry); + self.gc_roots_buffer.push(self.registry); // 3. String metatable if let Some(mt) = &self.string_metatable { - roots.push(*mt); + self.gc_roots_buffer.push(*mt); } - // 3. ALL frame registers AND function values (not just current frame) + // 4. ALL frame registers AND function values (not just current frame) // This is critical - any register in any active frame must be kept alive // Also, the function being executed in each frame must be kept alive! for frame in &self.frames[..self.frame_count] { // Add the function value for this frame - this is CRITICAL! - roots.push(frame.as_function_value()); + self.gc_roots_buffer.push(frame.as_function_value()); let base_ptr = frame.base_ptr as usize; let top = frame.top as usize; for i in 0..top { if base_ptr + i < self.register_stack.len() { - roots.push(self.register_stack[base_ptr + i]); + self.gc_roots_buffer.push(self.register_stack[base_ptr + i]); } } } - // 4. All registers beyond the frames (temporary values) + // 5. All registers beyond the frames (temporary values) if self.frame_count > 0 { let last_frame = &self.frames[self.frame_count - 1]; let last_frame_end = last_frame.base_ptr as usize + last_frame.top as usize; for i in last_frame_end..self.register_stack.len() { - roots.push(self.register_stack[i]); + self.gc_roots_buffer.push(self.register_stack[i]); } } else { // No frames? Collect all registers for reg in &self.register_stack { - roots.push(*reg); + self.gc_roots_buffer.push(*reg); } } - // 5. Return values + // 6. Return values for value in &self.return_values { - roots.push(*value); + self.gc_roots_buffer.push(*value); } - // 6. Open upvalues - these point to stack locations that must stay alive + // 7. 
Open upvalues - these point to stack locations that must stay alive for upval_id in &self.open_upvalues { if let Some(uv) = self.object_pool.get_upvalue(*upval_id) { if let Some(val) = uv.get_closed_value() { - roots.push(val); + self.gc_roots_buffer.push(val); } } } // Perform GC step with complete root set - self.gc.step(&roots, &mut self.object_pool); + self.gc.step(&self.gc_roots_buffer, &mut self.object_pool); // Sync debt back from GC (it may have been reset to negative after collection) self.gc_debt_local = self.gc.gc_debt; @@ -2348,8 +2407,8 @@ impl LuaVM { self.pop_frame_discard(); } - // Return error - the actual message is stored in vm.error_message - let msg = self.error_message.clone(); + // Return error - take the message to avoid allocation + let msg = std::mem::take(&mut self.error_message); let error_str = self.create_string(&msg); Ok((false, vec![error_str])) @@ -2420,19 +2479,10 @@ impl LuaVM { LuaValueKind::CFunction => { let cfunc = func.as_cfunction().unwrap(); - // Calculate new base position + // OPTIMIZED: Use fixed offset instead of expensive get_function lookup let new_base = if self.frame_count > 0 { let current_frame = &self.frames[self.frame_count - 1]; - let caller_base = current_frame.base_ptr as usize; - let caller_max_stack = if let Some(func_id) = current_frame.get_function_id() { - self.object_pool - .get_function(func_id) - .map(|f| f.chunk.max_stack_size) - .unwrap_or(256) - } else { - 256 - }; - caller_base + caller_max_stack + (current_frame.base_ptr as usize) + 256 } else { 0 }; @@ -2483,20 +2533,10 @@ impl LuaVM { ) }; - // Calculate new base + // OPTIMIZED: Use fixed offset instead of expensive get_function lookup let new_base = if self.frame_count > 0 { let current_frame = &self.frames[self.frame_count - 1]; - let caller_base = current_frame.base_ptr as usize; - let caller_max_stack = - if let Some(caller_func_id) = current_frame.get_function_id() { - self.object_pool - .get_function(caller_func_id) - .map(|f| 
f.chunk.max_stack_size) - .unwrap_or(256) - } else { - 256 - }; - caller_base + caller_max_stack + (current_frame.base_ptr as usize) + 256 } else { 0 }; diff --git a/crates/luars/src/lua_vm/opcode/mod.rs b/crates/luars/src/lua_vm/opcode/mod.rs index 0176246e..8c31d455 100644 --- a/crates/luars/src/lua_vm/opcode/mod.rs +++ b/crates/luars/src/lua_vm/opcode/mod.rs @@ -492,12 +492,12 @@ mod tests { fn test_opcode_mode() { assert_eq!(OpCode::Move.get_mode(), OpMode::IABC); assert_eq!(OpCode::LoadK.get_mode(), OpMode::IABx); - assert_eq!(OpCode::Jmp.get_mode(), OpMode::IsJ); // JMP uses sJ format (signed jump) + assert_eq!(OpCode::Jmp.get_mode(), OpMode::IsJ); // JMP uses sJ format (signed jump) assert_eq!(OpCode::ExtraArg.get_mode(), OpMode::IAx); assert_eq!(OpCode::Add.get_mode(), OpMode::IABC); - assert_eq!(OpCode::TForCall.get_mode(), OpMode::IABC); // TFORCALL uses ABC format - assert_eq!(OpCode::TForLoop.get_mode(), OpMode::IABx); // TFORLOOP uses ABx format - assert_eq!(OpCode::LoadI.get_mode(), OpMode::IAsBx); // LOADI uses signed sBx + assert_eq!(OpCode::TForCall.get_mode(), OpMode::IABC); // TFORCALL uses ABC format + assert_eq!(OpCode::TForLoop.get_mode(), OpMode::IABx); // TFORLOOP uses ABx format + assert_eq!(OpCode::LoadI.get_mode(), OpMode::IAsBx); // LOADI uses signed sBx } #[test] diff --git a/crates/luars/src/stdlib/basic.rs b/crates/luars/src/stdlib/basic.rs index 1c71ac92..dde66614 100644 --- a/crates/luars/src/stdlib/basic.rs +++ b/crates/luars/src/stdlib/basic.rs @@ -194,17 +194,26 @@ fn lua_tostring(vm: &mut LuaVM) -> LuaResult { } /// select(index, ...) 
- Return subset of arguments +/// OPTIMIZED: Avoid Vec allocation for common case fn lua_select(vm: &mut LuaVM) -> LuaResult { - let index_arg = require_arg(vm, 1, "select")?; - let args = get_args(vm); + let frame = vm.current_frame(); + let base_ptr = frame.base_ptr as usize; + let top = frame.top as usize; - // Handle "#" special case + // Get index argument (at register 1) + let index_arg = if base_ptr + 1 < vm.register_stack.len() && 1 < top { + vm.register_stack[base_ptr + 1] + } else { + return Err(vm.error("bad argument #1 to 'select' (value expected)".to_string())); + }; + + // Handle "#" special case - return count of varargs if let Some(string_id) = index_arg.as_string_id() { if let Some(s) = vm.object_pool.get_string(string_id) { if s.as_str() == "#" { - return Ok(MultiValue::single(LuaValue::integer( - (args.len() - 1) as i64, - ))); + // Count of extra arguments (excluding index itself) + let count = top.saturating_sub(2); // top - 1 (index) - 1 (function) + return Ok(MultiValue::single(LuaValue::integer(count as i64))); } } } @@ -217,18 +226,28 @@ fn lua_select(vm: &mut LuaVM) -> LuaResult { return Err(vm.error("bad argument #1 to 'select' (index out of range)".to_string())); } + let arg_count = top.saturating_sub(1); // Exclude function register + let start = if index > 0 { - (index - 1) as usize + index as usize } else { - (args.len() as i64 + index) as usize + (arg_count as i64 + index) as usize }; - if start >= args.len() - 1 { + if start >= arg_count { return Ok(MultiValue::empty()); } - // Return args from start+1 onwards (skip the index argument itself) - let result: Vec = args.iter().skip(start + 1).cloned().collect(); + // Collect result directly from registers + let result_count = arg_count - start; + let mut result = Vec::with_capacity(result_count); + for i in 0..result_count { + let reg_idx = base_ptr + 1 + start + i; + if reg_idx < vm.register_stack.len() { + result.push(vm.register_stack[reg_idx]); + } + } + 
Ok(MultiValue::multiple(result)) } @@ -336,16 +355,29 @@ fn lua_next(vm: &mut LuaVM) -> LuaResult { } /// pcall(f [, arg1, ...]) - Protected call +/// OPTIMIZED: Avoid Vec allocations on success path fn lua_pcall(vm: &mut LuaVM) -> LuaResult { // pcall(f, arg1, arg2, ...) -> status, result or error - // Get the function to call (argument 1) - let func = require_arg(vm, 1, "pcall")?; + // Get frame info to read args directly + let frame = vm.current_frame(); + let base_ptr = frame.base_ptr as usize; + let top = frame.top as usize; - // Get all arguments after the function - let all_args = get_args(vm); - let args: Vec = if all_args.len() > 1 { - all_args[1..].to_vec() + // Arg 1 is the function (at base_ptr + 1) + let func = if top > 1 { + vm.register_stack[base_ptr + 1] + } else { + return Err(vm.error("pcall() requires argument 1".to_string())); + }; + + // Collect remaining args (2..top) into a small vec + // Most pcalls have 0-3 args, so this is fast + let arg_count = if top > 2 { top - 2 } else { 0 }; + let args: Vec = if arg_count > 0 { + (2..top) + .map(|i| vm.register_stack[base_ptr + i]) + .collect() } else { Vec::new() }; @@ -353,25 +385,44 @@ fn lua_pcall(vm: &mut LuaVM) -> LuaResult { // Use protected_call from VM let (success, results) = vm.protected_call(func, args)?; - // Return status and results - let mut return_values = vec![LuaValue::boolean(success)]; + // Return status and results - preallocate with capacity + let mut return_values = Vec::with_capacity(1 + results.len()); + return_values.push(LuaValue::boolean(success)); return_values.extend(results); Ok(MultiValue::multiple(return_values)) } /// xpcall(f, msgh [, arg1, ...]) - Protected call with error handler +/// OPTIMIZED: Avoid Vec allocations fn lua_xpcall(vm: &mut LuaVM) -> LuaResult { // xpcall(f, msgh, arg1, arg2, ...) 
-> status, result or error - // Get the function to call (argument 1) - let func = require_arg(vm, 1, "xpcall")?; - // Get the error handler (argument 2) - let err_handler = require_arg(vm, 2, "xpcall")?; - - // Get all arguments after the function and error handler - let all_args = get_args(vm); - let args: Vec = if all_args.len() > 2 { - all_args[3..].to_vec() + + // Get frame info to read args directly + let frame = vm.current_frame(); + let base_ptr = frame.base_ptr as usize; + let top = frame.top as usize; + + // Arg 1 is the function (at base_ptr + 1) + let func = if top > 1 { + vm.register_stack[base_ptr + 1] + } else { + return Err(vm.error("xpcall() requires argument 1".to_string())); + }; + + // Arg 2 is the error handler (at base_ptr + 2) + let err_handler = if top > 2 { + vm.register_stack[base_ptr + 2] + } else { + return Err(vm.error("xpcall() requires argument 2".to_string())); + }; + + // Collect remaining args (3..top) into a small vec + let arg_count = if top > 3 { top - 3 } else { 0 }; + let args: Vec = if arg_count > 0 { + (3..top) + .map(|i| vm.register_stack[base_ptr + i]) + .collect() } else { Vec::new() }; @@ -861,9 +912,9 @@ fn lua_load(vm: &mut LuaVM) -> LuaResult { // Create upvalue for _ENV (global table) // Loaded chunks need _ENV as upvalue[0] let env_upvalue_id = if let Some(env) = env { - vm.object_pool.create_upvalue_closed(env) + vm.create_upvalue_closed(env) } else { - vm.object_pool.create_upvalue_closed(vm.global_value) + vm.create_upvalue_closed(vm.global_value) }; let upvalues = vec![env_upvalue_id]; @@ -905,7 +956,7 @@ fn lua_loadfile(vm: &mut LuaVM) -> LuaResult { match vm.compile_with_name(&code, &chunkname) { Ok(chunk) => { // Create upvalue for _ENV (global table) - let env_upvalue_id = vm.object_pool.create_upvalue_closed(vm.global_value); + let env_upvalue_id = vm.create_upvalue_closed(vm.global_value); let upvalues = vec![env_upvalue_id]; let func = vm.create_function(std::rc::Rc::new(chunk), upvalues); 
Ok(MultiValue::single(func)) @@ -945,7 +996,7 @@ fn lua_dofile(vm: &mut LuaVM) -> LuaResult { match vm.compile_with_name(&code, &chunkname) { Ok(chunk) => { // Create upvalue for _ENV (global table) - let env_upvalue_id = vm.object_pool.create_upvalue_closed(vm.global_value); + let env_upvalue_id = vm.create_upvalue_closed(vm.global_value); let upvalues = vec![env_upvalue_id]; let func = vm.create_function(std::rc::Rc::new(chunk), upvalues); diff --git a/crates/luars/src/stdlib/math.rs b/crates/luars/src/stdlib/math.rs index 1105476d..4416e533 100644 --- a/crates/luars/src/stdlib/math.rs +++ b/crates/luars/src/stdlib/math.rs @@ -119,59 +119,63 @@ fn math_log(vm: &mut LuaVM) -> LuaResult { fn math_max(vm: &mut LuaVM) -> LuaResult { use crate::lib_registry::{arg_count, get_arg}; - + let argc = arg_count(vm); if argc == 0 { return Err(vm.error("bad argument to 'math.max' (value expected)".to_string())); } - + // Get first argument let first = get_arg(vm, 1).unwrap(); - let mut max_val = first.as_number() + let mut max_val = first + .as_number() .ok_or_else(|| vm.error("bad argument to 'math.max' (number expected)".to_string()))?; let mut max_arg = first; - + // Compare with rest for i in 2..=argc { if let Some(arg) = get_arg(vm, i) { - let val = arg.as_number() - .ok_or_else(|| vm.error("bad argument to 'math.max' (number expected)".to_string()))?; + let val = arg.as_number().ok_or_else(|| { + vm.error("bad argument to 'math.max' (number expected)".to_string()) + })?; if val > max_val { max_val = val; max_arg = arg; } } } - + Ok(MultiValue::single(max_arg)) } fn math_min(vm: &mut LuaVM) -> LuaResult { use crate::lib_registry::{arg_count, get_arg}; - + let argc = arg_count(vm); if argc == 0 { return Err(vm.error("bad argument to 'math.min' (value expected)".to_string())); } - + // Get first argument let first = get_arg(vm, 1).unwrap(); - let mut min_val = first.as_number() + let mut min_val = first + .as_number() .ok_or_else(|| vm.error("bad argument to 'math.min' 
(number expected)".to_string()))?; let mut min_arg = first; - + // Compare with rest for i in 2..=argc { if let Some(arg) = get_arg(vm, i) { - let val = arg.as_number() - .ok_or_else(|| vm.error("bad argument to 'math.min' (number expected)".to_string()))?; + let val = arg.as_number().ok_or_else(|| { + vm.error("bad argument to 'math.min' (number expected)".to_string()) + })?; if val < min_val { min_val = val; min_arg = arg; } } } - + Ok(MultiValue::single(min_arg)) } diff --git a/crates/luars/src/stdlib/package.rs b/crates/luars/src/stdlib/package.rs index 710905e5..56cd38b6 100644 --- a/crates/luars/src/stdlib/package.rs +++ b/crates/luars/src/stdlib/package.rs @@ -218,7 +218,7 @@ fn lua_file_loader(vm: &mut LuaVM) -> LuaResult { let chunk = vm.compile_with_name(&source, &chunkname)?; // Create a function from the chunk with _ENV upvalue - let env_upvalue_id = vm.object_pool.create_upvalue_closed(vm.global_value); + let env_upvalue_id = vm.create_upvalue_closed(vm.global_value); let func = vm.create_function(Rc::new(chunk), vec![env_upvalue_id]); // Call the function to execute the module diff --git a/need_optimize b/need_optimize deleted file mode 100644 index 035e461d..00000000 --- a/need_optimize +++ /dev/null @@ -1,24 +0,0 @@ -🔴 Lua-RS 落后的领域 (需要优化) -测试项 Lua-RS Native Lua 比例 差距 优先级 -Simple function call 54.01 M 86.53 M 62% -38% 🔥高 -fib(25) 6ms 3ms 50% -50% 🔥高 -Upvalue read/write 40.70 M 93.91 M 43% -57% 🔥高 -Multiple upvalues 32.76 M 49.22 M 67% -33% 🔥高 -Nested closures 34.93 M 63.31 M 55% -45% 🔥高 -Single return 62.57 M 139.78 M 45% -55% 🔥高 -Triple return 44.29 M 81.58 M 54% -46% 🔥高 -__index (function) 18.55 M 60.11 M 31% -69% 🔥高 -__newindex 21.79 M 48.22 M 45% -55% 🔥高 -__call metamethod 5.87 M 60.28 M 10% -90% 🔥🔥极高 -pcall (success) 9.42 M 37.85 M 25% -75% 🔥高 -Direct call baseline 2.94 M 93.98 M 3% -97% 🔥🔥极高 -assert (success) 40.28 M 93.63 M 43% -57% 中 -Create/resume/yield 1.21 M 4.15 M 29% -71% 中 -coroutine.wrap 2.12 M 5.92 M 36% -64% 中 -Object creation 1.73 M 
11.91 M 15% -85% 中 -Method call (colon) 12.16 M 21.38 M 57% -43% 中 -Closure method call 28.83 M 97.87 M 29% -71% 中 -string.match (simple) 3.50 M 22.25 M 16% -84% 低 -string.match (captures) 1.57 M 7.80 M 20% -80% 低 -string.reverse 7.67 M 18.67 M 41% -59% 低 -table.concat 40.4 K 75.8 K 53% -47% 低 \ No newline at end of file