Add fast path for pure binary integer operations in JIT execution

3tty0n · 3tty0n · commit f52497b31f72 · 2026-01-07T11:30:11.000+01:00
diff --git a/rpython/jit/codewriter/genextension.py b/rpython/jit/codewriter/genextension.py
@@ -4,6 +4,7 @@
 from rpython.jit.metainterp.history import (Const, ConstInt, ConstPtr,
     ConstFloat, CONST_NULL, getkind, AbstractDescr)
 from rpython.jit.metainterp import support
+from rpython.jit.metainterp.resoperation import rop
 from rpython.flowspace.model import Constant
 from rpython.jit.codewriter.flatten import (
     Register, TLabel, Label, ListOfKind, IndirectCallTargets)
@@ -131,7 +132,7 @@ def generate(self):
         full_code = _apply_peephole_optimizations(full_code)
         self.jitcode._genext_source = full_code
         d = {"ConstInt": ConstInt, "ConstPtr": ConstPtr, "ConstFloat": ConstFloat, "JitCode": JitCode, "ChangeFrame": ChangeFrame,
-             "lltype": lltype, "rstr": rstr, 'llmemory': llmemory, 'OBJECTPTR': OBJECTPTR, 'support': support}
+             "lltype": lltype, "rstr": rstr, 'llmemory': llmemory, 'OBJECTPTR': OBJECTPTR, 'support': support, 'rop': rop}
         d.update(self.globals)
         source = py.code.Source(self.jitcode._genext_source)
         exec source.compile() in d
@@ -2818,6 +2819,52 @@ def _emit_unspecialized_binary(self):
         self._emit_jump(lines)
         return lines
 
+    # Maps op name -> source text naming its rop constant (pasted into generated code)
+    _PURE_BINARY_ROP = {
+        'int_add': 'rop.INT_ADD',
+        'int_sub': 'rop.INT_SUB',
+        'int_mul': 'rop.INT_MUL',
+        'int_and': 'rop.INT_AND',
+        'int_or': 'rop.INT_OR',
+        'int_xor': 'rop.INT_XOR',
+        'int_lshift': 'rop.INT_LSHIFT',
+        'int_rshift': 'rop.INT_RSHIFT',
+        'int_lt': 'rop.INT_LT',
+        'int_le': 'rop.INT_LE',
+        'int_gt': 'rop.INT_GT',
+        'int_ge': 'rop.INT_GE',
+        'int_eq': 'rop.INT_EQ',
+        'int_ne': 'rop.INT_NE',
+    }
+
+    def _emit_unspecialized_pure_binary_i(self):
+        """Emit the fast path for pure integer binary operations.
+
+        The generated else-branch calls MetaInterp.execute_and_record_pure_i()
+        and stores the result in registers_i; names missing from
+        _PURE_BINARY_ROP call self.<methodname> instead.  The if-branch emits
+        a jump targeting orig_pc with arg0/arg1 added to constant_registers.
+        """
+        lines = []
+        arg0, arg1, result = self._get_args_and_res()
+        self._emit_n_ary_if([arg0, arg1], lines)
+        self._emit_jump(lines, constant_registers=self.constant_registers.union({arg0, arg1}),
+                        indent='    ', target_pc=self.orig_pc)
+        lines.append("else:")
+        # rop_const is the source text of the rop constant, or None if self.name is unmapped
+        rop_const = self._PURE_BINARY_ROP.get(self.name)
+        if rop_const is not None:
+            lines.append("    self.registers_i[%d] = self.metainterp.execute_and_record_pure_i(%s, %s, %s)" % (
+                result.index, rop_const,
+                self._get_as_box(arg0), self._get_as_box(arg1)))
+        else:
+            # No fast-path entry: emit a call to self.<methodname> instead
+            lines.append("    self.registers_i[%d] = self.%s(%s, %s)" % (
+                result.index, self.methodname,
+                self._get_as_box(arg0), self._get_as_box(arg1)))
+        self._emit_jump(lines)
+        return lines
+
     def _emit_unspecialized_float_binary(self):
         lines = []
         arg0, arg1, result = self._get_args_and_res()
@@ -2831,20 +2878,22 @@ def _emit_unspecialized_float_binary(self):
         self._emit_jump(lines)
         return lines
 
-    emit_unspecialized_int_add = _emit_unspecialized_binary
-    emit_unspecialized_int_sub = _emit_unspecialized_binary
-    emit_unspecialized_int_mul = _emit_unspecialized_binary
-    emit_unspecialized_int_or = _emit_unspecialized_binary
-    emit_unspecialized_int_and = _emit_unspecialized_binary
-    emit_unspecialized_int_rshift = _emit_unspecialized_binary
-    emit_unspecialized_int_lshift = _emit_unspecialized_binary
-    emit_unspecialized_int_le = _emit_unspecialized_binary
-    emit_unspecialized_int_lt = _emit_unspecialized_binary
-    emit_unspecialized_int_ge = _emit_unspecialized_binary
-    emit_unspecialized_int_gt = _emit_unspecialized_binary
-    emit_unspecialized_int_eq = _emit_unspecialized_binary
-    emit_unspecialized_int_ne = _emit_unspecialized_binary
-    emit_unspecialized_int_xor = _emit_unspecialized_binary
+    # Pure integer binary ops use fast path (skip method chain overhead)
+    emit_unspecialized_int_add = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_sub = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_mul = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_or = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_and = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_rshift = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_lshift = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_le = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_lt = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_ge = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_gt = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_eq = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_ne = _emit_unspecialized_pure_binary_i
+    emit_unspecialized_int_xor = _emit_unspecialized_pure_binary_i
+    # int_mod and int_floordiv stay on _emit_unspecialized_binary (can raise ZeroDivisionError)
     emit_unspecialized_int_mod = _emit_unspecialized_binary
     emit_unspecialized_int_floordiv = _emit_unspecialized_binary
 
diff --git a/rpython/jit/metainterp/heapcache.py b/rpython/jit/metainterp/heapcache.py
@@ -233,13 +233,19 @@ def update_version(self, ref_frontend_op):
             ref_frontend_op._heapc_deps = None
 
     def invalidate_caches_varargs(self, opnum, descr, argboxes):
+        # Always-pure ops are assumed to neither escape objects nor invalidate caches: skip
+        if rop.is_always_pure(opnum):
+            return
         self.mark_escaped_varargs(opnum, descr, argboxes)
         if self.clear_caches_not_necessary(opnum, descr):
             return
         self.clear_caches_varargs(opnum, descr, argboxes)
 
     @specialize.arg(1)
     def invalidate_caches(self, opnum, descr, *argboxes):
+        # Always-pure ops are assumed to neither escape objects nor invalidate caches: skip
+        if rop.is_always_pure(opnum):
+            return
         self.mark_escaped(opnum, descr, *argboxes)
         if self.clear_caches_not_necessary(opnum, descr):
             return
diff --git a/rpython/jit/metainterp/jitprof.py b/rpython/jit/metainterp/jitprof.py
@@ -243,6 +243,7 @@ def _print_stats(self):
         self._print_intline("forcings", cnt[Counters.OPT_FORCINGS])
         self._print_intline("slow tracing function executions", cnt[Counters.SLOW_TRACING_FUNCTION_EXECUTIONS])
         self._print_intline("fast tracing function executions", cnt[Counters.FAST_TRACING_FUNCTION_EXECUTIONS])
+        self._print_intline("pure binary fastpath ops", cnt[Counters.PURE_BINARY_FASTPATH_OPS])
         self._print_intline("abort: trace too long",
                             cnt[Counters.ABORT_TOO_LONG])
         self._print_intline("abort: compiling", cnt[Counters.ABORT_BRIDGE])
diff --git a/rpython/jit/metainterp/pyjitpl.py b/rpython/jit/metainterp/pyjitpl.py
@@ -2811,6 +2811,33 @@ def _record_helper(self, opnum, resvalue, descr, *argboxes):
         if op.type != 'v':
             return op
 
+    @specialize.arg(1)
+    def execute_and_record_pure_i(self, opnum, box0, box1):
+        """Fast path for pure integer binary operations.
+
+        Executes `opnum` on (box0, box1) with no descr, records it through
+        history.record2() and returns the recorded op.  No heapcache call
+        and no constant-folding check is made here.
+        NOTE(review): nothing in this method verifies that `opnum` is an
+        always-pure, non-raising, int-returning op; callers (GenExtension,
+        for INT_ADD, INT_SUB, etc.) are assumed to guarantee that.
+        """
+        # Count the op and the fast-path hit, then execute it
+        profiler = self.staticdata.profiler
+        profiler.count_ops(opnum)
+        profiler.count(Counters.PURE_BINARY_FASTPATH_OPS)  # Track fast-path usage
+        resvalue = executor.execute(self.cpu, self, opnum, None, box0, box1)
+        # Record directly: this path makes no heapcache call and does not
+        # try to constant-fold (NOTE(review): assumes a non-constant argument)
+        profiler.count_ops(opnum, Counters.RECORDED_OPS)
+        # Bump the traced-operations counter of the last frame's jitcode, if any
+        if self.framestack:
+            self.framestack[-1].jitcode.traced_operations += 1
+        op = self.history.record2(opnum, box0, box1, resvalue, None)
+        self.attach_debug_info(op)
+        return op
+
     def execute_new_with_vtable(self, descr):
         resbox = self.execute_and_record(rop.NEW_WITH_VTABLE, descr)
         self.heapcache.new(resbox)
diff --git a/rpython/rlib/jit.py b/rpython/rlib/jit.py
@@ -1444,6 +1444,7 @@ class Counters(object):
     NVREUSED
     FAST_TRACING_FUNCTION_EXECUTIONS
     SLOW_TRACING_FUNCTION_EXECUTIONS
+    PURE_BINARY_FASTPATH_OPS
     TOTAL_COMPILED_LOOPS
     TOTAL_COMPILED_BRIDGES
     TOTAL_FREED_LOOPS