openvm-org · jonathanpwang · Aug 22, 2025 · Aug 19, 2025 · Aug 20, 2025 · Aug 20, 2025
diff --git a/.github/workflows/benchmark-call.yml b/.github/workflows/benchmark-call.yml
@@ -107,7 +107,7 @@ on:
 env:
   S3_METRICS_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/github/metrics
   S3_FLAMEGRAPHS_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/github/flamegraphs
-  FEATURE_FLAGS: "metrics,parallel,nightly-features"
+  FEATURE_FLAGS: "metrics,parallel,nightly-features,tco"
   INPUT_ARGS: ""
   CARGO_NET_GIT_FETCH_WITH_CLI: "true"
 

diff --git a/.github/workflows/benchmarks-execute.yml b/.github/workflows/benchmarks-execute.yml
@@ -27,6 +27,7 @@ env:
   CARGO_TERM_COLOR: always
   S3_FIXTURES_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/fixtures
   JEMALLOC_SYS_WITH_MALLOC_CONF: "retain:true,background_thread:true,metadata_thp:always,thp:always,dirty_decay_ms:10000,muzzy_decay_ms:10000,abort_conf:true"
+  TOOLCHAIN: "+nightly-2025-08-19"
 
 jobs:
   codspeed-walltime-benchmarks:
@@ -65,12 +66,12 @@ jobs:
 
       - name: Build benchmarks
         working-directory: benchmarks/execute
-        run: cargo codspeed build --profile maxperf
+        run: cargo $TOOLCHAIN codspeed build --profile maxperf
       - name: Run benchmarks
         uses: CodSpeedHQ/action@v3
         with:
           working-directory: benchmarks/execute
-          run: cargo codspeed run
+          run: cargo $TOOLCHAIN codspeed run
           token: ${{ secrets.CODSPEED_TOKEN }}
 
   codspeed-instrumentation-benchmarks:
@@ -110,10 +111,10 @@ jobs:
 
       - name: Build benchmarks
         working-directory: benchmarks/execute
-        run: cargo codspeed build
+        run: cargo $TOOLCHAIN codspeed build
       - name: Run benchmarks
         uses: CodSpeedHQ/action@v3
         with:
           working-directory: benchmarks/execute
-          run: cargo codspeed run
+          run: cargo $TOOLCHAIN codspeed run
           token: ${{ secrets.CODSPEED_TOKEN }}
diff --git a/Cargo.toml b/Cargo.toml
@@ -229,6 +229,7 @@ dashmap = "6.1.0"
 memmap2 = "0.9.5"
 libc = "0.2.175"
 tracing-subscriber = { version = "0.3.17", features = ["std", "env-filter"] }
+paste = "1.0.15"
 
 # default-features = false for no_std for use in guest programs
 itertools = { version = "0.14.0", default-features = false }

diff --git a/benchmarks/execute/Cargo.toml b/benchmarks/execute/Cargo.toml
@@ -45,6 +45,7 @@ divan = { package = "codspeed-divan-compat", version = "3.0.2" }
 
 [features]
 default = ["jemalloc"]
+tco = ["openvm-sdk/tco"]
 mimalloc = ["openvm-circuit/mimalloc"]
 jemalloc = ["openvm-circuit/jemalloc"]
 jemalloc-prof = ["openvm-circuit/jemalloc-prof"]

diff --git a/benchmarks/prove/Cargo.toml b/benchmarks/prove/Cargo.toml
@@ -33,8 +33,9 @@ metrics.workspace = true
 [dev-dependencies]
 
 [features]
-default = ["parallel", "jemalloc", "metrics", "evm"]
+default = ["parallel", "jemalloc", "metrics"]
 metrics = ["openvm-sdk/metrics"]
+tco = ["openvm-sdk/tco"]
 perf-metrics = ["openvm-sdk/perf-metrics", "metrics"]
 stark-debug = ["openvm-sdk/stark-debug"]
 # runs leaf aggregation benchmarks:

diff --git a/ci/scripts/bench.py b/ci/scripts/bench.py
@@ -15,9 +15,12 @@ def run_cargo_command(
     kzg_params_dir,
     profile="release"
 ):
+    toolchain = "+1.86"
+    if "tco" in feature_flags:
+        toolchain = "+nightly-2025-08-19"
     # Command to run (for best performance but slower builds, use --profile maxperf)
     command = [
-        "cargo", "run", "--no-default-features", "-p", "openvm-benchmarks-prove", "--bin", bin_name, "--profile", profile, "--features", ",".join(feature_flags), "--"
+        "cargo", toolchain, "run", "--no-default-features", "-p", "openvm-benchmarks-prove", "--bin", bin_name, "--profile", profile, "--features", ",".join(feature_flags), "--"
     ]
 
     if app_log_blowup is not None:

diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml
@@ -45,6 +45,7 @@ default = ["parallel", "jemalloc", "evm-verify", "metrics"]
 evm-prove = ["openvm-sdk/evm-prove"]
 evm-verify = ["evm-prove", "openvm-sdk/evm-verify"]
 metrics = ["openvm-sdk/metrics"]
+tco = ["openvm-sdk/tco"]
 # for guest profiling:
 perf-metrics = ["openvm-sdk/perf-metrics", "metrics"]
 # performance features:

diff --git a/crates/cli/src/lib.rs b/crates/cli/src/lib.rs
@@ -1,3 +1,6 @@
+#![cfg_attr(feature = "tco", allow(incomplete_features))]
+#![cfg_attr(feature = "tco", feature(explicit_tail_calls))]
+
 pub mod commands;
 pub mod default;
 pub mod input;

diff --git a/crates/sdk/Cargo.toml b/crates/sdk/Cargo.toml
@@ -79,6 +79,17 @@ metrics = [
     "openvm-native-recursion/metrics",
     "openvm-native-compiler/metrics",
 ]
+tco = [
+    "openvm-circuit/tco",
+    "openvm-rv32im-circuit/tco",
+    "openvm-native-circuit/tco",
+    "openvm-sha256-circuit/tco",
+    "openvm-keccak256-circuit/tco",
+    "openvm-bigint-circuit/tco",
+    "openvm-algebra-circuit/tco",
+    "openvm-ecc-circuit/tco",
+    "openvm-pairing-circuit/tco"
+]
 # for guest profiling:
 perf-metrics = ["openvm-circuit/perf-metrics", "openvm-transpiler/function-span"]
 # turns on stark-backend debugger in all proofs

diff --git a/crates/sdk/src/lib.rs b/crates/sdk/src/lib.rs
@@ -1,3 +1,5 @@
+#![cfg_attr(feature = "tco", allow(incomplete_features))]
+#![cfg_attr(feature = "tco", feature(explicit_tail_calls))]
 use std::{
     borrow::Borrow,
     fs::read,

diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml
@@ -68,6 +68,9 @@ basic-memory = []
 # turns on stark-backend debugger in all proofs
 stark-debug = []
 test-utils = ["openvm-stark-sdk"]
+# Tail call optimizations. This requires nightly for the `become` keyword (https://github.com/rust-lang/rust/pull/144232).
+# However, explicit tail calls are still an incomplete feature in Rust, so the `tco` feature remains experimental until they are stabilized.
+tco = ["openvm-circuit-derive/tco"]
 # performance features:
 mimalloc = ["openvm-stark-backend/mimalloc"]
 jemalloc = ["openvm-stark-backend/jemalloc"]

diff --git a/crates/vm/derive/Cargo.toml b/crates/vm/derive/Cargo.toml
@@ -10,7 +10,10 @@ license.workspace = true
 proc-macro = true
 
 [dependencies]
-syn = { version = "2.0", features = ["parsing"] }
+syn = { version = "2.0", features = ["parsing", "full"] }
 quote = "1.0"
 proc-macro2 = "1.0"
 itertools = { workspace = true }
+
+[features]
+tco = []
diff --git a/crates/vm/derive/src/lib.rs b/crates/vm/derive/src/lib.rs
@@ -9,6 +9,9 @@ use syn::{
     GenericParam, Ident, Meta, Token,
 };
 
+#[cfg(feature = "tco")]
+mod tco;
+
 #[proc_macro_derive(PreflightExecutor)]
 pub fn preflight_executor_derive(input: TokenStream) -> TokenStream {
     let ast: syn::DeriveInput = syn::parse(input).unwrap();
@@ -172,6 +175,18 @@ pub fn executor_derive(input: TokenStream) -> TokenStream {
                         Ctx: ::openvm_circuit::arch::execution_mode::ExecutionCtxTrait, {
                         self.0.pre_compute(pc, inst, data)
                     }
+
+                    #[cfg(feature = "tco")]
+                    fn handler<Ctx>(
+                        &self,
+                        pc: u32,
+                        inst: &::openvm_circuit::arch::instructions::instruction::Instruction<F>,
+                        data: &mut [u8],
+                    ) -> Result<::openvm_circuit::arch::Handler<F, Ctx>, ::openvm_circuit::arch::StaticProgramError>
+                    where
+                        Ctx: ::openvm_circuit::arch::execution_mode::ExecutionCtxTrait, {
+                        self.0.handler(pc, inst, data)
+                    }
                 }
             }
             .into()
@@ -205,18 +220,21 @@ pub fn executor_derive(input: TokenStream) -> TokenStream {
                 });
             // Use full path ::openvm_circuit... so it can be used either within or outside the vm
             // crate. Assume F is already generic of the field.
-            let (pre_compute_size_arms, pre_compute_arms, where_predicates): (Vec<_>, Vec<_>, Vec<_>) = multiunzip(variants.iter().map(|(variant_name, field)| {
+            let (pre_compute_size_arms, pre_compute_arms, handler_arms, where_predicates): (Vec<_>, Vec<_>, Vec<_>, Vec<_>) = multiunzip(variants.iter().map(|(variant_name, field)| {
                 let field_ty = &field.ty;
                 let pre_compute_size_arm = quote! {
                     #name::#variant_name(x) => <#field_ty as ::openvm_circuit::arch::Executor<#first_ty_generic>>::pre_compute_size(x)
                 };
                 let pre_compute_arm = quote! {
                     #name::#variant_name(x) => <#field_ty as ::openvm_circuit::arch::Executor<#first_ty_generic>>::pre_compute(x, pc, instruction, data)
                 };
+                let handler_arm = quote! {
+                    #name::#variant_name(x) => <#field_ty as ::openvm_circuit::arch::Executor<#first_ty_generic>>::handler(x, pc, instruction, data)
+                };
                 let where_predicate = syn::parse_quote! {
                     #field_ty: ::openvm_circuit::arch::Executor<#first_ty_generic>
                 };
-                (pre_compute_size_arm, pre_compute_arm, where_predicate)
+                (pre_compute_size_arm, pre_compute_arm, handler_arm, where_predicate)
             }));
             let where_clause = new_generics.make_where_clause();
             for predicate in where_predicates {
@@ -247,6 +265,20 @@ pub fn executor_derive(input: TokenStream) -> TokenStream {
                             #(#pre_compute_arms,)*
                         }
                     }
+
+                    #[cfg(feature = "tco")]
+                    fn handler<Ctx>(
+                        &self,
+                        pc: u32,
+                        instruction: &::openvm_circuit::arch::instructions::instruction::Instruction<F>,
+                        data: &mut [u8],
+                    ) -> Result<::openvm_circuit::arch::Handler<F, Ctx>, ::openvm_circuit::arch::StaticProgramError>
+                    where
+                        Ctx: ::openvm_circuit::arch::execution_mode::ExecutionCtxTrait, {
+                        match self {
+                            #(#handler_arms,)*
+                        }
+                    }
                 }
             }
             .into()
@@ -300,6 +332,18 @@ pub fn metered_executor_derive(input: TokenStream) -> TokenStream {
                         Ctx: ::openvm_circuit::arch::execution_mode::MeteredExecutionCtxTrait, {
                         self.0.metered_pre_compute(chip_idx, pc, inst, data)
                     }
+                    #[cfg(feature = "tco")]
+                    fn metered_handler<Ctx>(
+                        &self,
+                        chip_idx: usize,
+                        pc: u32,
+                        inst: &::openvm_circuit::arch::instructions::instruction::Instruction<F>,
+                        data: &mut [u8],
+                    ) -> Result<::openvm_circuit::arch::Handler<F, Ctx>, ::openvm_circuit::arch::StaticProgramError>
+                    where
+                        Ctx: ::openvm_circuit::arch::execution_mode::MeteredExecutionCtxTrait, {
+                        self.0.metered_handler(chip_idx, pc, inst, data)
+                    }
                 }
             }
                 .into()
@@ -333,18 +377,21 @@ pub fn metered_executor_derive(input: TokenStream) -> TokenStream {
                 });
             // Use full path ::openvm_circuit... so it can be used either within or outside the vm
             // crate. Assume F is already generic of the field.
-            let (pre_compute_size_arms, metered_pre_compute_arms, where_predicates): (Vec<_>, Vec<_>, Vec<_>) = multiunzip(variants.iter().map(|(variant_name, field)| {
+            let (pre_compute_size_arms, metered_pre_compute_arms, metered_handler_arms, where_predicates): (Vec<_>, Vec<_>, Vec<_>, Vec<_>) = multiunzip(variants.iter().map(|(variant_name, field)| {
                 let field_ty = &field.ty;
                 let pre_compute_size_arm = quote! {
                     #name::#variant_name(x) => <#field_ty as ::openvm_circuit::arch::MeteredExecutor<#first_ty_generic>>::metered_pre_compute_size(x)
                 };
                 let metered_pre_compute_arm = quote! {
                     #name::#variant_name(x) => <#field_ty as ::openvm_circuit::arch::MeteredExecutor<#first_ty_generic>>::metered_pre_compute(x, chip_idx, pc, instruction, data)
                 };
+                let metered_handler_arm = quote! {
+                    #name::#variant_name(x) => <#field_ty as ::openvm_circuit::arch::MeteredExecutor<#first_ty_generic>>::metered_handler(x, chip_idx, pc, instruction, data)
+                };
                 let where_predicate = syn::parse_quote! {
                     #field_ty: ::openvm_circuit::arch::MeteredExecutor<#first_ty_generic>
                 };
-                (pre_compute_size_arm, metered_pre_compute_arm, where_predicate)
+                (pre_compute_size_arm, metered_pre_compute_arm, metered_handler_arm, where_predicate)
             }));
             let where_clause = new_generics.make_where_clause();
             for predicate in where_predicates {
@@ -376,6 +423,21 @@ pub fn metered_executor_derive(input: TokenStream) -> TokenStream {
                             #(#metered_pre_compute_arms,)*
                         }
                     }
+
+                    #[cfg(feature = "tco")]
+                    fn metered_handler<Ctx>(
+                        &self,
+                        chip_idx: usize,
+                        pc: u32,
+                        instruction: &::openvm_circuit::arch::instructions::instruction::Instruction<F>,
+                        data: &mut [u8],
+                    ) -> Result<::openvm_circuit::arch::Handler<F, Ctx>, ::openvm_circuit::arch::StaticProgramError>
+                    where
+                        Ctx: ::openvm_circuit::arch::execution_mode::MeteredExecutionCtxTrait, {
+                        match self {
+                            #(#metered_handler_arms,)*
+                        }
+                    }
                 }
             }
                 .into()
@@ -501,7 +563,6 @@ fn generate_config_traits_impl(name: &Ident, inner: &DataStruct) -> syn::Result<
         .iter()
         .filter(|f| f.attrs.iter().any(|attr| attr.path().is_ident("config")))
         .exactly_one()
-        .clone()
         .expect("Exactly one field must have the #[config] attribute");
     let (source_name, source_name_upper) =
         gen_name_with_uppercase_idents(source_field.ident.as_ref().unwrap());
@@ -700,3 +761,44 @@ fn parse_executor_type(
         })
     }
 }
+
+/// An attribute procedural macro for creating TCO (Tail Call Optimization) handlers.
+///
+/// With the `tco` feature enabled, this macro generates a handler function that wraps an
+/// execute implementation with tail call optimization using the `become` keyword, taking its
+/// generics and where clauses from the original function. Otherwise the item is left unchanged.
+///
+/// # Usage
+///
+/// Place this attribute above a function definition:
+/// ```ignore
+/// #[create_tco_handler]
+/// unsafe fn execute_e1_impl<F: PrimeField32, CTX, const B_IS_IMM: bool>(
+///     pre_compute: &[u8],
+///     state: &mut VmExecState<F, GuestMemory, CTX>,
+/// ) where
+///     CTX: ExecutionCtxTrait,
+/// {
+///     // function body
+/// }
+/// ```
+///
+/// This will generate a TCO handler function with the same generics and where clauses.
+///
+/// # Safety
+///
+/// Do not use this macro if your function wants to terminate execution without error with a
+/// specific exit code. The handler generated by this macro assumes that execution should continue
+/// unless the execute_impl returns an error. This is done for performance to skip an exit code
+/// check.
+#[proc_macro_attribute]
+pub fn create_tco_handler(_attr: TokenStream, item: TokenStream) -> TokenStream {
+    #[cfg(feature = "tco")]
+    {
+        tco::tco_impl(item)
+    }
+    #[cfg(not(feature = "tco"))]
+    {
+        item
+    }
+}