diff --git a/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp b/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp index 30012502..8306536c 100644 --- a/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp +++ b/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include // TensorRT headers @@ -101,208 +102,474 @@ static Logger gLogger; /// not both. namespace cudaq::qec { -class trt_decoder : public decoder { -private: - // TensorRT-specific members - std::unique_ptr engine_; - std::unique_ptr context_; - int input_index_ = 0; - int output_index_ = 0; - int input_size_ = 0; - int output_size_ = 0; - void *buffers_[2] = {nullptr, nullptr}; - cudaStream_t stream_; - bool initialized_ = false; +// ============================================================================ +// Executor implementations (internal) +// ============================================================================ + +namespace { +// Traditional TensorRT execution without CUDA graphs +struct TraditionalExecutor { + void execute(nvinfer1::IExecutionContext *context, cudaStream_t stream, + void *input_buffer, void *output_buffer, int input_index, + int output_index, nvinfer1::ICudaEngine *engine) { + context->setTensorAddress(engine->getIOTensorName(input_index), + input_buffer); + context->setTensorAddress(engine->getIOTensorName(output_index), + output_buffer); + context->enqueueV3(stream); + HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream)); + } +}; -public: - trt_decoder(const cudaqx::tensor &H, - const cudaqx::heterogeneous_map ¶ms) - : decoder(H), initialized_(false) { - // Decoder-specific constructor arguments can be placed in `params`. - - try { - // Validate parameters - trt_decoder_internal::validate_trt_decoder_parameters(params); - - // Check if CUDA is available - check_cuda(); - - bool has_engine_path = params.contains("engine_load_path"); - - if (has_engine_path) { - // Load pre-built TensorRT engine directly - std::string engine_path = params.get("engine_load_path"); - auto engineData = trt_decoder_internal::load_file(engine_path); - - // Create runtime and deserialize engine - auto runtime = std::unique_ptr( - nvinfer1::createInferRuntime(gLogger)); - if (!runtime) { - throw std::runtime_error("Failed to create TensorRT runtime"); - } - - engine_ = std::unique_ptr( - runtime->deserializeCudaEngine(engineData.data(), - engineData.size())); - if (!engine_) { - throw std::runtime_error( - "Failed to deserialize TensorRT engine from: " + engine_path); - } - } else { - // Load ONNX model and build engine - std::string onnx_model_path = params.get("onnx_load_path"); - engine_ = trt_decoder_internal::build_engine_from_onnx(onnx_model_path, - params, gLogger); - - // Save engine if requested - if (params.contains("engine_save_path")) { - std::string engine_save_path = - params.get("engine_save_path"); - trt_decoder_internal::save_engine_to_file(engine_.get(), - engine_save_path); - } - } +// CUDA graph-based execution for optimized performance +struct CudaGraphExecutor { + cudaGraph_t graph; + cudaGraphExec_t graph_exec; - // Create execution context - context_ = std::unique_ptr( - engine_->createExecutionContext()); - if (!context_) { - throw std::runtime_error("Failed to create execution context"); - } + // Constructor now takes ownership of pre-captured graph + CudaGraphExecutor(cudaGraph_t g, cudaGraphExec_t ge) + : graph(g), graph_exec(ge) {} - // Get input/output info - int n_bindings = engine_->getNbIOTensors(); - 
input_index_ = -1; - output_index_ = -1; - for (int i = 0; i < n_bindings; ++i) { - const char *tensorName = engine_->getIOTensorName(i); - if (engine_->getTensorIOMode(tensorName) == - nvinfer1::TensorIOMode::kINPUT) { - input_index_ = i; - } else { - output_index_ = i; - } - } + // Delete copy constructor and assignment to prevent double-free + CudaGraphExecutor(const CudaGraphExecutor &) = delete; + CudaGraphExecutor &operator=(const CudaGraphExecutor &) = delete; - if (input_index_ == -1 || output_index_ == -1) { - throw std::runtime_error("Failed to identify input/output tensors"); + // Move constructor - transfer ownership + CudaGraphExecutor(CudaGraphExecutor &&other) noexcept + : graph(other.graph), graph_exec(other.graph_exec) { + other.graph = nullptr; + other.graph_exec = nullptr; + } + + // Move assignment - transfer ownership + CudaGraphExecutor &operator=(CudaGraphExecutor &&other) noexcept { + if (this != &other) { + // Clean up existing resources + if (graph_exec) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphExecDestroy(graph_exec)); + } + if (graph) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphDestroy(graph)); } + // Transfer ownership + graph = other.graph; + graph_exec = other.graph_exec; + other.graph = nullptr; + other.graph_exec = nullptr; + } + return *this; + } - auto inputDims = - engine_->getTensorShape(engine_->getIOTensorName(input_index_)); - input_size_ = 1; - for (int j = 0; j < inputDims.nbDims; ++j) - input_size_ *= inputDims.d[j]; + void execute(nvinfer1::IExecutionContext *context, cudaStream_t stream, + void *input_buffer, void *output_buffer, int input_index, + int output_index, nvinfer1::ICudaEngine *engine) { + // Just launch the graph - no lazy capture needed! + HANDLE_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream)); + HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream)); + } - auto outputDims = - engine_->getTensorShape(engine_->getIOTensorName(output_index_)); - output_size_ = 1; - for (int j = 0; j < outputDims.nbDims; ++j) - output_size_ *= outputDims.d[j]; + ~CudaGraphExecutor() { + if (graph_exec) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphExecDestroy(graph_exec)); + } + if (graph) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphDestroy(graph)); + } + } +}; - // Allocate GPU buffers - HANDLE_CUDA_ERROR( - cudaMalloc(&buffers_[input_index_], input_size_ * sizeof(float))); - HANDLE_CUDA_ERROR( - cudaMalloc(&buffers_[output_index_], output_size_ * sizeof(float))); +// Result structure for CUDA graph capture attempts +struct CaptureResult { + bool success = false; + cudaGraph_t graph = nullptr; + cudaGraphExec_t graph_exec = nullptr; + std::string error_message; +}; - // Create CUDA stream - HANDLE_CUDA_ERROR(cudaStreamCreate(&stream_)); +// Attempt to capture a CUDA graph for TensorRT inference +// Uses dummy input data to perform the capture during initialization +CaptureResult try_capture_cuda_graph(nvinfer1::IExecutionContext *context, + cudaStream_t stream, void *input_buffer, + void *output_buffer, int input_index, + int output_index, + nvinfer1::ICudaEngine *engine, + size_t input_size) { + CaptureResult result; + + try { + // Generate dummy input data (values don't matter for capture, just shape) + std::vector dummy_input(input_size, 0.0f); + + // Copy dummy data to GPU + cudaError_t err = + cudaMemcpy(input_buffer, dummy_input.data(), input_size * sizeof(float), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + result.error_message = + "Failed to copy dummy data: " + std::string(cudaGetErrorString(err)); + return result; + } - initialized_ = true; + // 
Attempt to capture the graph + CUDAQ_INFO("Attempting to capture CUDA graph during initialization..."); - } catch (const std::exception &e) { - CUDAQ_WARN("TensorRT initialization failed: {}", e.what()); - initialized_ = false; + err = cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + if (err != cudaSuccess) { + result.error_message = "cudaStreamBeginCapture failed: " + + std::string(cudaGetErrorString(err)); + return result; } - } - virtual decoder_result decode(const std::vector &syndrome) override { - decoder_result result{false, std::vector(output_size_, 0.0)}; - - if (!initialized_) { - // Return unconverged result if not properly initialized + // Record TensorRT operations + context->setTensorAddress(engine->getIOTensorName(input_index), + input_buffer); + context->setTensorAddress(engine->getIOTensorName(output_index), + output_buffer); + context->enqueueV3(stream); + + err = cudaStreamEndCapture(stream, &result.graph); + if (err != cudaSuccess) { + result.error_message = "cudaStreamEndCapture failed: " + + std::string(cudaGetErrorString(err)); return result; } - try { - // Preprocess syndrome data for TensorRT input - // Ensure input size matches expected TensorRT input size - assert(syndrome.size() == input_size_); - std::vector input_host(syndrome.begin(), syndrome.end()); - - // Copy input to GPU - HANDLE_CUDA_ERROR(cudaMemcpy(buffers_[input_index_], input_host.data(), - input_size_ * sizeof(float), - cudaMemcpyHostToDevice)); - - // Set tensor addresses for TensorRT V1 API - context_->setTensorAddress(engine_->getIOTensorName(input_index_), - buffers_[input_index_]); - context_->setTensorAddress(engine_->getIOTensorName(output_index_), - buffers_[output_index_]); - - // Run inference - context_->enqueueV3(stream_); - HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream_)); - - // Copy output back from GPU - std::vector output_host(output_size_); - HANDLE_CUDA_ERROR(cudaMemcpy(output_host.data(), buffers_[output_index_], - output_size_ * sizeof(float), - cudaMemcpyDeviceToHost)); - - // Postprocess output to get error probabilities - std::transform(output_host.begin(), output_host.end(), - result.result.begin(), - [](float val) { return static_cast(val); }); - - result.converged = true; - - } catch (const std::exception &e) { - CUDAQ_WARN("TensorRT inference failed: {}", e.what()); - result.converged = false; + // Instantiate the graph + err = cudaGraphInstantiate(&result.graph_exec, result.graph, 0); + if (err != cudaSuccess) { + result.error_message = "cudaGraphInstantiate failed: " + + std::string(cudaGetErrorString(err)); + if (result.graph) { + cudaGraphDestroy(result.graph); + result.graph = nullptr; + } + return result; } - return result; - } + CUDAQ_INFO("CUDA graph captured successfully during initialization"); + result.success = true; - virtual ~trt_decoder() { - // Clean up TensorRT resources - if (initialized_) { - HANDLE_CUDA_ERROR(cudaStreamDestroy(stream_)); - HANDLE_CUDA_ERROR(cudaFree(buffers_[input_index_])); - HANDLE_CUDA_ERROR(cudaFree(buffers_[output_index_])); - // TensorRT engine and context will be automatically destroyed by - // unique_ptr + } catch (const std::exception &e) { + result.error_message = "Exception during capture: " + std::string(e.what()); + // Clean up on failure + if (result.graph_exec) { + cudaGraphExecDestroy(result.graph_exec); + result.graph_exec = nullptr; + } + if (result.graph) { + cudaGraphDestroy(result.graph); + result.graph = nullptr; } } -private: - void check_cuda() { - int deviceCount = 0; - cudaError_t error = 
cudaGetDeviceCount(&deviceCount); - if (error != cudaSuccess || deviceCount == 0) { - throw std::runtime_error( - "CUDA is not available or no CUDA-capable devices found. " - "TensorRT decoder requires CUDA to be installed and at least one " - "CUDA-capable GPU. Error: " + - std::string(cudaGetErrorString(error))); + return result; +} + +// Check if CUDA graphs are supported for this engine +bool supports_cuda_graphs(const nvinfer1::ICudaEngine *engine) { + // Check for dynamic shapes + for (int i = 0; i < engine->getNbIOTensors(); ++i) { + const char *name = engine->getIOTensorName(i); + auto dims = engine->getTensorShape(name); + for (int j = 0; j < dims.nbDims; ++j) { + if (dims.d[j] == -1) { + CUDAQ_INFO( + "Dynamic shape detected in tensor '{}', CUDA graphs not supported", + name); + return false; + } } } + // Check for multiple optimization profiles (often used with dynamic shapes) + if (engine->getNbOptimizationProfiles() > 1) { + CUDAQ_INFO( + "Multiple optimization profiles detected, CUDA graphs not supported"); + return false; + } + + return true; +} +} // anonymous namespace + +// ============================================================================ +// trt_decoder implementation +// ============================================================================ + +class trt_decoder : public decoder { +private: + // Forward declaration of implementation + struct Impl; + std::unique_ptr impl_; + + // True when decoder is fully configured and ready for inference + bool decoder_ready_ = false; + public: + trt_decoder(const cudaqx::tensor &H, + const cudaqx::heterogeneous_map ¶ms); + + virtual decoder_result decode(const std::vector &syndrome) override; + + virtual ~trt_decoder(); + CUDAQ_EXTENSION_CUSTOM_CREATOR_FUNCTION( trt_decoder, static std::unique_ptr create( const cudaqx::tensor &H, const cudaqx::heterogeneous_map ¶ms) { return std::make_unique(H, params); }) + +private: + void check_cuda(); }; -} // namespace cudaq::qec +// ============================================================================ +// PIMPL Implementation struct +// ============================================================================ + +struct trt_decoder::Impl { + // TensorRT resources + std::unique_ptr engine; + std::unique_ptr context; + int input_index = 0; + int output_index = 0; + int input_size = 0; + int output_size = 0; + void *buffers[2] = {nullptr, nullptr}; + cudaStream_t stream; + + // Executor (chosen once at construction, never changes) + std::variant executor; + + // Execute inference (variant dispatch) + void execute_inference() { + std::visit( + [&](auto &exec) { + exec.execute(context.get(), stream, buffers[input_index], + buffers[output_index], input_index, output_index, + engine.get()); + }, + executor); + } -namespace cudaq::qec { + ~Impl() { + if (buffers[input_index]) { + HANDLE_CUDA_ERROR_NO_THROW(cudaFree(buffers[input_index])); + } + if (buffers[output_index]) { + HANDLE_CUDA_ERROR_NO_THROW(cudaFree(buffers[output_index])); + } + HANDLE_CUDA_ERROR_NO_THROW(cudaStreamDestroy(stream)); + } +}; + +// ============================================================================ +// trt_decoder method implementations +// ============================================================================ + +trt_decoder::trt_decoder(const cudaqx::tensor &H, + const cudaqx::heterogeneous_map ¶ms) + : decoder(H), decoder_ready_(false) { + + impl_ = std::make_unique(); + + try { + // Validate parameters + trt_decoder_internal::validate_trt_decoder_parameters(params); + + // Check if 
CUDA is available
+    check_cuda();
+
+    bool has_engine_path = params.contains("engine_load_path");
+
+    if (has_engine_path) {
+      // Load pre-built TensorRT engine directly
+      std::string engine_path = params.get<std::string>("engine_load_path");
+      auto engineData = trt_decoder_internal::load_file(engine_path);
+
+      // Create runtime and deserialize engine
+      auto runtime = std::unique_ptr<nvinfer1::IRuntime>(
+          nvinfer1::createInferRuntime(gLogger));
+      if (!runtime) {
+        throw std::runtime_error("Failed to create TensorRT runtime");
+      }
+
+      impl_->engine = std::unique_ptr<nvinfer1::ICudaEngine>(
+          runtime->deserializeCudaEngine(engineData.data(), engineData.size()));
+      if (!impl_->engine) {
+        throw std::runtime_error(
+            "Failed to deserialize TensorRT engine from: " + engine_path);
+      }
+    } else {
+      // Load ONNX model and build engine
+      std::string onnx_model_path = params.get<std::string>("onnx_load_path");
+      impl_->engine = trt_decoder_internal::build_engine_from_onnx(
+          onnx_model_path, params, gLogger);
+
+      // Save engine if requested
+      if (params.contains("engine_save_path")) {
+        std::string engine_save_path =
+            params.get<std::string>("engine_save_path");
+        trt_decoder_internal::save_engine_to_file(impl_->engine.get(),
+                                                  engine_save_path);
+      }
+    }
+
+    // Create execution context
+    impl_->context = std::unique_ptr<nvinfer1::IExecutionContext>(
+        impl_->engine->createExecutionContext());
+    if (!impl_->context) {
+      throw std::runtime_error("Failed to create execution context");
+    }
+
+    // Get input/output info
+    int n_bindings = impl_->engine->getNbIOTensors();
+    impl_->input_index = -1;
+    impl_->output_index = -1;
+    for (int i = 0; i < n_bindings; ++i) {
+      const char *tensorName = impl_->engine->getIOTensorName(i);
+      if (impl_->engine->getTensorIOMode(tensorName) ==
+          nvinfer1::TensorIOMode::kINPUT) {
+        impl_->input_index = i;
+      } else {
+        impl_->output_index = i;
+      }
+    }
+
+    if (impl_->input_index == -1 || impl_->output_index == -1) {
+      throw std::runtime_error("Failed to identify input/output tensors");
+    }
+
+    auto inputDims = impl_->engine->getTensorShape(
+        impl_->engine->getIOTensorName(impl_->input_index));
+    impl_->input_size = 1;
+    for (int j = 0; j < inputDims.nbDims; ++j)
+      impl_->input_size *= inputDims.d[j];
+
+    auto outputDims = impl_->engine->getTensorShape(
+        impl_->engine->getIOTensorName(impl_->output_index));
+    impl_->output_size = 1;
+    for (int j = 0; j < outputDims.nbDims; ++j)
+      impl_->output_size *= outputDims.d[j];
+
+    // Allocate GPU buffers
+    HANDLE_CUDA_ERROR(cudaMalloc(&impl_->buffers[impl_->input_index],
+                                 impl_->input_size * sizeof(float)));
+    HANDLE_CUDA_ERROR(cudaMalloc(&impl_->buffers[impl_->output_index],
+                                 impl_->output_size * sizeof(float)));
+
+    // Create CUDA stream
+    HANDLE_CUDA_ERROR(cudaStreamCreate(&impl_->stream));
+
+    // ========================================================================
+    // SELECT EXECUTOR (once, at construction - never changes)
+    // ========================================================================
+    bool use_cuda_graph = true; // default preference
+
+    // User override
+    if (params.contains("use_cuda_graph")) {
+      use_cuda_graph = params.get<bool>("use_cuda_graph");
+      if (!use_cuda_graph) {
+        CUDAQ_INFO("CUDA graphs explicitly disabled by user");
+      }
+    }
+
+    // Check engine compatibility
+    if (use_cuda_graph && !supports_cuda_graphs(impl_->engine.get())) {
+      CUDAQ_WARN("Model has dynamic shapes or multiple profiles, "
+                 "CUDA graphs not supported. Using traditional execution.");
+      use_cuda_graph = false;
+    }
+
+    // Attempt to capture CUDA graph if enabled
+    if (use_cuda_graph) {
+      auto capture_result = try_capture_cuda_graph(
+          impl_->context.get(), impl_->stream,
+          impl_->buffers[impl_->input_index],
+          impl_->buffers[impl_->output_index], impl_->input_index,
+          impl_->output_index, impl_->engine.get(), impl_->input_size);
+
+      if (capture_result.success) {
+        impl_->executor =
+            CudaGraphExecutor{capture_result.graph, capture_result.graph_exec};
+        CUDAQ_INFO("TensorRT decoder initialized with CUDA graph execution");
+      } else {
+        CUDAQ_WARN("CUDA graph capture failed: {}. Falling back to traditional "
+                   "execution.",
+                   capture_result.error_message);
+        impl_->executor = TraditionalExecutor{};
+      }
+    } else {
+      impl_->executor = TraditionalExecutor{};
+      CUDAQ_INFO("TensorRT decoder initialized with traditional execution");
+    }
+
+    // Decoder is now fully configured and ready for inference
+    decoder_ready_ = true;
+
+  } catch (const std::exception &e) {
+    CUDAQ_WARN("TensorRT initialization failed: {}", e.what());
+    decoder_ready_ = false;
+  }
+}
+
+decoder_result trt_decoder::decode(const std::vector<float_t> &syndrome) {
+  decoder_result result{false, std::vector<float_t>(impl_->output_size, 0.0)};
+
+  if (!decoder_ready_) {
+    // Return unconverged result if decoder is not ready
+    return result;
+  }
+
+  try {
+    // Preprocess syndrome data for TensorRT input
+    // Ensure input size matches expected TensorRT input size
+    assert(syndrome.size() == impl_->input_size);
+    std::vector<float> input_host(syndrome.begin(), syndrome.end());
+
+    // Copy input to GPU
+    HANDLE_CUDA_ERROR(
+        cudaMemcpy(impl_->buffers[impl_->input_index], input_host.data(),
+                   impl_->input_size * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Execute inference (variant handles both traditional and CUDA graph paths)
+    impl_->execute_inference();
+
+    // Copy output back from GPU
+    std::vector<float> output_host(impl_->output_size);
+    HANDLE_CUDA_ERROR(
+        cudaMemcpy(output_host.data(), impl_->buffers[impl_->output_index],
+                   impl_->output_size * sizeof(float), cudaMemcpyDeviceToHost));
+
+    // Postprocess output to get error probabilities
+    std::transform(output_host.begin(), output_host.end(),
+                   result.result.begin(),
+                   [](float val) { return static_cast<float_t>(val); });
+
+    result.converged = true;
+
+  } catch (const std::exception &e) {
+    CUDAQ_WARN("TensorRT inference failed: {}", e.what());
+    result.converged = false;
+  }
+
+  return result;
+}
+
+trt_decoder::~trt_decoder() = default;
+
+void trt_decoder::check_cuda() {
+  int deviceCount = 0;
+  cudaError_t error = cudaGetDeviceCount(&deviceCount);
+  if (error != cudaSuccess || deviceCount == 0) {
+    throw std::runtime_error(
+        "CUDA is not available or no CUDA-capable devices found. "
+        "TensorRT decoder requires CUDA to be installed and at least one "
+        "CUDA-capable GPU. Error: " +
+        std::string(cudaGetErrorString(error)));
+  }
+}
 
 CUDAQ_REGISTER_TYPE(trt_decoder)
diff --git a/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp b/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp
index 6f94793e..3d0a7cee 100644
--- a/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp
+++ b/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp
@@ -9,6 +9,7 @@
 #include "trt_test_data.h"
 #include "cudaq/qec/decoder.h"
 #include "cudaq/qec/trt_decoder_internal.h"
+#include
 #include
 #include
 #include
@@ -297,6 +298,143 @@ TEST_F(TRTDecoderTest, ValidateSingleTestCase) {
   EXPECT_TRUE(result.converged) << "Decoder did not converge";
 }
 
+// Test performance comparison: CUDA Graph vs Traditional execution
+TEST_F(TRTDecoderTest, PerformanceComparisonCudaGraphVsTraditional) {
+  // Check if the ONNX model file exists
+  std::string onnx_path = "surface_code_decoder.onnx";
+  if (!std::filesystem::exists(onnx_path)) {
+    GTEST_SKIP() << "ONNX model file not found: " << onnx_path;
+  }
+
+  // Create dummy H matrix
+  std::size_t num_detectors = NUM_DETECTORS;
+  cudaqx::tensor<uint8_t> H({num_detectors, num_detectors});
+  for (std::size_t i = 0; i < num_detectors; ++i) {
+    H.at({i, i}) = 1;
+  }
+
+  // Create test syndrome
+  std::vector<cudaq::qec::float_t> syndrome(TEST_INPUTS[0].begin(),
+                                            TEST_INPUTS[0].end());
+
+  // =========================================================================
+  // Create decoder WITH CUDA graphs (default)
+  // =========================================================================
+  cudaqx::heterogeneous_map params_cuda_graph;
+  params_cuda_graph.insert("onnx_load_path", onnx_path);
+  params_cuda_graph.insert("precision", "fp16");
+  params_cuda_graph.insert("use_cuda_graph", true);
+
+  std::unique_ptr<cudaq::qec::decoder> decoder_cuda_graph;
+  try {
+    decoder_cuda_graph = decoder::get("trt_decoder", H, params_cuda_graph);
+  } catch (const std::exception &e) {
+    GTEST_SKIP() << "Failed to create CUDA graph decoder: " << e.what();
+  }
+
+  // =========================================================================
+  // Create decoder WITHOUT CUDA graphs (traditional)
+  // =========================================================================
+  cudaqx::heterogeneous_map params_traditional;
+  params_traditional.insert("onnx_load_path", onnx_path);
+  params_traditional.insert("precision", "fp16");
+  params_traditional.insert("use_cuda_graph", false);
+
+  std::unique_ptr<cudaq::qec::decoder> decoder_traditional;
+  try {
+    decoder_traditional = decoder::get("trt_decoder", H, params_traditional);
+  } catch (const std::exception &e) {
+    GTEST_SKIP() << "Failed to create traditional decoder: " << e.what();
+  }
+
+  // =========================================================================
+  // Warm-up phase (for fair comparison)
+  // =========================================================================
+  const int warmup_iterations = 5;
+  std::cout << "\n=== Warming up decoders ===" << std::endl;
+
+  for (int i = 0; i < warmup_iterations; ++i) {
+    decoder_cuda_graph->decode(syndrome);
+    decoder_traditional->decode(syndrome);
+  }
+
+  // =========================================================================
+  // Benchmark CUDA Graph Executor
+  // =========================================================================
+  const int benchmark_iterations = 200;
+  std::cout << "Benchmarking CUDA Graph executor..." << std::endl;
+
+  auto start_cuda_graph = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    auto result = decoder_cuda_graph->decode(syndrome);
+    ASSERT_TRUE(result.converged)
+        << "CUDA graph decoder failed at iteration " << i;
+  }
+  auto end_cuda_graph = std::chrono::high_resolution_clock::now();
+
+  auto duration_cuda_graph =
+      std::chrono::duration_cast<std::chrono::microseconds>(end_cuda_graph -
+                                                            start_cuda_graph);
+  double avg_time_cuda_graph =
+      duration_cuda_graph.count() / static_cast<double>(benchmark_iterations);
+
+  // =========================================================================
+  // Benchmark Traditional Executor
+  // =========================================================================
+  std::cout << "Benchmarking Traditional executor..." << std::endl;
+
+  auto start_traditional = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    auto result = decoder_traditional->decode(syndrome);
+    ASSERT_TRUE(result.converged)
+        << "Traditional decoder failed at iteration " << i;
+  }
+  auto end_traditional = std::chrono::high_resolution_clock::now();
+
+  auto duration_traditional =
+      std::chrono::duration_cast<std::chrono::microseconds>(end_traditional -
+                                                            start_traditional);
+  double avg_time_traditional =
+      duration_traditional.count() / static_cast<double>(benchmark_iterations);
+
+  // =========================================================================
+  // Calculate and report performance improvement
+  // =========================================================================
+  double speedup = avg_time_traditional / avg_time_cuda_graph;
+  double improvement_percent =
+      ((avg_time_traditional - avg_time_cuda_graph) / avg_time_traditional) *
+      100.0;
+
+  std::cout << "\n=== Performance Comparison Results ===" << std::endl;
+  std::cout << "Iterations: " << benchmark_iterations << std::endl;
+  std::cout << "CUDA Graph avg time: " << avg_time_cuda_graph << " μs"
+            << std::endl;
+  std::cout << "Traditional avg time: " << avg_time_traditional << " μs"
+            << std::endl;
+  std::cout << "Speedup: " << speedup << "x" << std::endl;
+  std::cout << "Improvement: " << improvement_percent << "%"
+            << std::endl;
+  std::cout << "======================================\n" << std::endl;
+
+  // =========================================================================
+  // Performance assertions
+  // =========================================================================
+  // CUDA graphs should provide at least 5% improvement
+  // (Conservative threshold - typical improvement is 10-20%)
+  EXPECT_GT(speedup, 1.05)
+      << "CUDA graph execution should be at least 5% faster than traditional. "
+      << "Speedup: " << speedup << "x, Improvement: " << improvement_percent
+      << "%";
+
+  // Sanity check: both should be reasonably fast (< 100ms per decode)
+  EXPECT_LT(avg_time_cuda_graph, 100000.0)
+      << "CUDA graph execution unexpectedly slow: " << avg_time_cuda_graph
+      << " μs";
+  EXPECT_LT(avg_time_traditional, 100000.0)
+      << "Traditional execution unexpectedly slow: " << avg_time_traditional
+      << " μs";
+}
+
 // Note: Constructor tests and parse_precision tests are disabled because they
 // require actual TensorRT/CUDA initialization which is not available in the
 // test environment. Only parameter validation and utility function tests are