diff --git a/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp b/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp index 30012502..8306536c 100644 --- a/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp +++ b/libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include // TensorRT headers @@ -101,208 +102,474 @@ static Logger gLogger; /// not both. namespace cudaq::qec { -class trt_decoder : public decoder { -private: - // TensorRT-specific members - std::unique_ptr engine_; - std::unique_ptr context_; - int input_index_ = 0; - int output_index_ = 0; - int input_size_ = 0; - int output_size_ = 0; - void *buffers_[2] = {nullptr, nullptr}; - cudaStream_t stream_; - bool initialized_ = false; +// ============================================================================ +// Executor implementations (internal) +// ============================================================================ + +namespace { +// Traditional TensorRT execution without CUDA graphs +struct TraditionalExecutor { + void execute(nvinfer1::IExecutionContext *context, cudaStream_t stream, + void *input_buffer, void *output_buffer, int input_index, + int output_index, nvinfer1::ICudaEngine *engine) { + context->setTensorAddress(engine->getIOTensorName(input_index), + input_buffer); + context->setTensorAddress(engine->getIOTensorName(output_index), + output_buffer); + context->enqueueV3(stream); + HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream)); + } +}; -public: - trt_decoder(const cudaqx::tensor &H, - const cudaqx::heterogeneous_map ¶ms) - : decoder(H), initialized_(false) { - // Decoder-specific constructor arguments can be placed in `params`. - - try { - // Validate parameters - trt_decoder_internal::validate_trt_decoder_parameters(params); - - // Check if CUDA is available - check_cuda(); - - bool has_engine_path = params.contains("engine_load_path"); - - if (has_engine_path) { - // Load pre-built TensorRT engine directly - std::string engine_path = params.get("engine_load_path"); - auto engineData = trt_decoder_internal::load_file(engine_path); - - // Create runtime and deserialize engine - auto runtime = std::unique_ptr( - nvinfer1::createInferRuntime(gLogger)); - if (!runtime) { - throw std::runtime_error("Failed to create TensorRT runtime"); - } - - engine_ = std::unique_ptr( - runtime->deserializeCudaEngine(engineData.data(), - engineData.size())); - if (!engine_) { - throw std::runtime_error( - "Failed to deserialize TensorRT engine from: " + engine_path); - } - } else { - // Load ONNX model and build engine - std::string onnx_model_path = params.get("onnx_load_path"); - engine_ = trt_decoder_internal::build_engine_from_onnx(onnx_model_path, - params, gLogger); - - // Save engine if requested - if (params.contains("engine_save_path")) { - std::string engine_save_path = - params.get("engine_save_path"); - trt_decoder_internal::save_engine_to_file(engine_.get(), - engine_save_path); - } - } +// CUDA graph-based execution for optimized performance +struct CudaGraphExecutor { + cudaGraph_t graph; + cudaGraphExec_t graph_exec; - // Create execution context - context_ = std::unique_ptr( - engine_->createExecutionContext()); - if (!context_) { - throw std::runtime_error("Failed to create execution context"); - } + // Constructor now takes ownership of pre-captured graph + CudaGraphExecutor(cudaGraph_t g, cudaGraphExec_t ge) + : graph(g), graph_exec(ge) {} - // Get input/output info - int n_bindings = engine_->getNbIOTensors(); - 
input_index_ = -1; - output_index_ = -1; - for (int i = 0; i < n_bindings; ++i) { - const char *tensorName = engine_->getIOTensorName(i); - if (engine_->getTensorIOMode(tensorName) == - nvinfer1::TensorIOMode::kINPUT) { - input_index_ = i; - } else { - output_index_ = i; - } - } + // Delete copy constructor and assignment to prevent double-free + CudaGraphExecutor(const CudaGraphExecutor &) = delete; + CudaGraphExecutor &operator=(const CudaGraphExecutor &) = delete; - if (input_index_ == -1 || output_index_ == -1) { - throw std::runtime_error("Failed to identify input/output tensors"); + // Move constructor - transfer ownership + CudaGraphExecutor(CudaGraphExecutor &&other) noexcept + : graph(other.graph), graph_exec(other.graph_exec) { + other.graph = nullptr; + other.graph_exec = nullptr; + } + + // Move assignment - transfer ownership + CudaGraphExecutor &operator=(CudaGraphExecutor &&other) noexcept { + if (this != &other) { + // Clean up existing resources + if (graph_exec) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphExecDestroy(graph_exec)); + } + if (graph) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphDestroy(graph)); } + // Transfer ownership + graph = other.graph; + graph_exec = other.graph_exec; + other.graph = nullptr; + other.graph_exec = nullptr; + } + return *this; + } - auto inputDims = - engine_->getTensorShape(engine_->getIOTensorName(input_index_)); - input_size_ = 1; - for (int j = 0; j < inputDims.nbDims; ++j) - input_size_ *= inputDims.d[j]; + void execute(nvinfer1::IExecutionContext *context, cudaStream_t stream, + void *input_buffer, void *output_buffer, int input_index, + int output_index, nvinfer1::ICudaEngine *engine) { + // Just launch the graph - no lazy capture needed! + HANDLE_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream)); + HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream)); + } - auto outputDims = - engine_->getTensorShape(engine_->getIOTensorName(output_index_)); - output_size_ = 1; - for (int j = 0; j < outputDims.nbDims; ++j) - output_size_ *= outputDims.d[j]; + ~CudaGraphExecutor() { + if (graph_exec) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphExecDestroy(graph_exec)); + } + if (graph) { + HANDLE_CUDA_ERROR_NO_THROW(cudaGraphDestroy(graph)); + } + } +}; - // Allocate GPU buffers - HANDLE_CUDA_ERROR( - cudaMalloc(&buffers_[input_index_], input_size_ * sizeof(float))); - HANDLE_CUDA_ERROR( - cudaMalloc(&buffers_[output_index_], output_size_ * sizeof(float))); +// Result structure for CUDA graph capture attempts +struct CaptureResult { + bool success = false; + cudaGraph_t graph = nullptr; + cudaGraphExec_t graph_exec = nullptr; + std::string error_message; +}; - // Create CUDA stream - HANDLE_CUDA_ERROR(cudaStreamCreate(&stream_)); +// Attempt to capture a CUDA graph for TensorRT inference +// Uses dummy input data to perform the capture during initialization +CaptureResult try_capture_cuda_graph(nvinfer1::IExecutionContext *context, + cudaStream_t stream, void *input_buffer, + void *output_buffer, int input_index, + int output_index, + nvinfer1::ICudaEngine *engine, + size_t input_size) { + CaptureResult result; + + try { + // Generate dummy input data (values don't matter for capture, just shape) + std::vector dummy_input(input_size, 0.0f); + + // Copy dummy data to GPU + cudaError_t err = + cudaMemcpy(input_buffer, dummy_input.data(), input_size * sizeof(float), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + result.error_message = + "Failed to copy dummy data: " + std::string(cudaGetErrorString(err)); + return result; + } - initialized_ = true; + // 
Attempt to capture the graph + CUDAQ_INFO("Attempting to capture CUDA graph during initialization..."); - } catch (const std::exception &e) { - CUDAQ_WARN("TensorRT initialization failed: {}", e.what()); - initialized_ = false; + err = cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + if (err != cudaSuccess) { + result.error_message = "cudaStreamBeginCapture failed: " + + std::string(cudaGetErrorString(err)); + return result; } - } - virtual decoder_result decode(const std::vector &syndrome) override { - decoder_result result{false, std::vector(output_size_, 0.0)}; - - if (!initialized_) { - // Return unconverged result if not properly initialized + // Record TensorRT operations + context->setTensorAddress(engine->getIOTensorName(input_index), + input_buffer); + context->setTensorAddress(engine->getIOTensorName(output_index), + output_buffer); + context->enqueueV3(stream); + + err = cudaStreamEndCapture(stream, &result.graph); + if (err != cudaSuccess) { + result.error_message = "cudaStreamEndCapture failed: " + + std::string(cudaGetErrorString(err)); return result; } - try { - // Preprocess syndrome data for TensorRT input - // Ensure input size matches expected TensorRT input size - assert(syndrome.size() == input_size_); - std::vector input_host(syndrome.begin(), syndrome.end()); - - // Copy input to GPU - HANDLE_CUDA_ERROR(cudaMemcpy(buffers_[input_index_], input_host.data(), - input_size_ * sizeof(float), - cudaMemcpyHostToDevice)); - - // Set tensor addresses for TensorRT V1 API - context_->setTensorAddress(engine_->getIOTensorName(input_index_), - buffers_[input_index_]); - context_->setTensorAddress(engine_->getIOTensorName(output_index_), - buffers_[output_index_]); - - // Run inference - context_->enqueueV3(stream_); - HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream_)); - - // Copy output back from GPU - std::vector output_host(output_size_); - HANDLE_CUDA_ERROR(cudaMemcpy(output_host.data(), buffers_[output_index_], - output_size_ * sizeof(float), - cudaMemcpyDeviceToHost)); - - // Postprocess output to get error probabilities - std::transform(output_host.begin(), output_host.end(), - result.result.begin(), - [](float val) { return static_cast(val); }); - - result.converged = true; - - } catch (const std::exception &e) { - CUDAQ_WARN("TensorRT inference failed: {}", e.what()); - result.converged = false; + // Instantiate the graph + err = cudaGraphInstantiate(&result.graph_exec, result.graph, 0); + if (err != cudaSuccess) { + result.error_message = "cudaGraphInstantiate failed: " + + std::string(cudaGetErrorString(err)); + if (result.graph) { + cudaGraphDestroy(result.graph); + result.graph = nullptr; + } + return result; } - return result; - } + CUDAQ_INFO("CUDA graph captured successfully during initialization"); + result.success = true; - virtual ~trt_decoder() { - // Clean up TensorRT resources - if (initialized_) { - HANDLE_CUDA_ERROR(cudaStreamDestroy(stream_)); - HANDLE_CUDA_ERROR(cudaFree(buffers_[input_index_])); - HANDLE_CUDA_ERROR(cudaFree(buffers_[output_index_])); - // TensorRT engine and context will be automatically destroyed by - // unique_ptr + } catch (const std::exception &e) { + result.error_message = "Exception during capture: " + std::string(e.what()); + // Clean up on failure + if (result.graph_exec) { + cudaGraphExecDestroy(result.graph_exec); + result.graph_exec = nullptr; + } + if (result.graph) { + cudaGraphDestroy(result.graph); + result.graph = nullptr; } } -private: - void check_cuda() { - int deviceCount = 0; - cudaError_t error = 
cudaGetDeviceCount(&deviceCount); - if (error != cudaSuccess || deviceCount == 0) { - throw std::runtime_error( - "CUDA is not available or no CUDA-capable devices found. " - "TensorRT decoder requires CUDA to be installed and at least one " - "CUDA-capable GPU. Error: " + - std::string(cudaGetErrorString(error))); + return result; +} + +// Check if CUDA graphs are supported for this engine +bool supports_cuda_graphs(const nvinfer1::ICudaEngine *engine) { + // Check for dynamic shapes + for (int i = 0; i < engine->getNbIOTensors(); ++i) { + const char *name = engine->getIOTensorName(i); + auto dims = engine->getTensorShape(name); + for (int j = 0; j < dims.nbDims; ++j) { + if (dims.d[j] == -1) { + CUDAQ_INFO( + "Dynamic shape detected in tensor '{}', CUDA graphs not supported", + name); + return false; + } } } + // Check for multiple optimization profiles (often used with dynamic shapes) + if (engine->getNbOptimizationProfiles() > 1) { + CUDAQ_INFO( + "Multiple optimization profiles detected, CUDA graphs not supported"); + return false; + } + + return true; +} +} // anonymous namespace + +// ============================================================================ +// trt_decoder implementation +// ============================================================================ + +class trt_decoder : public decoder { +private: + // Forward declaration of implementation + struct Impl; + std::unique_ptr impl_; + + // True when decoder is fully configured and ready for inference + bool decoder_ready_ = false; + public: + trt_decoder(const cudaqx::tensor &H, + const cudaqx::heterogeneous_map ¶ms); + + virtual decoder_result decode(const std::vector &syndrome) override; + + virtual ~trt_decoder(); + CUDAQ_EXTENSION_CUSTOM_CREATOR_FUNCTION( trt_decoder, static std::unique_ptr create( const cudaqx::tensor &H, const cudaqx::heterogeneous_map ¶ms) { return std::make_unique(H, params); }) + +private: + void check_cuda(); }; -} // namespace cudaq::qec +// ============================================================================ +// PIMPL Implementation struct +// ============================================================================ + +struct trt_decoder::Impl { + // TensorRT resources + std::unique_ptr engine; + std::unique_ptr context; + int input_index = 0; + int output_index = 0; + int input_size = 0; + int output_size = 0; + void *buffers[2] = {nullptr, nullptr}; + cudaStream_t stream; + + // Executor (chosen once at construction, never changes) + std::variant executor; + + // Execute inference (variant dispatch) + void execute_inference() { + std::visit( + [&](auto &exec) { + exec.execute(context.get(), stream, buffers[input_index], + buffers[output_index], input_index, output_index, + engine.get()); + }, + executor); + } -namespace cudaq::qec { + ~Impl() { + if (buffers[input_index]) { + HANDLE_CUDA_ERROR_NO_THROW(cudaFree(buffers[input_index])); + } + if (buffers[output_index]) { + HANDLE_CUDA_ERROR_NO_THROW(cudaFree(buffers[output_index])); + } + HANDLE_CUDA_ERROR_NO_THROW(cudaStreamDestroy(stream)); + } +}; + +// ============================================================================ +// trt_decoder method implementations +// ============================================================================ + +trt_decoder::trt_decoder(const cudaqx::tensor &H, + const cudaqx::heterogeneous_map ¶ms) + : decoder(H), decoder_ready_(false) { + + impl_ = std::make_unique(); + + try { + // Validate parameters + trt_decoder_internal::validate_trt_decoder_parameters(params); + + // Check if 
CUDA is available
+    check_cuda();
+
+    bool has_engine_path = params.contains("engine_load_path");
+
+    if (has_engine_path) {
+      // Load pre-built TensorRT engine directly
+      std::string engine_path = params.get<std::string>("engine_load_path");
+      auto engineData = trt_decoder_internal::load_file(engine_path);
+
+      // Create runtime and deserialize engine
+      auto runtime = std::unique_ptr<nvinfer1::IRuntime>(
+          nvinfer1::createInferRuntime(gLogger));
+      if (!runtime) {
+        throw std::runtime_error("Failed to create TensorRT runtime");
+      }
+
+      impl_->engine = std::unique_ptr<nvinfer1::ICudaEngine>(
+          runtime->deserializeCudaEngine(engineData.data(), engineData.size()));
+      if (!impl_->engine) {
+        throw std::runtime_error(
+            "Failed to deserialize TensorRT engine from: " + engine_path);
+      }
+    } else {
+      // Load ONNX model and build engine
+      std::string onnx_model_path = params.get<std::string>("onnx_load_path");
+      impl_->engine = trt_decoder_internal::build_engine_from_onnx(
+          onnx_model_path, params, gLogger);
+
+      // Save engine if requested
+      if (params.contains("engine_save_path")) {
+        std::string engine_save_path =
+            params.get<std::string>("engine_save_path");
+        trt_decoder_internal::save_engine_to_file(impl_->engine.get(),
+                                                  engine_save_path);
+      }
+    }
+
+    // Create execution context
+    impl_->context = std::unique_ptr<nvinfer1::IExecutionContext>(
+        impl_->engine->createExecutionContext());
+    if (!impl_->context) {
+      throw std::runtime_error("Failed to create execution context");
+    }
+
+    // Get input/output info
+    int n_bindings = impl_->engine->getNbIOTensors();
+    impl_->input_index = -1;
+    impl_->output_index = -1;
+    for (int i = 0; i < n_bindings; ++i) {
+      const char *tensorName = impl_->engine->getIOTensorName(i);
+      if (impl_->engine->getTensorIOMode(tensorName) ==
+          nvinfer1::TensorIOMode::kINPUT) {
+        impl_->input_index = i;
+      } else {
+        impl_->output_index = i;
+      }
+    }
+
+    if (impl_->input_index == -1 || impl_->output_index == -1) {
+      throw std::runtime_error("Failed to identify input/output tensors");
+    }
+
+    auto inputDims = impl_->engine->getTensorShape(
+        impl_->engine->getIOTensorName(impl_->input_index));
+    impl_->input_size = 1;
+    for (int j = 0; j < inputDims.nbDims; ++j)
+      impl_->input_size *= inputDims.d[j];
+
+    auto outputDims = impl_->engine->getTensorShape(
+        impl_->engine->getIOTensorName(impl_->output_index));
+    impl_->output_size = 1;
+    for (int j = 0; j < outputDims.nbDims; ++j)
+      impl_->output_size *= outputDims.d[j];
+
+    // Allocate GPU buffers
+    HANDLE_CUDA_ERROR(cudaMalloc(&impl_->buffers[impl_->input_index],
+                                 impl_->input_size * sizeof(float)));
+    HANDLE_CUDA_ERROR(cudaMalloc(&impl_->buffers[impl_->output_index],
+                                 impl_->output_size * sizeof(float)));
+
+    // Create CUDA stream
+    HANDLE_CUDA_ERROR(cudaStreamCreate(&impl_->stream));
+
+    // ========================================================================
+    // SELECT EXECUTOR (once, at construction - never changes)
+    // ========================================================================
+    bool use_cuda_graph = true; // default preference
+
+    // User override
+    if (params.contains("use_cuda_graph")) {
+      use_cuda_graph = params.get<bool>("use_cuda_graph");
+      if (!use_cuda_graph) {
+        CUDAQ_INFO("CUDA graphs explicitly disabled by user");
+      }
+    }
+
+    // Check engine compatibility
+    if (use_cuda_graph && !supports_cuda_graphs(impl_->engine.get())) {
+      CUDAQ_WARN("Model has dynamic shapes or multiple profiles, "
+                 "CUDA graphs not supported. Using traditional execution.");
+      use_cuda_graph = false;
+    }
+
+    // Attempt to capture CUDA graph if enabled
+    if (use_cuda_graph) {
+      auto capture_result = try_capture_cuda_graph(
+          impl_->context.get(), impl_->stream,
+          impl_->buffers[impl_->input_index],
+          impl_->buffers[impl_->output_index], impl_->input_index,
+          impl_->output_index, impl_->engine.get(), impl_->input_size);
+
+      if (capture_result.success) {
+        impl_->executor =
+            CudaGraphExecutor{capture_result.graph, capture_result.graph_exec};
+        CUDAQ_INFO("TensorRT decoder initialized with CUDA graph execution");
+      } else {
+        CUDAQ_WARN("CUDA graph capture failed: {}. Falling back to traditional "
+                   "execution.",
+                   capture_result.error_message);
+        impl_->executor = TraditionalExecutor{};
+      }
+    } else {
+      impl_->executor = TraditionalExecutor{};
+      CUDAQ_INFO("TensorRT decoder initialized with traditional execution");
+    }
+
+    // Decoder is now fully configured and ready for inference
+    decoder_ready_ = true;
+
+  } catch (const std::exception &e) {
+    CUDAQ_WARN("TensorRT initialization failed: {}", e.what());
+    decoder_ready_ = false;
+  }
+}
+
+decoder_result trt_decoder::decode(const std::vector<float_t> &syndrome) {
+  decoder_result result{false, std::vector<float_t>(impl_->output_size, 0.0)};
+
+  if (!decoder_ready_) {
+    // Return unconverged result if decoder is not ready
+    return result;
+  }
+
+  try {
+    // Preprocess syndrome data for TensorRT input
+    // Ensure input size matches expected TensorRT input size
+    assert(syndrome.size() == impl_->input_size);
+    std::vector<float> input_host(syndrome.begin(), syndrome.end());
+
+    // Copy input to GPU
+    HANDLE_CUDA_ERROR(
+        cudaMemcpy(impl_->buffers[impl_->input_index], input_host.data(),
+                   impl_->input_size * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Execute inference (variant handles both traditional and CUDA graph paths)
+    impl_->execute_inference();
+
+    // Copy output back from GPU
+    std::vector<float> output_host(impl_->output_size);
+    HANDLE_CUDA_ERROR(
+        cudaMemcpy(output_host.data(), impl_->buffers[impl_->output_index],
+                   impl_->output_size * sizeof(float), cudaMemcpyDeviceToHost));
+
+    // Postprocess output to get error probabilities
+    std::transform(output_host.begin(), output_host.end(),
+                   result.result.begin(),
+                   [](float val) { return static_cast<float_t>(val); });
+
+    result.converged = true;
+
+  } catch (const std::exception &e) {
+    CUDAQ_WARN("TensorRT inference failed: {}", e.what());
+    result.converged = false;
+  }
+
+  return result;
+}
+
+trt_decoder::~trt_decoder() = default;
+
+void trt_decoder::check_cuda() {
+  int deviceCount = 0;
+  cudaError_t error = cudaGetDeviceCount(&deviceCount);
+  if (error != cudaSuccess || deviceCount == 0) {
+    throw std::runtime_error(
+        "CUDA is not available or no CUDA-capable devices found. "
+        "TensorRT decoder requires CUDA to be installed and at least one "
+        "CUDA-capable GPU. Error: " +
+        std::string(cudaGetErrorString(error)));
+  }
+}
 
 CUDAQ_REGISTER_TYPE(trt_decoder)
diff --git a/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp b/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp
index 6f94793e..3d0a7cee 100644
--- a/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp
+++ b/libs/qec/unittests/decoders/trt_decoder/test_trt_decoder.cpp
@@ -9,6 +9,7 @@
 #include "trt_test_data.h"
 #include "cudaq/qec/decoder.h"
 #include "cudaq/qec/trt_decoder_internal.h"
+#include
 #include
 #include
 #include
@@ -297,6 +298,143 @@ TEST_F(TRTDecoderTest, ValidateSingleTestCase) {
   EXPECT_TRUE(result.converged) << "Decoder did not converge";
 }
 
+// Test performance comparison: CUDA Graph vs Traditional execution
+TEST_F(TRTDecoderTest, PerformanceComparisonCudaGraphVsTraditional) {
+  // Check if the ONNX model file exists
+  std::string onnx_path = "surface_code_decoder.onnx";
+  if (!std::filesystem::exists(onnx_path)) {
+    GTEST_SKIP() << "ONNX model file not found: " << onnx_path;
+  }
+
+  // Create dummy H matrix
+  std::size_t num_detectors = NUM_DETECTORS;
+  cudaqx::tensor<uint8_t> H({num_detectors, num_detectors});
+  for (std::size_t i = 0; i < num_detectors; ++i) {
+    H.at({i, i}) = 1;
+  }
+
+  // Create test syndrome
+  std::vector<cudaq::qec::float_t> syndrome(TEST_INPUTS[0].begin(),
+                                            TEST_INPUTS[0].end());
+
+  // =========================================================================
+  // Create decoder WITH CUDA graphs (default)
+  // =========================================================================
+  cudaqx::heterogeneous_map params_cuda_graph;
+  params_cuda_graph.insert("onnx_load_path", onnx_path);
+  params_cuda_graph.insert("precision", "fp16");
+  params_cuda_graph.insert("use_cuda_graph", true);
+
+  std::unique_ptr<cudaq::qec::decoder> decoder_cuda_graph;
+  try {
+    decoder_cuda_graph = decoder::get("trt_decoder", H, params_cuda_graph);
+  } catch (const std::exception &e) {
+    GTEST_SKIP() << "Failed to create CUDA graph decoder: " << e.what();
+  }
+
+  // =========================================================================
+  // Create decoder WITHOUT CUDA graphs (traditional)
+  // =========================================================================
+  cudaqx::heterogeneous_map params_traditional;
+  params_traditional.insert("onnx_load_path", onnx_path);
+  params_traditional.insert("precision", "fp16");
+  params_traditional.insert("use_cuda_graph", false);
+
+  std::unique_ptr<cudaq::qec::decoder> decoder_traditional;
+  try {
+    decoder_traditional = decoder::get("trt_decoder", H, params_traditional);
+  } catch (const std::exception &e) {
+    GTEST_SKIP() << "Failed to create traditional decoder: " << e.what();
+  }
+
+  // =========================================================================
+  // Warm-up phase (for fair comparison)
+  // =========================================================================
+  const int warmup_iterations = 5;
+  std::cout << "\n=== Warming up decoders ===" << std::endl;
+
+  for (int i = 0; i < warmup_iterations; ++i) {
+    decoder_cuda_graph->decode(syndrome);
+    decoder_traditional->decode(syndrome);
+  }
+
+  // =========================================================================
+  // Benchmark CUDA Graph Executor
+  // =========================================================================
+  const int benchmark_iterations = 200;
+  std::cout << "Benchmarking CUDA Graph executor..." << std::endl;
+
+  auto start_cuda_graph = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    auto result = decoder_cuda_graph->decode(syndrome);
+    ASSERT_TRUE(result.converged)
+        << "CUDA graph decoder failed at iteration " << i;
+  }
+  auto end_cuda_graph = std::chrono::high_resolution_clock::now();
+
+  auto duration_cuda_graph =
+      std::chrono::duration_cast<std::chrono::microseconds>(end_cuda_graph -
+                                                            start_cuda_graph);
+  double avg_time_cuda_graph =
+      duration_cuda_graph.count() / static_cast<double>(benchmark_iterations);
+
+  // =========================================================================
+  // Benchmark Traditional Executor
+  // =========================================================================
+  std::cout << "Benchmarking Traditional executor..." << std::endl;
+
+  auto start_traditional = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    auto result = decoder_traditional->decode(syndrome);
+    ASSERT_TRUE(result.converged)
+        << "Traditional decoder failed at iteration " << i;
+  }
+  auto end_traditional = std::chrono::high_resolution_clock::now();
+
+  auto duration_traditional =
+      std::chrono::duration_cast<std::chrono::microseconds>(end_traditional -
+                                                            start_traditional);
+  double avg_time_traditional =
+      duration_traditional.count() / static_cast<double>(benchmark_iterations);
+
+  // =========================================================================
+  // Calculate and report performance improvement
+  // =========================================================================
+  double speedup = avg_time_traditional / avg_time_cuda_graph;
+  double improvement_percent =
+      ((avg_time_traditional - avg_time_cuda_graph) / avg_time_traditional) *
+      100.0;
+
+  std::cout << "\n=== Performance Comparison Results ===" << std::endl;
+  std::cout << "Iterations: " << benchmark_iterations << std::endl;
+  std::cout << "CUDA Graph avg time: " << avg_time_cuda_graph << " μs"
+            << std::endl;
+  std::cout << "Traditional avg time: " << avg_time_traditional << " μs"
+            << std::endl;
+  std::cout << "Speedup: " << speedup << "x" << std::endl;
+  std::cout << "Improvement: " << improvement_percent << "%"
+            << std::endl;
+  std::cout << "======================================\n" << std::endl;
+
+  // =========================================================================
+  // Performance assertions
+  // =========================================================================
+  // CUDA graphs should provide at least 5% improvement
+  // (Conservative threshold - typical improvement is 10-20%)
+  EXPECT_GT(speedup, 1.05)
+      << "CUDA graph execution should be at least 5% faster than traditional. "
+      << "Speedup: " << speedup << "x, Improvement: " << improvement_percent
+      << "%";
+
+  // Sanity check: both should be reasonably fast (< 100ms per decode)
+  EXPECT_LT(avg_time_cuda_graph, 100000.0)
+      << "CUDA graph execution unexpectedly slow: " << avg_time_cuda_graph
+      << " μs";
+  EXPECT_LT(avg_time_traditional, 100000.0)
+      << "Traditional execution unexpectedly slow: " << avg_time_traditional
+      << " μs";
+}
+
 // Note: Constructor tests and parse_precision tests are disabled because they
 // require actual TensorRT/CUDA initialization which is not available in the
 // test environment. Only parameter validation and utility function tests are