diff --git a/backends/cuda/tests/multimodal_benchmark.cpp b/backends/cuda/tests/multimodal_benchmark.cpp
index 679db889b71..2ed74bbfc04 100644
--- a/backends/cuda/tests/multimodal_benchmark.cpp
+++ b/backends/cuda/tests/multimodal_benchmark.cpp
@@ -17,6 +17,9 @@
 #include
 #include
 
+#include
+#include
+
 namespace {
 
 using executorch::aten::ScalarType;
@@ -201,8 +204,25 @@ TensorPtr create_fallback_text_embedding(const ModelConfig& config) {
 struct MethodTiming {
   double load_ms{0.0};
   double run_ms{0.0};
+  // Growth in used GPU memory across the method run: the after-run sample
+  // minus the before-run sample, clamped at 0. Not a true high-water mark.
+  size_t peak_gpu_memory_bytes{0};
 };
 
+// Returns the GPU memory currently in use, in bytes, computed as total - free
+// from cudaMemGetInfo. Logs a warning and returns 0 if the query fails.
+size_t get_gpu_memory_used() {
+  size_t free_bytes = 0;
+  size_t total_bytes = 0;
+  cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes);
+  if (status != cudaSuccess) {
+    std::cerr << "Warning: cudaMemGetInfo failed: "
+              << cudaGetErrorString(status) << std::endl;
+    return 0;
+  }
+  return total_bytes - free_bytes;
+}
+
 enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN };
 
 MethodCategory categorize_method(const std::string& method_name) {
@@ -306,6 +326,10 @@ Error execute_method(
   std::vector inputs = create_inputs_for_method(
       method_name, category, model_type, config, token_output, owned_inputs);
 
+  // Baseline sample of used GPU memory, taken after a device synchronize.
+  cudaDeviceSynchronize();
+  size_t mem_before = get_gpu_memory_used();
+
   const auto run_start = Clock::now();
   ET_LOG(Info, "%s running", method_name.c_str());
   Result> output_result =
@@ -314,6 +338,13 @@ Error execute_method(
   const auto run_end = Clock::now();
   timing.run_ms = DurationMs(run_end - run_start).count();
 
+  // Sample again after the run. The difference is clamped at 0 so the unsigned
+  // subtraction cannot wrap if the second sample is smaller than the first.
+  cudaDeviceSynchronize();
+  size_t mem_after = get_gpu_memory_used();
+  timing.peak_gpu_memory_bytes =
+      mem_after > mem_before ? (mem_after - mem_before) : 0;
+
   if (output_result.error() != Error::Ok) {
     std::cerr << method_name << " execution failed: error code "
               << static_cast(output_result.error()) << std::endl;
@@ -457,6 +488,13 @@ int main(int argc, char** argv) {
       std::cout << " " << name << ": " << timing.run_ms << std::endl;
     }
 
+    std::cout << "\nPeak GPU memory usage:" << std::endl;
+    for (const auto& [name, timing] : timings) {
+      double memory_mb = timing.peak_gpu_memory_bytes / (1024.0 * 1024.0);
+      std::cout << " " << name << ": " << memory_mb << " MB ("
+                << timing.peak_gpu_memory_bytes << " bytes)" << std::endl;
+    }
+
     return 0;
   } catch (const std::exception& ex) {
     std::cerr << "Unhandled exception: " << ex.what() << std::endl;
diff --git a/examples/models/gemma3/e2e_runner.cpp b/examples/models/gemma3/e2e_runner.cpp
index 68f19e8296d..18e5f14e405 100644
--- a/examples/models/gemma3/e2e_runner.cpp
+++ b/examples/models/gemma3/e2e_runner.cpp
@@ -23,6 +23,8 @@
 #include
 #include
 
+#include
+
 #define STB_IMAGE_IMPLEMENTATION
 #include
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
@@ -67,6 +69,22 @@ using ::executorch::extension::llm::make_text_input;
 using ::executorch::extension::llm::MultimodalInput;
 using ::executorch::runtime::EValue;
 
+// Returns the GPU memory currently in use, in bytes, computed as total - free
+// from cudaMemGetInfo. Logs an error and returns 0 if the query fails.
+size_t get_gpu_memory_used() {
+  size_t free_bytes = 0;
+  size_t total_bytes = 0;
+  cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes);
+  if (status != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "Warning: cudaMemGetInfo failed: %s",
+        cudaGetErrorString(status));
+    return 0;
+  }
+  return total_bytes - free_bytes;
+}
+
 bool ends_with(const std::string& str, const std::string& suffix) {
   return str.size() >= suffix.size() &&
       str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
@@ -200,6 +218,14 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
+  // Measure memory before loading
+  cudaDeviceSynchronize();
+  size_t mem_before_load = get_gpu_memory_used();
+  ET_LOG(
+      Info,
+      "GPU memory before loading: %.2f MB",
+      mem_before_load / (1024.0 * 1024.0));
+
   // Load runner
   auto load_error = runner->load();
   if (load_error != ::executorch::runtime::Error::Ok) {
@@ -207,6 +233,14 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
+  // Measure memory after loading
+  cudaDeviceSynchronize();
+  size_t mem_after_load = get_gpu_memory_used();
+  ET_LOG(
+      Info,
+      "GPU memory after loading: %.2f MB",
+      mem_after_load / (1024.0 * 1024.0));
+
   // Prepare inputs
   std::vector inputs = {
       make_text_input("user\n"),
@@ -230,13 +264,69 @@ int32_t main(int32_t argc, char** argv) {
     runner->reset();
   }
 
+  // Measure memory before generation
+  cudaDeviceSynchronize();
+  size_t mem_before_gen = get_gpu_memory_used();
+
   auto error = runner->generate(inputs, config);
 
   if (error != ::executorch::runtime::Error::Ok) {
     ET_LOG(Error, "Failed to generate with multimodal runner\n");
     return 1;
   }
+
+  // Measure memory after generation
+  cudaDeviceSynchronize();
+  size_t mem_after_gen = get_gpu_memory_used();
+
   ET_LOG(Info, "Generated successfully");
 
+  // Calculate and print memory usage statistics. Deltas are clamped at 0:
+  // get_gpu_memory_used() returns 0 on failure and usage can shrink between
+  // samples, either of which would wrap an unsigned subtraction.
+  const auto clamped_delta = [](size_t after, size_t before) -> size_t {
+    return after > before ? after - before : 0;
+  };
+  size_t load_memory = clamped_delta(mem_after_load, mem_before_load);
+  size_t gen_memory = clamped_delta(mem_after_gen, mem_before_gen);
+  size_t total_memory = clamped_delta(mem_after_gen, mem_before_load);
+  // Usage sampled after generation; no high-water mark is tracked in between.
+  size_t peak_memory = mem_after_gen;
+
+  std::printf("\n=== CUDA Memory Usage Statistics ===\n");
+  std::printf(
+      "Memory before loading: %.2f MB (%zu bytes)\n",
+      mem_before_load / (1024.0 * 1024.0),
+      mem_before_load);
+  std::printf(
+      "Memory after loading: %.2f MB (%zu bytes)\n",
+      mem_after_load / (1024.0 * 1024.0),
+      mem_after_load);
+  std::printf(
+      "Memory consumed by loading: %.2f MB (%zu bytes)\n",
+      load_memory / (1024.0 * 1024.0),
+      load_memory);
+  std::printf(
+      "Memory before generation: %.2f MB (%zu bytes)\n",
+      mem_before_gen / (1024.0 * 1024.0),
+      mem_before_gen);
+  std::printf(
+      "Memory after generation: %.2f MB (%zu bytes)\n",
+      mem_after_gen / (1024.0 * 1024.0),
+      mem_after_gen);
+  std::printf(
+      "Memory consumed by generation: %.2f MB (%zu bytes)\n",
+      gen_memory / (1024.0 * 1024.0),
+      gen_memory);
+  std::printf(
+      "Total memory consumed: %.2f MB (%zu bytes)\n",
+      total_memory / (1024.0 * 1024.0),
+      total_memory);
+  std::printf(
+      "Peak GPU memory used: %.2f MB (%zu bytes)\n",
+      peak_memory / (1024.0 * 1024.0),
+      peak_memory);
+  std::printf("====================================\n\n");
+
   return 0;
 }