diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 9ee72a34ef0..f59a29420d7 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -164,6 +164,61 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3"
+        optimum-cli export executorch \
+          --model "google/gemma-3-4b-it" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 64 \
+          --output_dir ./
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +259,63 @@ jobs:
           -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
           -DEXECUTORCH_BUILD_TESTS=ON \
           -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+          -DEXECUTORCH_BUILD_TESTS=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
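For local debugging, the export step above can be reproduced outside CI with a short script. This is a sketch only: it assumes optimum-executorch and a CUDA machine are available, and it simply mirrors the flags the job passes to `optimum-cli`.

```python
# Local repro of the "Export Gemma3" CI step (sketch; assumes
# optimum-executorch is installed and a CUDA device is present).
import pathlib
import subprocess

subprocess.run(
    [
        "optimum-cli", "export", "executorch",
        "--model", "google/gemma-3-4b-it",
        "--task", "multimodal-text-to-text",
        "--recipe", "cuda",
        "--dtype", "bfloat16",
        "--device", "cuda",
        "--max_seq_len", "64",
        "--output_dir", "./",
    ],
    check=True,
)

# The benchmark jobs depend on exactly these two artifacts.
for artifact in ("model.pte", "aoti_cuda_blob.ptd"):
    assert pathlib.Path(artifact).is_file(), f"missing {artifact}"
```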
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 1db8792e0c0..af36c89585e 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -105,11 +105,10 @@ target_link_libraries(
 executorch_target_link_options_shared_lib(aoti_cuda)
 
 if(BUILD_TESTING)
-  # Add runtime
-  add_executable(voxtral_runner tests/voxtral_runner.cpp)
+  add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp)
   target_link_libraries(
-    voxtral_runner PUBLIC aoti_cuda extension_module_static
-    extension_flat_tensor portable_ops_lib
+    multimodal_benchmark PUBLIC aoti_cuda extension_module_static
+    extension_flat_tensor portable_ops_lib
   )
 endif()
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index ba6da92b991..f8482835ea5 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -140,6 +140,9 @@ def preprocess(
             user_input_placeholders.append(node.meta["val"])
 
         options: dict[str, typing.Any] = {
+            # Disable loop ordering after fusion to support the SDPA decomposition
+            # TODO(gasoonjia): remove after the pin bump to the latest PyTorch
+            "loop_ordering_after_fusion": False,
             # Better model precision
             "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object
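For context, keys in this options dict map onto `torch._inductor` config knobs (`torch._inductor.config.loop_ordering_after_fusion` is a real Inductor flag). A minimal sketch of how such a dict is typically consumed, assuming the backend forwards `options` to `torch._inductor.aot_compile`; the actual wiring inside `preprocess()` is outside this hunk:

```python
# Sketch: how an AOTInductor options dict like the one above is passed
# through (assumption: the backend calls torch._inductor.aot_compile).
import torch
import torch._inductor


def compile_with_backend_options(gm: torch.fx.GraphModule, example_inputs):
    options = {
        # Same keys as in cuda_backend.py; each overrides a
        # torch._inductor.config setting for this compilation.
        "loop_ordering_after_fusion": False,
        "emulate_precision_casts": True,
    }
    # aot_compile returns the path of the compiled shared object (.so).
    return torch._inductor.aot_compile(gm, tuple(example_inputs), options=options)
```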
diff --git a/backends/cuda/tests/multimodal_benchmark.cpp b/backends/cuda/tests/multimodal_benchmark.cpp
new file mode 100644
index 00000000000..679db889b71
--- /dev/null
+++ b/backends/cuda/tests/multimodal_benchmark.cpp
@@ -0,0 +1,465 @@
+#include <algorithm>
+#include <cctype>
+#include <chrono>
+#include <cstdint>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace {
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::extension::make_tensor_ptr;
+using executorch::extension::TensorPtr;
+using executorch::extension::module::Module;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Result;
+using Clock = std::chrono::steady_clock;
+using executorch::aten::TensorShapeDynamism;
+using DurationMs = std::chrono::duration<double, std::milli>;
+
+enum class ModelType { GEMMA3, VOXTRAL, UNKNOWN };
+
+struct ModelConfig {
+  std::string name;
+  size_t token_seq_len;
+  size_t text_embed_dim;
+  std::vector<std::string> expected_methods;
+};
+
+const std::map<ModelType, ModelConfig> model_configs = {
+    {ModelType::GEMMA3,
+     {"gemma3",
+      128,
+      2304,
+      {"vision_encoder", "token_embedding", "text_decoder"}}},
+    {ModelType::VOXTRAL,
+     {"voxtral",
+      1138,
+      3072,
+      {"audio_encoder", "token_embedding", "text_decoder"}}}};
+
+ModelType parse_model_type(const std::string& model_name) {
+  std::string lower_name = model_name;
+  std::transform(
+      lower_name.begin(),
+      lower_name.end(),
+      lower_name.begin(),
+      [](unsigned char c) { return std::tolower(c); });
+
+  if (lower_name.find("gemma3") != std::string::npos) {
+    return ModelType::GEMMA3;
+  } else if (lower_name.find("voxtral") != std::string::npos) {
+    return ModelType::VOXTRAL;
+  }
+  return ModelType::UNKNOWN;
+}
+
+std::vector<executorch::aten::SizesType> to_sizes(
+    std::initializer_list<int64_t> dims) {
+  return std::vector<executorch::aten::SizesType>(dims.begin(), dims.end());
+}
+
+std::string format_shape(const Tensor& tensor) {
+  std::ostringstream oss;
+  oss << "[";
+  const auto& sizes = tensor.sizes();
+  for (size_t i = 0; i < sizes.size(); ++i) {
+    if (i > 0) {
+      oss << ", ";
+    }
+    oss << sizes[i];
+  }
+  oss << "]";
+  return oss.str();
+}
+
+void print_tensor_summary(const std::string& label, const Tensor& tensor) {
+  std::cout << "  " << label
+            << ": dtype=" << executorch::runtime::toString(tensor.scalar_type())
+            << ", shape=" << format_shape(tensor)
+            << ", numel=" << tensor.numel() << std::endl;
+}
+
+void dump_tensor_to_file(const std::string& filename, const Tensor& tensor) {
+  std::ofstream file(filename, std::ios::binary);
+  if (!file.is_open()) {
+    std::cerr << "Failed to open file for writing: " << filename << std::endl;
+    return;
+  }
+
+  int32_t dtype = static_cast<int32_t>(tensor.scalar_type());
+  file.write(reinterpret_cast<const char*>(&dtype), sizeof(int32_t));
+
+  int32_t ndim = static_cast<int32_t>(tensor.sizes().size());
+  file.write(reinterpret_cast<const char*>(&ndim), sizeof(int32_t));
+
+  for (size_t i = 0; i < tensor.sizes().size(); ++i) {
+    int64_t dim_size = tensor.sizes()[i];
+    file.write(reinterpret_cast<const char*>(&dim_size), sizeof(int64_t));
+  }
+
+  const void* data_ptr = tensor.const_data_ptr();
+  size_t element_size = 0;
+
+  switch (tensor.scalar_type()) {
+    case ScalarType::Float:
+      element_size = sizeof(float);
+      break;
+    case ScalarType::BFloat16:
+      element_size = 2;
+      break;
+    case ScalarType::Half:
+      element_size = 2;
+      break;
+    case ScalarType::Long:
+      element_size = sizeof(int64_t);
+      break;
+    case ScalarType::Int:
+      element_size = sizeof(int32_t);
+      break;
+    default:
+      std::cerr << "Unsupported dtype for dumping: "
+                << executorch::runtime::toString(tensor.scalar_type())
+                << std::endl;
+      return;
+  }
+
+  size_t data_size = tensor.numel() * element_size;
+  file.write(reinterpret_cast<const char*>(data_ptr), data_size);
+  file.close();
+
+  std::cout << "Dumped tensor to: " << filename << std::endl;
+}
+
+TensorPtr create_vision_input() {
+  const auto sizes = to_sizes({1, 3, 896, 896});
+  const size_t numel = 1ull * 3ull * 896ull * 896ull;
+  std::vector<float> data(numel);
+  for (size_t i = 0; i < numel; ++i) {
+    data[i] = static_cast<float>((i % 255) / 255.0);
+  }
+  return make_tensor_ptr(
+      sizes,
+      std::move(data),
+      {},
+      {},
+      ScalarType::BFloat16,
+      TensorShapeDynamism::DYNAMIC_UNBOUND);
+}
+
+TensorPtr create_audio_input() {
+  const auto sizes = to_sizes({3, 128, 3000});
+  const size_t numel = 3ull * 128ull * 3000ull;
+  std::vector<float> data(numel, 0.5f);
+  return make_tensor_ptr(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+TensorPtr create_token_ids_input(const ModelConfig& config) {
+  const auto sizes = to_sizes({1, static_cast<int64_t>(config.token_seq_len)});
+  std::vector<int64_t> data(config.token_seq_len);
+  for (size_t i = 0; i < config.token_seq_len; ++i) {
+    data[i] = static_cast<int64_t>(i + 1);
+  }
+  return make_tensor_ptr(sizes, std::move(data));
+}
+
+TensorPtr create_positions_input(const ModelConfig& config) {
+  const auto sizes = to_sizes({static_cast<int64_t>(config.token_seq_len)});
+  std::vector<int64_t> data(config.token_seq_len);
+  for (size_t i = 0; i < config.token_seq_len; ++i) {
+    data[i] = static_cast<int64_t>(i);
+  }
+  return make_tensor_ptr(sizes, std::move(data));
+}
+
+TensorPtr create_fallback_text_embedding(const ModelConfig& config) {
+  const auto sizes = to_sizes(
+      {1,
+       static_cast<int64_t>(config.token_seq_len),
+       static_cast<int64_t>(config.text_embed_dim)});
+  const size_t numel = 1ull * config.token_seq_len * config.text_embed_dim;
+  std::vector<float> data(numel, 0.0f);
+  return make_tensor_ptr(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+struct MethodTiming {
+  double load_ms{0.0};
+  double run_ms{0.0};
+};
+
+enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN };
+
+MethodCategory categorize_method(const std::string& method_name) {
+  std::string lower_name = method_name;
+  std::transform(
+      lower_name.begin(),
+      lower_name.end(),
+      lower_name.begin(),
+      [](unsigned char c) { return std::tolower(c); });
+
+  if (lower_name.find("vision") != std::string::npos ||
+      lower_name.find("audio") != std::string::npos ||
+      lower_name.find("encoder") != std::string::npos) {
+    return MethodCategory::ENCODER;
+  } else if (
+      lower_name.find("token") != std::string::npos &&
+      lower_name.find("embedding") != std::string::npos) {
+    return MethodCategory::TOKEN_EMBEDDING;
+  } else if (
+      lower_name.find("text") != std::string::npos &&
+      lower_name.find("decoder") != std::string::npos) {
+    return MethodCategory::TEXT_DECODER;
+  }
+  return MethodCategory::UNKNOWN;
+}
+
+std::vector<EValue> create_inputs_for_method(
+    const std::string& method_name,
+    MethodCategory category,
+    ModelType model_type,
+    const ModelConfig& config,
+    const EValue* token_output,
+    std::vector<TensorPtr>& owned_inputs) {
+  std::vector<EValue> inputs;
+
+  switch (category) {
+    case MethodCategory::ENCODER: {
+      if (method_name.find("vision") != std::string::npos) {
+        auto input = create_vision_input();
+        owned_inputs.emplace_back(input);
+        inputs.emplace_back(*input);
+      } else if (method_name.find("audio") != std::string::npos) {
+        auto input = create_audio_input();
+        owned_inputs.emplace_back(input);
+        inputs.emplace_back(*input);
+      }
+      break;
+    }
+
+    case MethodCategory::TOKEN_EMBEDDING: {
+      auto token_ids = create_token_ids_input(config);
+      owned_inputs.emplace_back(token_ids);
+      inputs.emplace_back(*token_ids);
+      break;
+    }
+
+    case MethodCategory::TEXT_DECODER: {
+      if (token_output && token_output->isTensor()) {
+        inputs.emplace_back(*token_output);
+      } else {
+        auto fallback_embedding = create_fallback_text_embedding(config);
+        owned_inputs.emplace_back(fallback_embedding);
+        inputs.emplace_back(*fallback_embedding);
+      }
+
+      auto positions = create_positions_input(config);
+      owned_inputs.emplace_back(positions);
+      inputs.emplace_back(*positions);
+      break;
+    }
+
+    default:
+      break;
+  }
+
+  return inputs;
+}
+
+Error execute_method(
+    Module& module,
+    const std::string& method_name,
+    MethodCategory category,
+    ModelType model_type,
+    const ModelConfig& config,
+    const EValue* token_output,
+    MethodTiming& timing,
+    EValue* output_storage = nullptr) {
+  ET_LOG(Info, "Loading %s...", method_name.c_str());
+
+  const auto load_start = Clock::now();
+  const Error load_err = module.load_method(method_name);
+  const auto load_end = Clock::now();
+  if (load_err != Error::Ok) {
+    std::cerr << "Failed to load method " << method_name << ": error code "
+              << static_cast<int>(load_err) << std::endl;
+    return load_err;
+  }
+  timing.load_ms = DurationMs(load_end - load_start).count();
+
+  std::vector<TensorPtr> owned_inputs;
+  std::vector<EValue> inputs = create_inputs_for_method(
+      method_name, category, model_type, config, token_output, owned_inputs);
+
+  const auto run_start = Clock::now();
+  ET_LOG(Info, "%s running", method_name.c_str());
+  Result<std::vector<EValue>> output_result =
+      module.execute(method_name, inputs);
+  ET_LOG(Info, "%s done", method_name.c_str());
+  const auto run_end = Clock::now();
+  timing.run_ms = DurationMs(run_end - run_start).count();
+
+  if (output_result.error() != Error::Ok) {
+    std::cerr << method_name << " execution failed: error code "
+              << static_cast<int>(output_result.error()) << std::endl;
+    return output_result.error();
+  }
+
+  const auto& outputs = output_result.get();
+  if (!outputs.empty() && outputs[0].isTensor()) {
+    print_tensor_summary(method_name + " output", outputs[0].toTensor());
+
+    if (category == MethodCategory::ENCODER ||
+        category == MethodCategory::TOKEN_EMBEDDING) {
+      dump_tensor_to_file(method_name + "_output.bin", outputs[0].toTensor());
+    }
+
+    if (output_storage) {
+      *output_storage = outputs[0];
+    }
+  }
+
+  return Error::Ok;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  if (argc != 4) {
+    std::cerr << "Usage: " << argv[0]
+              << " <model_name> <model.pte> <aoti_cuda_blob.ptd>" << std::endl;
+    std::cerr << "  model_name: gemma3 or voxtral" << std::endl;
+    return 1;
+  }
+
+  const std::string model_name = argv[1];
+  const std::string program_path = argv[2];
+  const std::string data_map_path = argv[3];
+
+  const ModelType model_type = parse_model_type(model_name);
+  if (model_type == ModelType::UNKNOWN) {
+    std::cerr << "Unknown model type: " << model_name << std::endl;
+    std::cerr << "Supported models: gemma3, voxtral" << std::endl;
+    return 1;
+  }
+
+  const ModelConfig& config = model_configs.at(model_type);
+  std::cout << "Running benchmark for model: " << config.name << std::endl;
+
+  try {
+    Module module(program_path, data_map_path);
+
+    const auto program_load_start = Clock::now();
+    const Error program_load_error = module.load();
+    const auto program_load_end = Clock::now();
+    if (program_load_error != Error::Ok) {
+      std::cerr << "Failed to load ExecuTorch program: error code "
+                << static_cast<int>(program_load_error) << std::endl;
+      return 1;
+    }
+    const DurationMs program_load_latency =
+        program_load_end - program_load_start;
+
+    auto method_names_result = module.method_names();
+    if (method_names_result.error() != Error::Ok) {
+      std::cerr << "Failed to get method names: error code "
+                << static_cast<int>(method_names_result.error()) << std::endl;
+      return 1;
+    }
+
+    const auto& available_methods = method_names_result.get();
+
+    std::cout << "Checking for expected methods..." << std::endl;
+    std::vector<std::string> missing_methods;
+    for (const auto& expected : config.expected_methods) {
+      if (available_methods.find(expected) == available_methods.end()) {
+        missing_methods.push_back(expected);
+      } else {
+        std::cout << "  ✓ " << expected << std::endl;
+      }
+    }
+
+    if (!missing_methods.empty()) {
+      std::cerr << "\nError: Missing expected methods:" << std::endl;
+      for (const auto& missing : missing_methods) {
+        std::cerr << "  ✗ " << missing << std::endl;
+      }
+      return 1;
+    }
+
+    std::map<std::string, MethodTiming> timings;
+    EValue token_output;
+    bool token_executed = false;
+
+    for (const auto& method_name : config.expected_methods) {
+      MethodCategory category = categorize_method(method_name);
+      MethodTiming timing;
+
+      const EValue* input_token_ptr =
+          (category == MethodCategory::TEXT_DECODER && token_executed)
+          ? &token_output
+          : nullptr;
+
+      EValue* output_storage = (category == MethodCategory::TOKEN_EMBEDDING)
+          ? &token_output
+          : nullptr;
+
+      Error err = execute_method(
+          module,
+          method_name,
+          category,
+          model_type,
+          config,
+          input_token_ptr,
+          timing,
+          output_storage);
+
+      if (err != Error::Ok) {
+        return 1;
+      }
+
+      if (category == MethodCategory::TOKEN_EMBEDDING) {
+        token_executed = true;
+      }
+
+      timings[method_name] = timing;
+    }
+
+    std::cout << std::fixed << std::setprecision(3);
+    std::cout << "\n=== Benchmark Results ===" << std::endl;
+    std::cout << "Program load latency (ms): " << program_load_latency.count()
+              << std::endl;
+
+    std::cout << "\nMethod load latency (ms):" << std::endl;
+    for (const auto& [name, timing] : timings) {
+      std::cout << "  " << name << ": " << timing.load_ms << std::endl;
+    }
+
+    std::cout << "\nRun latency (ms):" << std::endl;
+    for (const auto& [name, timing] : timings) {
+      std::cout << "  " << name << ": " << timing.run_ms << std::endl;
+    }
+
+    return 0;
+  } catch (const std::exception& ex) {
+    std::cerr << "Unhandled exception: " << ex.what() << std::endl;
+    return 1;
+  }
+}
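The binary format written by `dump_tensor_to_file()` above is an int32 dtype code, an int32 ndim, ndim native-endian int64 dims, then the raw tensor bytes. A reader sketch for offline comparison of encoder/embedding outputs, assuming ExecuTorch's ScalarType codes match PyTorch's (Int=3, Long=4, Half=5, Float=6, BFloat16=15):

```python
# Reader for the dump_tensor_to_file() format above (sketch; the dtype-code
# mapping is an assumption based on PyTorch's ScalarType enum).
import struct

import numpy as np

DTYPE_INFO = {
    3: ("int32", 4),
    4: ("int64", 8),
    5: ("float16", 2),
    6: ("float32", 4),
    15: ("bfloat16", 2),  # no native numpy dtype; handled below
}


def load_dumped_tensor(path: str) -> np.ndarray:
    with open(path, "rb") as f:
        dtype_code, ndim = struct.unpack("<ii", f.read(8))
        dims = struct.unpack(f"<{ndim}q", f.read(8 * ndim))
        name, itemsize = DTYPE_INFO[dtype_code]
        raw = f.read(itemsize * int(np.prod(dims)))
    if name == "bfloat16":
        # bf16 is the high half of an f32: shift each word up and reinterpret.
        words = np.frombuffer(raw, dtype=np.uint16).astype(np.uint32) << 16
        return words.view(np.float32).reshape(dims)
    return np.frombuffer(raw, dtype=name).reshape(dims)


# e.g. emb = load_dumped_tensor("token_embedding_output.bin")
```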
diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp
deleted file mode 100644
index feed458e1f5..00000000000
--- a/backends/cuda/tests/voxtral_runner.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor.h>
-#include <executorch/runtime/core/error.h>
-#include <executorch/runtime/core/evalue.h>
-#include <executorch/runtime/core/result.h>
-#include <executorch/runtime/platform/log.h>
-
-namespace {
-
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::extension::make_tensor_ptr;
-using executorch::extension::TensorPtr;
-using executorch::extension::module::Module;
-using executorch::runtime::Error;
-using executorch::runtime::EValue;
-using executorch::runtime::Result;
-using Clock = std::chrono::steady_clock;
-using DurationMs = std::chrono::duration<double, std::milli>;
-
-std::vector<executorch::aten::SizesType> to_sizes(
-    std::initializer_list<int64_t> dims) {
-  return std::vector<executorch::aten::SizesType>(dims.begin(), dims.end());
-}
-
-std::string format_shape(const Tensor& tensor) {
-  std::ostringstream oss;
-  oss << "[";
-  const auto& sizes = tensor.sizes();
-  for (size_t i = 0; i < sizes.size(); ++i) {
-    if (i > 0) {
-      oss << ", ";
-    }
-    oss << sizes[i];
-  }
-  oss << "]";
-  return oss.str();
-}
-
-void print_tensor_summary(const std::string& label, const Tensor& tensor) {
-  std::cout << "  " << label
-            << ": dtype=" << executorch::runtime::toString(tensor.scalar_type())
-            << ", shape=" << format_shape(tensor)
-            << ", numel=" << tensor.numel() << std::endl;
-}
-
-TensorPtr create_audio_input() {
-  const auto sizes = to_sizes({3, 128, 3000});
-  const size_t numel = 3ull * 128ull * 3000ull;
-  std::vector<float> data(numel, 0.5f);
-  return make_tensor_ptr(
-      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
-}
-
-TensorPtr create_token_ids_input() {
-  const auto sizes = to_sizes({1, 1138});
-  std::vector<int64_t> data(static_cast<size_t>(1) * 1138, 0);
-  return make_tensor_ptr(sizes, std::move(data));
-}
-
-TensorPtr create_positions_input() {
-  const auto sizes = to_sizes({1138});
-  std::vector<int64_t> data(static_cast<size_t>(1138), 0);
-  return make_tensor_ptr(sizes, std::move(data));
-}
-
-TensorPtr create_fallback_text_embedding() {
-  const auto sizes = to_sizes({1, 1138, 3072});
-  const size_t numel = 1ull * 1138ull * 3072ull;
-  std::vector<float> data(numel, 0.0f);
-  return make_tensor_ptr(
-      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
-}
-
-struct MethodTiming {
-  double load_ms{0.0};
-  double run_ms{0.0};
-};
-
-} // namespace
-
-int main(int argc, char** argv) {
-  if (argc != 3) {
-    std::cerr << "Usage: " << argv[0]
-              << " <model.pte> <aoti_cuda_blob.ptd>" << std::endl;
-    return 1;
-  }
-
-  const std::string program_path = argv[1];
-  const std::string data_map_path = argv[2];
-
-  try {
-    Module module(program_path, data_map_path);
-
-    const auto program_load_start = Clock::now();
-    const Error program_load_error = module.load();
-    const auto program_load_end = Clock::now();
-    if (program_load_error != Error::Ok) {
-      std::cerr << "Failed to load ExecuTorch program: error code "
-                << static_cast<int>(program_load_error) << std::endl;
-      return 1;
-    }
-    const DurationMs program_load_latency =
-        program_load_end - program_load_start;
-
-    MethodTiming audio_timing;
-    MethodTiming token_timing;
-    MethodTiming text_timing;
-
-    auto measure_method_load =
-        [&](const std::string& name) -> std::pair<Error, double> {
-      const auto start = Clock::now();
-      const Error err = module.load_method(name);
-      const auto end = Clock::now();
-      return {err, DurationMs(end - start).count()};
-    };
-
-    // audio_encoder
-    {
-      const auto [err, load_ms] = measure_method_load("audio_encoder");
-      if (err != Error::Ok) {
-        std::cerr << "Failed to load method audio_encoder: error code "
-                  << static_cast<int>(err) << std::endl;
-        return 1;
-      }
-      audio_timing.load_ms = load_ms;
-
-      const TensorPtr audio_input = create_audio_input();
-      std::vector<EValue> inputs;
-      std::vector<TensorPtr> owned_inputs;
-      owned_inputs.emplace_back(audio_input);
-      inputs.emplace_back(*audio_input);
-
-      const auto run_start = Clock::now();
-      Result<std::vector<EValue>> output_result =
-          module.execute("audio_encoder", inputs);
-      const auto run_end = Clock::now();
-      audio_timing.run_ms = DurationMs(run_end - run_start).count();
-
-      if (output_result.error() != Error::Ok) {
-        std::cerr << "audio_encoder execution failed: error code "
-                  << static_cast<int>(output_result.error()) << std::endl;
-        return 1;
-      }
-
-      const auto& outputs = output_result.get();
-      if (!outputs.empty() && outputs[0].isTensor()) {
-        print_tensor_summary("audio_encoder output", outputs[0].toTensor());
-      }
-    }
-
-    EValue token_output;
-    bool token_executed = false;
-
-    // token_embedding
-    {
-      const auto [err, load_ms] = measure_method_load("token_embedding");
-      if (err != Error::Ok) {
-        std::cerr << "Failed to load method token_embedding: error code "
-                  << static_cast<int>(err) << std::endl;
-        return 1;
-      }
-      token_timing.load_ms = load_ms;
-
-      const TensorPtr token_ids = create_token_ids_input();
-      std::vector<EValue> inputs;
-      std::vector<TensorPtr> owned_inputs;
-      owned_inputs.emplace_back(token_ids);
-      inputs.emplace_back(*token_ids);
-
-      const auto run_start = Clock::now();
-      auto token_output_result = module.execute("token_embedding", inputs);
-      const auto run_end = Clock::now();
-      token_timing.run_ms = DurationMs(run_end - run_start).count();
-
-      if (token_output_result.error() != Error::Ok) {
-        std::cerr << "token_embedding execution failed: error code "
-                  << static_cast<int>(token_output_result.error()) << std::endl;
-        return 1;
-      }
-
-      token_executed = true;
-      const auto& outputs = token_output_result.get();
-      if (!outputs.empty() && outputs[0].isTensor()) {
-        print_tensor_summary("token_embedding output", outputs[0].toTensor());
-        token_output = outputs[0];
-      }
-    }
-
-    // text_decoder
-    {
-      const auto [err, load_ms] = measure_method_load("text_decoder");
-      if (err != Error::Ok) {
-        std::cerr << "Failed to load method text_decoder: error code "
-                  << static_cast<int>(err) << std::endl;
-        return 1;
-      }
-      text_timing.load_ms = load_ms;
-
-      std::vector<EValue> inputs;
-      std::vector<TensorPtr> owned_inputs;
-      if (token_executed) {
-        if (token_output.isTensor()) {
-          inputs.emplace_back(token_output);
-        }
-      }
-
-      if (inputs.empty()) {
-        auto fallback_embedding = create_fallback_text_embedding();
-        owned_inputs.emplace_back(fallback_embedding);
-        inputs.emplace_back(*fallback_embedding);
-      }
-
-      auto positions = create_positions_input();
-      owned_inputs.emplace_back(positions);
-      inputs.emplace_back(*positions);
-
-      const auto run_start = Clock::now();
-      Result<std::vector<EValue>> output_result =
-          module.execute("text_decoder", inputs);
-      const auto run_end = Clock::now();
-      text_timing.run_ms = DurationMs(run_end - run_start).count();
-
-      if (output_result.error() != Error::Ok) {
-        std::cerr << "text_decoder execution failed: error code "
-                  << static_cast<int>(output_result.error()) << std::endl;
-        return 1;
-      }
-
-      const auto& outputs = output_result.get();
-      if (!outputs.empty() && outputs[0].isTensor()) {
-        print_tensor_summary("text_decoder output", outputs[0].toTensor());
-      }
-    }
-
-    std::cout << std::fixed << std::setprecision(3);
-    std::cout << "Program load latency (ms): " << program_load_latency.count()
-              << std::endl;
-
-    std::cout << "Method load latency (ms):" << std::endl;
-    std::cout << "  audio_encoder: " << audio_timing.load_ms << std::endl;
-    std::cout << "  token_embedding: " << token_timing.load_ms << std::endl;
-    std::cout << "  text_decoder: " << text_timing.load_ms << std::endl;
-
-    std::cout << "Run latency (ms):" << std::endl;
-    std::cout << "  audio_encoder: " << audio_timing.run_ms << std::endl;
-    std::cout << "  token_embedding: " << token_timing.run_ms << std::endl;
-    std::cout << "  text_decoder: " << text_timing.run_ms << std::endl;
-
-    return 0;
-  } catch (const std::exception& ex) {
-    std::cerr << "Unhandled exception: " << ex.what() << std::endl;
-    return 1;
-  }
-}
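Since both CI jobs only check the binary's exit status, a small scraper for its latency report can be useful when tracking regressions. A sketch (the binary path and the "Method load latency (ms):" / "Run latency (ms):" output format follow the runner above; the helper name is illustrative):

```python
# CI-side sketch: run multimodal_benchmark and parse its latency report.
import re
import subprocess


def run_benchmark(model_name: str, pte: str, ptd: str) -> dict:
    """Return {'load:<method>' / 'run:<method>': milliseconds}."""
    out = subprocess.run(
        ["cmake-out/backends/cuda/multimodal_benchmark", model_name, pte, ptd],
        capture_output=True, text=True, check=True,
    ).stdout
    latencies, section = {}, None
    for line in out.splitlines():
        if line.startswith("Method load latency"):
            section = "load"
        elif line.startswith("Run latency"):
            section = "run"
        elif section and (m := re.match(r"\s+(\w+): ([\d.]+)$", line)):
            latencies[f"{section}:{m.group(1)}"] = float(m.group(2))
    return latencies


# e.g. run_benchmark("gemma3", "model.pte", "aoti_cuda_blob.ptd")
```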