diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index f59a29420d7..448987e8d6b 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -165,7 +165,7 @@ jobs:
         echo "::endgroup::"
 
   export-gemma3-cuda-artifact:
-    name: export-gemma3-cuda-artifact
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -173,6 +173,19 @@
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Needs further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -198,7 +211,8 @@
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export Gemma3"
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "google/gemma-3-4b-it" \
           --task "multimodal-text-to-text" \
@@ -212,7 +226,7 @@
         test -f aoti_cuda_blob.ptd
         echo "::endgroup::"
 
-        echo "::group::Store Gemma3 Artifacts"
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}/"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
@@ -407,3 +421,87 @@
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
diff --git a/examples/models/gemma3/CMakeLists.txt b/examples/models/gemma3/CMakeLists.txt
new file mode 100644
index 00000000000..0be346d70f2
--- /dev/null
+++ b/examples/models/gemma3/CMakeLists.txt
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Simple CMake build system for the gemma3 e2e runner.
+#
+cmake_minimum_required(VERSION 3.24)
+project(gemma3)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
+  set(CMAKE_TOOLCHAIN_IOS ON)
+else()
+  set(CMAKE_TOOLCHAIN_IOS OFF)
+endif()
+
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# Need this for gflags for some reason
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# Find `executorch` libraries, same as for gflags
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(link_libraries executorch gflags)
+set(_srcs e2e_runner.cpp)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+ ) +endif() + +# Needed for cpuinfo where it uses android specific log lib +if(ANDROID) + list(APPEND link_libraries log) +endif() + +# stb_image: a lightweight library to load images +include(FetchContent) +FetchContent_Declare( + stb + GIT_REPOSITORY https://github.com/nothings/stb.git + GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5 +) +FetchContent_MakeAvailable(stb) +# Add deprecated/ to use stb_image_resize.h for internal compatibility +list(APPEND _common_include_directories ${stb_SOURCE_DIR} + ${stb_SOURCE_DIR}/deprecated +) + +# Add the required ExecuTorch extensions for multimodal LLM runner +list( + APPEND + link_libraries + extension_llm_runner + extension_module + extension_data_loader + extension_tensor + extension_flat_tensor +) + +# Link CUDA backend +if(EXECUTORCH_BUILD_CUDA) + find_package(CUDAToolkit REQUIRED) + list(APPEND link_libraries aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() + +# Add tokenizers +list(APPEND link_libraries tokenizers::tokenizers) + +add_executable(gemma3_e2e_runner ${_srcs}) +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + target_link_options_gc_sections(gemma3_e2e_runner) + if(NOT APPLE) + target_link_options(gemma3_e2e_runner PRIVATE "LINKER:-s") + endif() +endif() + +target_include_directories( + gemma3_e2e_runner PUBLIC ${_common_include_directories} +) +target_link_libraries(gemma3_e2e_runner PUBLIC ${link_libraries}) +target_compile_options(gemma3_e2e_runner PUBLIC ${_common_compile_options}) diff --git a/examples/models/gemma3/e2e_runner.cpp b/examples/models/gemma3/e2e_runner.cpp new file mode 100644 index 00000000000..68f19e8296d --- /dev/null +++ b/examples/models/gemma3/e2e_runner.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define STB_IMAGE_IMPLEMENTATION +#include +#define STB_IMAGE_RESIZE_IMPLEMENTATION +#include + +#if defined(ET_USE_THREADPOOL) +#include +#include +#endif + +DEFINE_string( + model_path, + "multimodal.pte", + "Model serialized in flatbuffer format."); + +DEFINE_string(data_path, "", "Path to data file."); +DEFINE_string(tokenizer_path, "tokenizer.json", "Tokenizer stuff."); + +DEFINE_string(prompt, "What is in this image?", "Text prompt."); + +DEFINE_string(image_path, "", "Path to input image file."); + +DEFINE_double( + temperature, + 0.0f, + "Temperature; Default is 0. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); + +DEFINE_int32( + cpu_threads, + -1, + "Number of CPU threads for inference. 
+
+DEFINE_bool(warmup, false, "Whether to run a warmup run.");
+
+namespace {
+
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
+using ::executorch::extension::llm::Image;
+using ::executorch::extension::llm::make_image_input;
+using ::executorch::extension::llm::make_text_input;
+using ::executorch::extension::llm::MultimodalInput;
+using ::executorch::runtime::EValue;
+
+bool ends_with(const std::string& str, const std::string& suffix) {
+  return str.size() >= suffix.size() &&
+      str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
+/**
+ * @brief Loads an image from a file and resizes it to 896x896.
+ *
+ * This function loads an image using stb_image and resizes it to the expected
+ * input size for Gemma3 (896x896). The image is converted to CHW (Channel,
+ * Height, Width) format, which is expected by the model.
+ *
+ * @param image_path Path to the image file (.jpg, .png, etc.)
+ * @return MultimodalInput containing the loaded and processed image data
+ * @throws std::runtime_error if image loading fails
+ */
+MultimodalInput loadImage(const std::string& image_path) {
+  if (!ends_with(image_path, ".jpg") && !ends_with(image_path, ".jpeg") &&
+      !ends_with(image_path, ".png") && !ends_with(image_path, ".bmp")) {
+    ET_LOG(
+        Error,
+        "Unsupported image file format: %s (only .jpg, .jpeg, .png, .bmp are supported)",
+        image_path.c_str());
+    throw std::runtime_error("Unsupported image file format");
+  }
+
+  int width, height, channels;
+  unsigned char* data =
+      stbi_load(image_path.c_str(), &width, &height, &channels, 0);
+  if (!data) {
+    ET_LOG(Error, "Failed to load image: %s", image_path.c_str());
+    throw std::runtime_error("Failed to load image");
+  }
+
+  ET_LOG(
+      Info,
+      "Loaded image: %s, original size: %dx%d, channels: %d",
+      image_path.c_str(),
+      width,
+      height,
+      channels);
+
+  // Resize to 896x896 (Gemma3 vision encoder input size)
+  const int target_size = 896;
+  std::vector<uint8_t> resized_data(target_size * target_size * channels);
+
+  int resize_result = stbir_resize_uint8(
+      data,
+      width,
+      height,
+      0,
+      resized_data.data(),
+      target_size,
+      target_size,
+      0,
+      channels);
+
+  if (!resize_result) {
+    stbi_image_free(data);
+    ET_LOG(Error, "Failed to resize image");
+    throw std::runtime_error("Failed to resize image");
+  }
+
+  // Convert from HWC (Height, Width, Channel) to CHW (Channel, Height, Width)
+  // and normalize uint8 [0, 255] to float32 [0.0, 1.0]
+  std::vector<float> chw_data(channels * target_size * target_size);
+  for (int h = 0; h < target_size; ++h) {
+    for (int w = 0; w < target_size; ++w) {
+      for (int c = 0; c < channels; ++c) {
+        uint8_t pixel_value =
+            resized_data[h * target_size * channels + w * channels + c];
+        chw_data[c * target_size * target_size + h * target_size + w] =
+            static_cast<float>(pixel_value) / 255.0f;
+      }
+    }
+  }
+
+  ET_LOG(
+      Info,
+      "Resized and converted image to CHW format (float32): %dx%d, channels: %d",
+      target_size,
+      target_size,
+      channels);
+
+  Image image(std::move(chw_data), target_size, target_size, channels);
+  stbi_image_free(data);
+
+  return make_image_input(std::move(image));
+}
+
+} // namespace
+
+int32_t main(int32_t argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  const char* model_path = FLAGS_model_path.c_str();
+  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
+  const char* prompt = FLAGS_prompt.c_str();
+  const char* image_path = FLAGS_image_path.c_str();
+  const char* data_path = FLAGS_data_path.c_str();
+  float temperature = FLAGS_temperature;
+  int32_t cpu_threads = FLAGS_cpu_threads;
+  bool warmup = FLAGS_warmup;
+
+#if defined(ET_USE_THREADPOOL)
+  uint32_t num_performant_cores = cpu_threads == -1
+      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
+      : static_cast<uint32_t>(cpu_threads);
+  ET_LOG(
+      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
+  if (num_performant_cores > 0) {
+    ::executorch::extension::threadpool::get_threadpool()
+        ->_unsafe_reset_threadpool(num_performant_cores);
+  }
+#endif
+
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
+  if (tokenizer == nullptr) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
+    return 1;
+  }
+
+  // Create multimodal runner
+  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
+      ::executorch::extension::llm::create_multimodal_runner(
+          model_path, std::move(tokenizer), data_path);
+
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create multimodal runner");
+    return 1;
+  }
+
+  // Load runner
+  auto load_error = runner->load();
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to load multimodal runner");
+    return 1;
+  }
+
+  // Prepare inputs
+  std::vector<MultimodalInput> inputs = {
+      make_text_input("<start_of_turn>user\n"),
+      loadImage(image_path),
+      make_text_input(
+          std::string(prompt) + "<end_of_turn>\n<start_of_turn>model\n"),
+  };
+
+  ::executorch::extension::llm::GenerationConfig config;
+  config.max_new_tokens = 100;
+  config.temperature = temperature;
+
+  // Run warmup if requested
+  if (warmup) {
+    ET_LOG(Info, "Running warmup...");
+    auto warmup_error = runner->generate(inputs, config);
+    if (warmup_error != ::executorch::runtime::Error::Ok) {
+      ET_LOG(Error, "Failed to run warmup");
+      return 1;
+    }
+    runner->reset();
+  }
+
+  auto error = runner->generate(inputs, config);
+
+  if (error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to generate with multimodal runner");
+    return 1;
+  }
+  ET_LOG(Info, "Generated successfully");
+
+  return 0;
+}
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 8578187128f..ddf9c86f180 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -69,6 +69,11 @@ Result MultimodalPrefiller::prefill(
         image.is_uint8(),
         InvalidArgument,
         "Model expects uint8_t image data, but image has float data.");
+  } else if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
+    ET_CHECK_OR_RETURN_ERROR(
+        image.is_float(),
+        InvalidArgument,
+        "Model expects BFloat16 image data; the image must be provided as float32 and is converted to bfloat16 afterwards, but it has uint8_t data.");
   } else {
     ET_CHECK_OR_RETURN_ERROR(
         false,
@@ -85,6 +90,16 @@ Result MultimodalPrefiller::prefill(
   ET_CHECK_OK_OR_RETURN_ERROR(
       image_tensor_result.error(), "Failed to convert image to tensor");
   auto image_tensor = image_tensor_result.get();
+
+  if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
+    // Convert to bfloat16 for model input
+    auto image_tensor_return = convert_to_bfloat16(image_tensor);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        image_tensor_return.error(),
+        "Failed to convert image tensor to bfloat16");
+    image_tensor = image_tensor_return.get();
+  }
+
   ET_LOG(
       Info,
       "Image tensor dim: %zu, dtype: %s",
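For context on the BFloat16 branch above: the prefiller receives the float32 CHW image tensor produced by the e2e runner and narrows it to bfloat16 before prefill via `convert_to_bfloat16()`. That helper operates on ExecuTorch tensors; the sketch below is illustrative only and assumes round-to-nearest-even narrowing on a raw pixel buffer (names like `chw_float_to_bf16` are hypothetical, not part of this change).

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative only: narrow a float32 value to bfloat16 (the upper 16 bits of
// the IEEE-754 float), rounding to nearest, ties to even. NaN handling is
// omitted since normalized pixel values are finite and in [0, 1].
static uint16_t float_to_bf16(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  // Bias the low 16 bits so that truncation rounds to nearest-even.
  const uint32_t rounding_bias = 0x7FFF + ((bits >> 16) & 1);
  return static_cast<uint16_t>((bits + rounding_bias) >> 16);
}

// Hypothetical helper: convert a float32 CHW pixel buffer (as built by
// loadImage) into bfloat16 storage, mirroring what the prefiller does on
// tensors before feeding a BFloat16 vision encoder.
std::vector<uint16_t> chw_float_to_bf16(const std::vector<float>& chw_pixels) {
  std::vector<uint16_t> out(chw_pixels.size());
  for (size_t i = 0; i < chw_pixels.size(); ++i) {
    out[i] = float_to_bf16(chw_pixels[i]);
  }
  return out;
}
```

Keeping the runner-side preprocessing in float32 and converting only at the prefill boundary means the same `loadImage()` path can serve both float32 and BFloat16 vision encoders, which is the design the new error message in `multimodal_prefiller.cpp` describes.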