From 127ff2e6698027a7291f778db39a452f2921ae9a Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 25 Aug 2025 15:01:30 -0700 Subject: [PATCH 1/4] Add audio to multimodal runner [ghstack-poisoned] --- extension/llm/runner/constants.h | 5 +- extension/llm/runner/multimodal_input.h | 152 +++++++++++++++++- extension/llm/runner/multimodal_prefiller.cpp | 55 +++++-- 3 files changed, 193 insertions(+), 19 deletions(-) diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h index fc6ddcb451c..b26f319b5ec 100644 --- a/extension/llm/runner/constants.h +++ b/extension/llm/runner/constants.h @@ -21,7 +21,8 @@ inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; // Multimodal method name conventions inline constexpr auto kImageEncoderMethod = "image_encoder"; -inline constexpr auto kTokenEmbeddingMethod = "token_embedding"; -inline constexpr auto kTextModelMethod = "text_model"; +inline constexpr auto kAudioEncoderMethod = "audio_encoder"; +inline constexpr auto kTokenEmbeddingMethod = "token_embeddings"; +inline constexpr auto kTextModelMethod = "decoder"; } // namespace executorch::extension::llm diff --git a/extension/llm/runner/multimodal_input.h b/extension/llm/runner/multimodal_input.h index ae243992fec..93bbd2a79ce 100644 --- a/extension/llm/runner/multimodal_input.h +++ b/extension/llm/runner/multimodal_input.h @@ -11,6 +11,7 @@ #pragma once +#include #include #include #include @@ -19,19 +20,24 @@ namespace executorch::extension::llm { /** - * A generic class to hold either image or text data for multimodal inputs. - * This allows the generate() API to take a std::vector of these objects - * instead of separate image and text parameters. + * A generic class to hold either image, text, or audio data for multimodal + * inputs. This allows the generate() API to take a std::vector of these objects + * instead of separate image, text, and audio parameters. */ class ET_EXPERIMENTAL MultimodalInput { public: - enum class Type { TEXT, IMAGE }; + enum class Type { TEXT, IMAGE, AUDIO, RAW_AUDIO }; // Constructors explicit MultimodalInput(const std::string& text) : data_(text) {} explicit MultimodalInput(std::string&& text) : data_(std::move(text)) {} explicit MultimodalInput(const Image& image) : data_(image) {} explicit MultimodalInput(Image&& image) : data_(std::move(image)) {} + explicit MultimodalInput(const Audio& audio) : data_(audio) {} + explicit MultimodalInput(Audio&& audio) : data_(std::move(audio)) {} + explicit MultimodalInput(const RawAudio& raw_audio) : data_(raw_audio) {} + explicit MultimodalInput(RawAudio&& raw_audio) + : data_(std::move(raw_audio)) {} // Copy constructor and assignment MultimodalInput(const MultimodalInput& other) = default; @@ -60,12 +66,35 @@ class ET_EXPERIMENTAL MultimodalInput { return std::holds_alternative(data_); } + /** + * Check if this input contains audio data. + * @return true if this input contains audio, false otherwise. + */ + bool is_audio() const noexcept { + return std::holds_alternative