Skip to content

Commit 7508ccb

Browse files
larryliu0820 and facebook-github-bot
authored and committed
Add a generic multimodal runner
Summary: This diff adds a generic multimodal runner for Executorch. It includes changes to the `image_prefiller.h` file, which adds a `prefill` method that takes an `Image` object and returns the next token of the LLM module after prefill. It also includes changes to the `multimodal_runner.cpp` file, which implements the `MultimodalRunner` class for multimodal input and text output LLMs. The `MultimodalRunner` class uses the `ImagePrefiller`, `TextPrefiller` classes to prefill the KV cache of the model, then uses `TextTokenGenerator` to run the autoregressive generation loop. See diagram: ``` ┌─────────────────┐ │ IRunner │ │ <<interface>> │ │ │ │ + is_loaded() │ │ + load() │ │ + generate() │ │ + stop() │ └─────────────────┘ △ │ │ implements │ │ │ │ ┌──────┴──────────┐ ┌─────────────────┐ │ TextLLMRunner │ │MultimodalRunner │ │ │ │ │ │ - tokenizer_ │ │ - tokenizer_ ┼───────┐ ┌─────┼ - module_ │ │ - module_ ┼─────┐ │ │ ┌───┼ - stats_ │ │ - stats_ ┼───┐ │ │ │ │ ┌─┼ - metadata_ │ │ - metadata_ ┼─┐ │ │ │ │ │ │ │ - temperature_ │ │ - pos_ │ │ │ │ │ │ │ │ └─────────────────┘ └─────────────────┘ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ ┌─────────────────┐ │ │ │ │ │ │ │ │TextTokenGenerat-│ │ │ │ │ │ │ │ │or │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ - tokenizer_* │ │ │ │ │ │ │ │ consists │ - text_decoder_ │ consists │ │ │ │ │ │ └──────────────►│ runner_ │◄───────────────┘ │ │ │ │ │ │ - eos_ids_ │ │ │ │ │ │ │ - use_kv_cache_ │ │ │ │ │ │ │ - stats_* │ │ │ │ │ │ │ │ │ │ │ │ │consists │ + generate() │ consists │ │ │ │ │ └────────┬────────┘ │ │ │ │ │ ┌──────────────┴───────────────┐ │ │ │ │ │ ▼ uses ▼ │ │ │ │ │ ┌─────────────────┐ ┌─────────────────┐ │ │ │ │ │ │TextDecoderRunner│ │MultimodalTextDe-│ │ │ │ │ │ │ │ │coderRunner │ │ │ │ │ │ │ - module_* │ extends │ - module_* │ │ │ │ │ └──►│ - should_stop_ │◄─────────┼ - should_stop_ │◄──┘ │ │ │ │ │ │ │ │ │ │ │ + step() │ │ + step() │ │ │ │ │ + logits_to_ │ │ + logits_to_ │ │ │ │ │ token() │ │ token() │ │ │ │ 
└─────────────────┘ └─────────────────┘ │ │ │ ▲ ▲ │ │ │ │ uses │ │ │ │ └──────────────┬──────────────┘ │ │ │ ┌───────┴─────────┐ │ │ │ │ TextPrefiller │ │ │ │ │ │ │ │ │ │ - text_decoder_ │ │ │ │ consists │ runner_ │ consists │ │ └───────────────────►│ - use_kv_cache_ │◄──────────────────┘ │ │ - enable_ │ │ │ parallel_ │ │ │ prefill_ │ │ │ │ │ │ + prefill() │ │ └─────────────────┘ consists │ │ │ ┌─────────────────┐ │ │ ImagePrefiller │ │ │ │ │ │ - module_* │ │ │ │◄──────┘ │ + prefill() │ │ + logits_to_ │ │ token() │ └─────────────────┘ ``` Differential Revision: D79231625
1 parent 72ef7b1 commit 7508ccb

File tree

10 files changed

+1377
-88
lines changed

10 files changed

+1377
-88
lines changed

extension/llm/runner/README.md

Lines changed: 541 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Given an image tensor, prefill the KV cache of LLaVA.
10+
11+
#include <executorch/extension/llm/runner/constants.h>
12+
#include <executorch/extension/llm/runner/image_prefiller.h>
13+
#include <executorch/extension/tensor/tensor.h>
14+
15+
namespace executorch::extension::llm {
16+
/**
 * Prefill an LLM Module with the given image input.
 *
 * Runs the image encoder method on the raw image bytes, then feeds the
 * resulting embeddings (together with the current KV-cache position) into
 * the text-model method, and finally samples the next token from the
 * returned logits.
 *
 * @param image The image input to LLaVa; its data is wrapped as a
 *        {3, height, width} Byte tensor (assumes 3 channels — TODO confirm).
 * @param start_pos The starting position in the KV cache of the input in the
 *        LLM. Passed by reference and advanced past the image embeddings
 *        before returning.
 * @return The next token sampled from the prefill logits (not the raw
 *         logits tensor).
 */
::executorch::runtime::Result<uint64_t> ImagePrefiller::prefill(
    ::executorch::extension::llm::Image& image,
    int64_t& start_pos) {
  // Zero-copy view over the caller-owned image buffer; `image` must outlive
  // this call.
  auto image_tensor = executorch::extension::from_blob(
      image.data.data(),
      {3, image.height, image.width},
      ::executorch::aten::ScalarType::Byte);
  // Run image encoder
  auto image_encoder_outputs =
      ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));

  // inputs:[start_pos, embeds]
  // NOTE: this tensor aliases `start_pos` itself, so it must be created
  // before the execute() call below and before start_pos is mutated.
  auto start_pos_tensor = executorch::extension::from_blob(
      &start_pos, {1}, ::executorch::aten::ScalarType::Long);

  // Run text model
  auto outputs_res = ET_UNWRAP(module_->execute(
      kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]}));
  ET_CHECK_MSG(
      outputs_res[0].isTensor(),
      "Non Tensor Output returned from executing image prefill");

  // Advance the caller's KV-cache position by the image-embedding sequence
  // length (size(1) — presumably [batch, seq_len, dim]; verify against the
  // encoder's output layout).
  start_pos += image_encoder_outputs[0].toTensor().size(1);

  return logits_to_token(outputs_res[0].toTensor());
}
50+
51+
/**
52+
* Load the Module for image prefill purpose.
53+
* @return The error code.
54+
*/
55+
::executorch::runtime::Error ImagePrefiller::load() {
56+
if (is_method_loaded()) {
57+
return ::executorch::runtime::Error::Ok;
58+
}
59+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
60+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
61+
return ::executorch::runtime::Error::Ok;
62+
}
63+
64+
/**
 * Check if the required methods in the Module are loaded.
 *
 * NOTE: this check is fatal on malformed models — it aborts (via
 * ET_CHECK_MSG) if the method names cannot be read, or if either required
 * method is absent from the model, rather than returning false. False is
 * returned only when both methods exist but are not yet loaded.
 *
 * @return True if both kImageEncoderMethod and kTextModelMethod are loaded,
 *         false otherwise.
 */
bool ImagePrefiller::is_method_loaded() {
  ::executorch::runtime::Result<std::unordered_set<std::string>> methods_res =
      module_->method_names();
  if (methods_res.error() != ::executorch::runtime::Error::Ok) {
    // Fatal: cannot enumerate methods at all.
    ET_CHECK_MSG(false, "Failed to get method names");
  }
  std::unordered_set<std::string> methods = methods_res.get();
  bool methods_exist = methods.find(kImageEncoderMethod) != methods.end() &&
      methods.find(kTextModelMethod) != methods.end();
  if (!methods_exist) {
    // Log every method that *is* present to aid debugging the model file.
    for (const auto& method : methods) {
      ET_LOG(Error, "Method: %s", method.c_str());
    }
    // methods_exist is known false here, so this check always aborts.
    ET_CHECK_MSG(
        methods_exist,
        "Missing required methods (%s, %s) in the model",
        kImageEncoderMethod,
        kTextModelMethod);
  }
  bool methods_loaded = module_->is_method_loaded(kImageEncoderMethod) &&
      module_->is_method_loaded(kTextModelMethod);
  return methods_loaded;
}
91+
92+
} // namespace executorch::extension::llm

extension/llm/runner/image_prefiller.h

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#pragma once
1212

1313
#include <executorch/extension/llm/runner/image.h>
14+
#include <executorch/extension/llm/sampler/sampler.h>
1415
#include <executorch/extension/module/module.h>
1516
#include <executorch/runtime/platform/compiler.h>
1617

@@ -31,16 +32,52 @@ class ET_EXPERIMENTAL ImagePrefiller {
3132
* It's passed as reference and will be updated inside this function.
3233
* @return The next token of the LLM Module after prefill.
3334
*/
34-
virtual ::executorch::runtime::Result<executorch::aten::Tensor> prefill(
35+
virtual ::executorch::runtime::Result<uint64_t> prefill(
3536
Image& image,
36-
int64_t& start_pos) = 0;
37+
int64_t& start_pos);
3738

38-
virtual ::executorch::runtime::Error load() = 0;
39-
virtual bool is_method_loaded() = 0;
39+
virtual ::executorch::runtime::Error load();
40+
virtual bool is_method_loaded();
4041

4142
virtual ~ImagePrefiller() = default;
4243

4344
protected:
45+
/**
 * Sample the next token from the logits tensor.
 *
 * Supports Float, Half, and BFloat16 logits. If the tensor is rank 3 it is
 * treated as [batch, seq_length, vocab_size] and the logits of the last
 * position are sampled; otherwise the tensor is assumed to already hold the
 * last position's logits.
 *
 * @param logits_tensor The logits tensor.
 * @param temperature The temperature parameter used to control randomness in
 * sampling. Defaults to 0.0f — presumably argmax/greedy sampling; verify
 * against Sampler's semantics.
 * @return The next token.
 */
inline uint64_t logits_to_token(
    const executorch::aten::Tensor& logits_tensor,
    const float temperature = 0.0f) {
  uint64_t result = 0;
  ET_SWITCH_THREE_TYPES(
      Float,
      Half,
      BFloat16,
      logits_tensor.scalar_type(),
      unused,
      "logits_to_token",
      CTYPE,
      [&]() {
        // If the logit_tensor rank is 3, the shape is [batch, seq_length,
        // vocab_size], get the last logits, sample and return. Else the model
        // outputs the last logit, directly sample and return.
        // NOTE(review): mutable_data_ptr is used for read-only access here —
        // const_data_ptr may be the better fit; confirm Sampler's signature.
        auto* logits = logits_tensor.mutable_data_ptr<CTYPE>();
        ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1);
        if (logits_tensor.dim() == 3) {
          auto num_tokens = logits_tensor.size(1);
          // Advance to the logits of the final sequence position.
          logits += (num_tokens - 1) * vocab_size;
        }
        // @lint-ignore CLANGTIDY facebook-hte-Deprecated
        Sampler sampler(vocab_size, temperature);
        result = sampler.sample(logits);
      });
  return result;
}
80+
4481
Module* module_;
4582
};
4683

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88

99
// Implementation of helper utilities for creating and configuring LLM runners
1010

11+
#include <executorch/extension/llm/runner/image_prefiller.h>
1112
#include <executorch/extension/llm/runner/llm_runner_helper.h>
13+
#include <executorch/extension/llm/runner/multimodal_runner.h>
14+
#include <executorch/extension/llm/runner/multimodal_text_decoder_runner.h>
1215
#include <executorch/extension/llm/runner/stats.h>
1316
#include <executorch/extension/llm/runner/text_llm_runner.h>
1417
#include <executorch/extension/llm/runner/text_prefiller.h>
@@ -205,6 +208,68 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
205208
temperature);
206209
}
207210

211+
/**
 * Creates a fully wired MultimodalRunner: loads the Module from model_path,
 * reads LLM metadata and EOS ids from it, and assembles the decoder runner,
 * text/image prefillers, and token generator around a shared Module and
 * Stats instance.
 *
 * Returns nullptr (after logging) when the tokenizer is null or not loaded.
 *
 * NOTE(review): the `temperature` parameter is accepted but not forwarded to
 * any component in this function — confirm whether it should be passed to
 * the runner or sampler, or removed from the signature.
 */
std::unique_ptr<MultimodalRunner> create_multimodal_runner(
    const std::string& model_path,
    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
    std::optional<const std::string> data_path,
    float temperature) {
  // Sanity check tokenizer
  if (!tokenizer || !tokenizer->is_loaded()) {
    ET_LOG(Error, "Tokenizer is null or not loaded");
    return nullptr;
  }

  // Create the Module
  std::unique_ptr<Module> module;
  if (data_path.has_value()) {
    module = std::make_unique<Module>(
        model_path, data_path.value(), Module::LoadMode::File);
  } else {
    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
  }

  // Get metadata from Module
  ET_LOG(Info, "Reading metadata from model");
  auto metadata = get_llm_metadata(tokenizer.get(), module.get());

  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
      get_eos_ids(tokenizer.get(), module.get()));

  // Create text_decoder_runner
  auto text_decoder_runner =
      std::make_unique<MultimodalTextDecoderRunner>(module.get());

  // Create text_prefiller
  auto text_prefiller = std::make_unique<TextPrefiller>(
      text_decoder_runner.get(),
      metadata.at(kUseKVCache),
      metadata.at(kEnableDynamicShape),
      metadata.at(kMaxSeqLen));

  // Create image_prefiller
  auto image_prefiller = std::make_unique<ImagePrefiller>(module.get());

  // Create text_token_generator with stats
  // The raw stats pointer handed to the generator stays valid because the
  // owning unique_ptr is moved into the MultimodalRunner below.
  auto stats = std::make_unique<Stats>();
  auto text_token_generator = std::make_unique<TextTokenGenerator>(
      tokenizer.get(),
      text_decoder_runner.get(),
      metadata.at(kUseKVCache),
      std::move(eos_ids),
      stats.get());

  // Create and return the MultimodalRunner instance
  return std::make_unique<MultimodalRunner>(
      std::move(metadata),
      std::move(tokenizer),
      std::move(module),
      std::move(text_decoder_runner),
      std::move(text_prefiller),
      std::move(image_prefiller),
      std::move(text_token_generator),
      std::move(stats));
}
272+
208273
} // namespace llm
209274
} // namespace extension
210275
} // namespace executorch

extension/llm/runner/llm_runner_helper.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,25 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
103103
std::optional<const std::string> data_path = std::nullopt,
104104
float temperature = -1.0f);
105105

106+
/**
 * @brief Creates a MultimodalRunner instance with dependency injection
 *
 * This factory function creates and initializes a MultimodalRunner with all
 * necessary components for multimodal text generation.
 *
 * @param model_path Path to the model file
 * @param tokenizer Initialized tokenizer instance; must already be loaded,
 *        otherwise nullptr is returned
 * @param data_path Optional path to additional data required by the model
 * @param temperature Optional temperature parameter for controlling randomness
 *        in sampling (defaults to 0.8f)
 * @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
 * instance, or nullptr on failure
 */
ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
    const std::string& model_path,
    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
    std::optional<const std::string> data_path = std::nullopt,
    float temperature = 0.8f);
106125
} // namespace llm
107126
} // namespace extension
108127
} // namespace executorch

0 commit comments

Comments
 (0)