
Commit 2d81eeb

jackzhxng authored and larryliu0820 committed
Add a generic multimodal runner (#13166)
Summary: Pull Request resolved: #13166

This diff adds a generic multimodal runner for ExecuTorch. It includes changes to the `multimodal_prefiller.h` file, which adds a `prefill` method that takes a `MultimodalInput` object and returns the next token of the LLM module after prefill. It also includes changes to the `multimodal_runner.cpp` file, which implements the `MultimodalRunner` class for multimodal input and text output LLMs. The `MultimodalRunner` class uses the `MultimodalPrefiller` class to prefill the KV cache of the model, then uses `TextTokenGenerator` to run the autoregressive generation loop. See diagram:

```
IRunner <<interface>>: + is_loaded(), + load(), + generate(), + stop()
│
├─ implements ─ TextLLMRunner
│                 - tokenizer_, module_, stats_, metadata_, temperature_
│                 consists of: TextTokenGenerator, TextPrefiller
│
└─ implements ─ MultimodalRunner
                  - tokenizer_, module_, stats_, metadata_, pos_
                  consists of: TextTokenGenerator, MultimodalPrefiller

TextTokenGenerator
  - tokenizer_*, text_decoder_runner_, eos_ids_, use_kv_cache_, stats_*
  + generate()
  uses: TextDecoderRunner

TextDecoderRunner                     MultimodalDecoderRunner
  - module_*, should_stop_              (extends TextDecoderRunner)
  + step(), + logits_to_token()         - module_*, should_stop_
                                        + step(), + logits_to_token()

TextPrefiller                         MultimodalPrefiller
  - text_decoder_runner_                - module_*
  - use_kv_cache_                       + prefill()
  - enable_parallel_prefill_            + logits_to_token()
  + prefill()
```

Differential Revision: D79231625
1 parent 31e13b0 commit 2d81eeb
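
To make the new surface concrete, here is a minimal usage sketch based only on the APIs visible in this commit (`create_multimodal_runner`, `MultimodalRunner::generate`, `MultimodalInput`, `GenerationConfig`, `Stats`). Tokenizer loading and `MultimodalInput` construction are not shown in this diff, so the two helpers below are hypothetical placeholders:

```cpp
// Minimal usage sketch for the new multimodal runner API. Everything named
// here comes from this commit except the two helpers marked hypothetical.
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>

#include <functional>
#include <memory>
#include <string>
#include <vector>

using namespace executorch::extension::llm;

// Hypothetical helpers: tokenizer loading and MultimodalInput construction
// are not part of this diff, so these signatures are assumptions.
std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer(const std::string& path);
std::vector<MultimodalInput> build_inputs();

int main() {
  auto tokenizer = load_tokenizer("/path/to/tokenizer.model");

  // Factory added in llm_runner_helper.h; returns nullptr on failure
  // (e.g. when the tokenizer is null or not loaded).
  std::unique_ptr<MultimodalRunner> runner =
      create_multimodal_runner("/path/to/model.pte", std::move(tokenizer));
  if (!runner) {
    return 1;
  }

  // Inputs are prefilled in order, then text generation continues from there.
  std::vector<MultimodalInput> inputs = build_inputs();

  GenerationConfig config; // defaults; see resolve_max_new_tokens() usage below

  std::function<void(const std::string&)> on_token =
      [](const std::string& piece) { /* stream each decoded piece */ };
  std::function<void(const Stats&)> on_stats =
      [](const Stats& stats) { /* timing counters, see multimodal_runner.cpp */ };

  if (runner->generate(inputs, config, on_token, on_stats) !=
      executorch::runtime::Error::Ok) {
    return 1;
  }
  return 0;
}
```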

File tree

9 files changed (+884, -91 lines)

extension/llm/runner/README.md

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 63 additions & 0 deletions
```diff
@@ -8,7 +8,11 @@
 
 // Implementation of helper utilities for creating and configuring LLM runners
 
+#include <executorch/extension/llm/runner/image_prefiller.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
@@ -205,6 +209,65 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
       temperature);
 }
 
+std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path,
+    float temperature) {
+  // Sanity check tokenizer
+  if (!tokenizer || !tokenizer->is_loaded()) {
+    ET_LOG(Error, "Tokenizer is null or not loaded");
+    return nullptr;
+  }
+
+  // Create the Module
+  std::unique_ptr<Module> module;
+  if (data_path.has_value()) {
+    module = std::make_unique<Module>(
+        model_path, data_path.value(), Module::LoadMode::File);
+  } else {
+    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+  }
+
+  // Get metadata from Module
+  ET_LOG(Info, "Reading metadata from model");
+  auto metadata = get_llm_metadata(tokenizer.get(), module.get());
+
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
+      get_eos_ids(tokenizer.get(), module.get()));
+
+  // Create IOManager
+  std::unique_ptr<IOManager> io_manager = std::make_unique<IOManager>();
+
+  // Create text_decoder_runner
+  auto text_decoder_runner =
+      std::make_unique<MultimodalDecoderRunner>(module.get(), io_manager.get());
+
+  // Create multimodal_prefiller
+  auto multimodal_prefiller = std::make_unique<MultimodalPrefiller>(
+      module.get(), text_decoder_runner.get(), tokenizer.get(), io_manager.get());
+
+  // Create text_token_generator with stats
+  auto stats = std::make_unique<Stats>();
+  auto text_token_generator = std::make_unique<TextTokenGenerator>(
+      tokenizer.get(),
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
+      std::move(eos_ids),
+      stats.get());
+
+  // Create and return the MultimodalRunner instance
+  return std::make_unique<MultimodalRunner>(
+      std::move(metadata),
+      std::move(tokenizer),
+      std::move(module),
+      std::move(text_decoder_runner),
+      std::move(multimodal_prefiller),
+      std::move(io_manager),
+      std::move(text_token_generator),
+      std::move(stats));
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
```

extension/llm/runner/llm_runner_helper.h

Lines changed: 17 additions & 0 deletions
```diff
@@ -103,6 +103,23 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
     std::optional<const std::string> data_path = std::nullopt,
     float temperature = -1.0f);
 
+/**
+ * @brief Creates a MultimodalRunner instance with dependency injection
+ *
+ * This factory function creates and initializes a MultimodalRunner with all
+ * necessary components for multimodal text generation.
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Initialized tokenizer instance
+ * @param data_path Optional path to additional .ptd required by the model
+ * @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
+ * instance, or nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path = std::nullopt);
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
```
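
A hedged sketch of the optional `data_path` parameter: for models whose weights live in a separate `.ptd` file, the caller passes that path and the factory constructs the `Module` with both files (the `data_path.has_value()` branch in llm_runner_helper.cpp above). The file paths below are illustrative, and `tokenizer` is assumed to be a loaded `std::unique_ptr<::tokenizers::Tokenizer>` as in the earlier sketch:

```cpp
// Sketch only: paths are illustrative, not from this commit.
auto runner = executorch::extension::llm::create_multimodal_runner(
    "/models/multimodal.pte",
    std::move(tokenizer),
    std::string("/models/multimodal.ptd")); // optional .ptd weight file
if (runner == nullptr) {
  // The factory logs and returns nullptr, e.g. for a null/unloaded tokenizer.
}
```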

extension/llm/runner/multimodal_decoder_runner.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -5,6 +5,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#pragma once
 
 #include <executorch/extension/llm/runner/constants.h>
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
```
extension/llm/runner/multimodal_runner.cpp

Lines changed: 181 additions & 0 deletions

```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Implementation of MultimodalRunner for multimodal input and text output LLMs

#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/extension/llm/runner/util.h>
#include <executorch/runtime/platform/runtime.h>
#include <pytorch/tokenizers/hf_tokenizer.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/sentencepiece.h>
#include <pytorch/tokenizers/tiktoken.h>

namespace executorch {
namespace extension {
namespace llm {

using ::executorch::extension::Module;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

MultimodalRunner::MultimodalRunner(
    std::unordered_map<std::string, int64_t> metadata,
    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
    std::unique_ptr<Module> module,
    std::unique_ptr<MultimodalDecoderRunner> text_decoder_runner,
    std::unique_ptr<MultimodalPrefiller> multimodal_prefiller,
    std::unique_ptr<IOManager> io_manager,
    std::unique_ptr<TextTokenGenerator> text_token_generator,
    std::unique_ptr<Stats> stats)
    : metadata_(std::move(metadata)),
      tokenizer_(std::move(tokenizer)),
      module_(std::move(module)),
      text_decoder_runner_(std::move(text_decoder_runner)),
      multimodal_prefiller_(std::move(multimodal_prefiller)),
      io_manager_(std::move(io_manager)),
      text_token_generator_(std::move(text_token_generator)),
      stats_(std::move(stats)),
      pos_(0) {}

bool MultimodalRunner::is_loaded() {
  return multimodal_prefiller_->is_method_loaded() &&
      text_token_generator_->is_loaded();
}

Error MultimodalRunner::load() {
  if (is_loaded()) {
    return Error::Ok;
  }
  ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
  ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
  return Error::Ok;
}

// Don't print with the same priority during warmup
#define RUNNER_ET_LOG(warmup, format, ...) \
  if (warmup) {                            \
    ET_LOG(Debug, format, __VA_ARGS__);    \
  } else {                                 \
    ET_LOG(Info, format, __VA_ARGS__);     \
  }

Error MultimodalRunner::generate(
    const std::vector<MultimodalInput>& inputs,
    const GenerationConfig& config,
    std::function<void(const std::string&)>& token_callback,
    std::function<void(const Stats&)>& stats_callback) {
  if (inputs.empty()) {
    ET_LOG(Error, "MultimodalInput vector cannot be empty");
    return Error::InvalidArgument;
  }

  if (!is_loaded()) {
    stats_->model_load_start_ms = time_in_ms();
    ET_CHECK_OK_OR_RETURN_ERROR(load());
    stats_->model_load_end_ms = time_in_ms();
  }

  if (config.warming) {
    ET_LOG(Info, "Doing a warmup run...");
  }

  RUNNER_ET_LOG(
      config.warming,
      "RSS after loading model: %f MiB (0 if unsupported)",
      get_rss_bytes() / 1024.0 / 1024.0);

  // Wrap the token_callback with print function
  std::function<void(const std::string&)> wrapped_callback =
      [token_callback, config](const std::string& piece) {
        if (!config.warming) {
          safe_printf(piece.c_str());
          fflush(stdout);
        }
        if (token_callback) {
          token_callback(piece);
        }
      };

  // Reset internal state and start inference
  stats_->inference_start_ms = time_in_ms();

  uint64_t prefill_next_token = 0;
  // Process multimodal inputs in order
  for (const MultimodalInput& input : inputs) {
    prefill_next_token = ET_UNWRAP(multimodal_prefiller_->prefill(input, pos_));
  }

  stats_->first_token_ms = time_in_ms();
  stats_->prompt_eval_end_ms = time_in_ms();
  stats_->num_prompt_tokens = pos_;

  wrapped_callback(ET_UNWRAP_TOKENIZER(
      tokenizer_->decode(prefill_next_token, prefill_next_token)));

  RUNNER_ET_LOG(
      config.warming,
      "RSS after multimodal input processing: %f MiB (0 if unsupported)",
      get_rss_bytes() / 1024.0 / 1024.0);

  // Resolve max_new_tokens based on config
  int64_t max_context_len =
      metadata_.at(kMaxContextLen) - 0; // No start_pos offset
  int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_);

  ET_LOG(
      Info,
      "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64,
      max_new_tokens,
      pos_,
      max_context_len);

  ET_CHECK_OR_RETURN_ERROR(
      max_new_tokens > 0,
      InvalidArgument,
      "Max new tokens %d is less than or equal to 0",
      max_new_tokens);

  // Generate tokens using the text token generator
  std::vector<uint64_t> prompt_tokens = {prefill_next_token};
  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
      /*tokens=*/prompt_tokens,
      /*start_pos=*/pos_,
      /*max_new_tokens=*/max_new_tokens -
          1, // Subtract 1 because prefill already generated 1 token
      /*temperature=*/config.temperature,
      /*token_callback=*/wrapped_callback));

  pos_ += num_generated_tokens;
  // Update stats
  stats_->num_generated_tokens = num_generated_tokens;
  // Finalize stats and call callback
  stats_->inference_end_ms = time_in_ms();
  if (!config.warming) {
    printf("\n");
  }

  if (config.warming) {
    ET_LOG(Info, "Warmup run finished!");
  } else {
    // Do not print report during warmup
    print_report(*stats_);
  }

  if (stats_callback) {
    stats_callback(*stats_);
  }

  return Error::Ok;
}

} // namespace llm
} // namespace extension
} // namespace executorch
```
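
Because `generate()` forwards the finalized `*stats_` to `stats_callback`, a caller can derive its own throughput numbers from the fields populated above. A minimal sketch, using only the `Stats` fields that appear in this file (`prompt_eval_end_ms`, `inference_end_ms`, `num_generated_tokens`); the millisecond units follow the `time_in_ms()` usage above:

```cpp
#include <cinttypes>
#include <cstdio>
#include <functional>

// Sketch: compute decode throughput from the Stats fields that
// MultimodalRunner::generate() populates above.
std::function<void(const executorch::extension::llm::Stats&)> on_stats =
    [](const executorch::extension::llm::Stats& s) {
      // Time spent in the autoregressive loop, after prompt prefill ended.
      const double decode_ms =
          static_cast<double>(s.inference_end_ms - s.prompt_eval_end_ms);
      if (decode_ms > 0) {
        std::printf(
            "generated %" PRId64 " tokens at %.2f tok/s\n",
            static_cast<int64_t>(s.num_generated_tokens),
            s.num_generated_tokens / (decode_ms / 1000.0));
      }
    };
```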
