
Commit 69e5c40

larryliu0820 authored and facebook-github-bot committed
Add a generic multimodal runner (#13166)
Summary: This diff adds a generic multimodal runner for ExecuTorch. It includes changes to the `multimodal_prefiller.h` file, which adds a `prefill` method that takes a `MultimodalInput` object and returns the next token of the LLM module after prefill. It also includes changes to the `multimodal_runner.cpp` file, which implements the `MultimodalRunner` class for multimodal input and text output LLMs. The `MultimodalRunner` class uses the `MultimodalPrefiller` to prefill the KV cache of the model, then uses `TextTokenGenerator` to run the autoregressive generation loop. See diagram:

```
IRunner <<interface>>
  + is_loaded()  + load()  + generate()  + stop()
    △ implemented by TextLLMRunner and MultimodalRunner

TextLLMRunner (implements IRunner)
  - tokenizer_  - module_  - stats_  - metadata_  - temperature_
  consists of: TextTokenGenerator, TextDecoderRunner, TextPrefiller

MultimodalRunner (implements IRunner)
  - tokenizer_  - module_  - stats_  - metadata_  - pos_
  consists of: TextTokenGenerator, MultimodalDecoderRunner, MultimodalPrefiller

TextTokenGenerator (shared by both runners)
  - tokenizer_*  - text_decoder_runner_  - eos_ids_  - use_kv_cache_  - stats_*
  + generate()
  uses: TextDecoderRunner / MultimodalDecoderRunner

TextDecoderRunner
  - module_*  - should_stop_
  + step()  + logits_to_token()

MultimodalDecoderRunner (extends TextDecoderRunner)
  - module_*  - should_stop_
  + step()  + logits_to_token()

TextPrefiller (uses TextDecoderRunner)
  - text_decoder_runner_  - use_kv_cache_  - enable_parallel_prefill_
  + prefill()

MultimodalPrefiller (uses MultimodalDecoderRunner)
  - module_*
  + prefill()  + logits_to_token()
```

Reviewed By: jackzhxng

Differential Revision: D79231625
1 parent fe84495 · commit 69e5c40
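For orientation, the snippet below sketches how a client might drive the new runner end to end: load a tokenizer, build the runner via the new factory, and stream tokens from `generate()`. This is a minimal sketch, not code from this diff: the model and tokenizer paths are placeholders, and the `load_tokenizer` helper and the text constructor of `MultimodalInput` are assumed from the runner headers this stack touches.

```cpp
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>

using namespace ::executorch::extension::llm;

int main() {
  // Placeholder paths: any exported multimodal .pte and its tokenizer file.
  auto tokenizer = load_tokenizer("tokenizer.model");  // assumed helper
  auto runner = create_multimodal_runner(
      "multimodal_model.pte", std::move(tokenizer));
  if (!runner || runner->load() != ::executorch::runtime::Error::Ok) {
    return 1;
  }

  // Inputs are prefilled in order; image inputs would be appended the same
  // way. The text constructor of MultimodalInput is an assumption here.
  std::vector<MultimodalInput> inputs;
  inputs.emplace_back(MultimodalInput("What is the capital of France? "));

  GenerationConfig config;  // defaults
  std::function<void(const std::string&)> on_token =
      [](const std::string& piece) { (void)piece; /* runner also prints */ };
  std::function<void(const Stats&)> on_stats = [](const Stats&) {};

  // generate() prefills the KV cache, then runs the autoregressive loop.
  runner->generate(inputs, config, on_token, on_stats);
  return 0;
}
```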

File tree

8 files changed: +889 −91 lines


extension/llm/runner/README.md

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 63 additions & 0 deletions
@@ -8,7 +8,11 @@
 
 // Implementation of helper utilities for creating and configuring LLM runners
 
+#include <executorch/extension/llm/runner/image_prefiller.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
@@ -205,6 +209,65 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
       temperature);
 }
 
+std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path,
+    float temperature) {
+  // Sanity check tokenizer
+  if (!tokenizer || !tokenizer->is_loaded()) {
+    ET_LOG(Error, "Tokenizer is null or not loaded");
+    return nullptr;
+  }
+
+  // Create the Module
+  std::unique_ptr<Module> module;
+  if (data_path.has_value()) {
+    module = std::make_unique<Module>(
+        model_path, data_path.value(), Module::LoadMode::File);
+  } else {
+    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+  }
+
+  // Get metadata from Module
+  ET_LOG(Info, "Reading metadata from model");
+  auto metadata = get_llm_metadata(tokenizer.get(), module.get());
+
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
+      get_eos_ids(tokenizer.get(), module.get()));
+
+  // Create IOManager
+  std::unique_ptr<IOManager> io_manager = std::make_unique<IOManager>();
+
+  // Create text_decoder_runner
+  auto text_decoder_runner =
+      std::make_unique<MultimodalDecoderRunner>(module.get(), io_manager.get());
+
+  // Create multimodal_prefiller
+  auto multimodal_prefiller = std::make_unique<MultimodalPrefiller>(
+      module.get(), text_decoder_runner.get(), tokenizer.get(), io_manager.get());
+
+  // Create text_token_generator with stats
+  auto stats = std::make_unique<Stats>();
+  auto text_token_generator = std::make_unique<TextTokenGenerator>(
+      tokenizer.get(),
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
+      std::move(eos_ids),
+      stats.get());
+
+  // Create and return the MultimodalRunner instance
+  return std::make_unique<MultimodalRunner>(
+      std::move(metadata),
+      std::move(tokenizer),
+      std::move(module),
+      std::move(text_decoder_runner),
+      std::move(multimodal_prefiller),
+      std::move(io_manager),
+      std::move(text_token_generator),
+      std::move(stats));
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch

extension/llm/runner/llm_runner_helper.h

Lines changed: 17 additions & 0 deletions
@@ -103,6 +103,23 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
     std::optional<const std::string> data_path = std::nullopt,
     float temperature = -1.0f);
 
+/**
+ * @brief Creates a MultimodalRunner instance with dependency injection
+ *
+ * This factory function creates and initializes a MultimodalRunner with all
+ * necessary components for multimodal text generation.
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Initialized tokenizer instance
+ * @param data_path Optional path to additional .ptd required by the model
+ * @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
+ * instance, or nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path = std::nullopt);
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
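As a quick illustration of this factory's contract (it returns nullptr on a bad tokenizer, and `data_path` points at an optional `.ptd` weights sidecar), here is a hedged call-site sketch; the paths are placeholders and `load_tokenizer` is assumed to come from this same helper header:

```cpp
// Sketch only: placeholder paths, assumed load_tokenizer helper.
auto tokenizer = load_tokenizer("tokenizer.model");
auto runner = create_multimodal_runner(
    "model.pte", std::move(tokenizer), /*data_path=*/"model.ptd");
if (runner == nullptr) {
  // The tokenizer was null or not loaded, so the factory refused to build.
}
```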

extension/llm/runner/multimodal_decoder_runner.h

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#pragma once
 
 #include <executorch/extension/llm/runner/constants.h>
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
extension/llm/runner/multimodal_runner.cpp

Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Implementation of MultimodalRunner for multimodal input and text output LLMs
+
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
+#include <executorch/extension/llm/runner/util.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <pytorch/tokenizers/sentencepiece.h>
+#include <pytorch/tokenizers/tiktoken.h>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+namespace {
+// Default preset prompt for multimodal models
+const std::string kDefaultPresetPrompt =
+    "A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions. USER: ";
+} // namespace
+
+MultimodalRunner::MultimodalRunner(
+    std::unordered_map<std::string, int64_t> metadata,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::unique_ptr<Module> module,
+    std::unique_ptr<MultimodalDecoderRunner> text_decoder_runner,
+    std::unique_ptr<MultimodalPrefiller> multimodal_prefiller,
+    std::unique_ptr<IOManager> io_manager,
+    std::unique_ptr<TextTokenGenerator> text_token_generator,
+    std::unique_ptr<Stats> stats)
+    : metadata_(std::move(metadata)),
+      tokenizer_(std::move(tokenizer)),
+      module_(std::move(module)),
+      text_decoder_runner_(std::move(text_decoder_runner)),
+      multimodal_prefiller_(std::move(multimodal_prefiller)),
+      io_manager_(std::move(io_manager)),
+      text_token_generator_(std::move(text_token_generator)),
+      stats_(std::move(stats)),
+      pos_(0) {}
+
+bool MultimodalRunner::is_loaded() {
+  return multimodal_prefiller_->is_method_loaded() &&
+      text_token_generator_->is_loaded();
+}
+
+Error MultimodalRunner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+  ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
+  ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+  return Error::Ok;
+}
+
+// Don't print with the same priority during warmup
+#define RUNNER_ET_LOG(warmup, format, ...) \
+  if (warmup) {                            \
+    ET_LOG(Debug, format, __VA_ARGS__);    \
+  } else {                                 \
+    ET_LOG(Info, format, __VA_ARGS__);     \
+  }
+
+Error MultimodalRunner::generate(
+    const std::vector<MultimodalInput>& inputs,
+    const GenerationConfig& config,
+    std::function<void(const std::string&)>& token_callback,
+    std::function<void(const Stats&)>& stats_callback) {
+  if (inputs.empty()) {
+    ET_LOG(Error, "MultimodalInput vector cannot be empty");
+    return Error::InvalidArgument;
+  }
+
+  if (!is_loaded()) {
+    stats_->model_load_start_ms = time_in_ms();
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+    stats_->model_load_end_ms = time_in_ms();
+  }
+
+  if (config.warming) {
+    ET_LOG(Info, "Doing a warmup run...");
+  }
+
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after loading model: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  // Wrap the token_callback with print function
+  std::function<void(const std::string&)> wrapped_callback =
+      [token_callback, config](const std::string& piece) {
+        if (!config.warming) {
+          safe_printf(piece.c_str());
+          fflush(stdout);
+        }
+        if (token_callback) {
+          token_callback(piece);
+        }
+      };
+
+  // Reset internal state and start inference
+  stats_->inference_start_ms = time_in_ms();
+
+  uint64_t prefill_next_token = 0;
+  // Process multimodal inputs in order
+  for (const MultimodalInput& input : inputs) {
+    prefill_next_token = ET_UNWRAP(multimodal_prefiller_->prefill(input, pos_));
+  }
+
+  stats_->first_token_ms = time_in_ms();
+  stats_->prompt_eval_end_ms = time_in_ms();
+  stats_->num_prompt_tokens = pos_;
+
+  wrapped_callback(ET_UNWRAP_TOKENIZER(
+      tokenizer_->decode(prefill_next_token, prefill_next_token)));
+
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after multimodal input processing: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  // Resolve max_new_tokens based on config
+  int64_t max_context_len =
+      metadata_.at(kMaxContextLen) - 0; // No start_pos offset
+  int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_);
+
+  ET_LOG(
+      Info,
+      "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64,
+      max_new_tokens,
+      pos_,
+      max_context_len);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      max_new_tokens > 0,
+      InvalidArgument,
+      "Max new tokens %d is less than or equal to 0",
+      max_new_tokens);
+
+  // Generate tokens using the text token generator
+  std::vector<uint64_t> prompt_tokens = {prefill_next_token};
+  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
+      /*tokens=*/prompt_tokens,
+      /*start_pos=*/pos_,
+      /*max_new_tokens=*/max_new_tokens -
+          1, // Subtract 1 because prefill already generated 1 token
+      /*temperature=*/config.temperature,
+      /*token_callback=*/wrapped_callback));
+
+  pos_ += num_generated_tokens;
+  // Update stats
+  stats_->num_generated_tokens = num_generated_tokens;
+  // Finalize stats and call callback
+  stats_->inference_end_ms = time_in_ms();
+  if (!config.warming) {
+    printf("\n");
+  }
+
+  if (config.warming) {
+    ET_LOG(Info, "Warmup run finished!");
+  } else {
+    // Do not print report during warmup
+    print_report(*stats_);
+  }
+
+  if (stats_callback) {
+    stats_callback(*stats_);
+  }
+
+  return Error::Ok;
+}
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
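One detail worth noting in the implementation above: `config.warming` routes logs to Debug, suppresses stdout printing, and skips the perf report, so the same `generate()` call doubles as a warmup pass. A hedged sketch follows, reusing `runner` and `inputs` from the sketch near the top of this page; the `max_new_tokens` field is assumed from `GenerationConfig`:

```cpp
GenerationConfig warmup_config;
warmup_config.warming = true;      // Debug-level logs, no stdout, no report
warmup_config.max_new_tokens = 8;  // assumed GenerationConfig field

std::function<void(const std::string&)> noop_token = [](const std::string&) {};
std::function<void(const Stats&)> noop_stats = [](const Stats&) {};

// Warms up caches and allocations; stats are still collected internally,
// but the report is not printed during warmup.
runner->generate(inputs, warmup_config, noop_token, noop_stats);
```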
