
Commit 83749ae

Add a generic multimodal runner (#13166)
Summary: This diff adds a generic multimodal runner for ExecuTorch. It includes changes to the `image_prefiller.h` file, which adds a `prefill` method that takes an `Image` object and returns the next token of the LLM module after prefill. It also includes changes to the `multimodal_runner.cpp` file, which implements the `MultimodalRunner` class for multimodal-input, text-output LLMs. The `MultimodalRunner` class uses the `ImagePrefiller` and `TextPrefiller` classes to prefill the KV cache of the model, then uses `TextTokenGenerator` to run the autoregressive generation loop. See diagram:

```
                   ┌─────────────────┐
                   │     IRunner     │
                   │  <<interface>>  │
                   │ + is_loaded()   │
                   │ + load()        │
                   │ + generate()    │
                   │ + stop()        │
                   └────────△────────┘
                            │ implements
             ┌──────────────┴──────────────┐
             │                             │
   ┌─────────────────┐           ┌─────────────────┐
   │  TextLLMRunner  │           │MultimodalRunner │
   │ - tokenizer_    │           │ - tokenizer_    │
   │ - module_       │           │ - module_       │
   │ - stats_        │           │ - stats_        │
   │ - metadata_     │           │ - metadata_     │
   │ - temperature_  │           │ - pos_          │
   └─────────────────┘           └─────────────────┘

Both runners consist of a TextTokenGenerator and a TextPrefiller:

   ┌──────────────────────┐      ┌─────────────────────┐
   │  TextTokenGenerator  │      │    TextPrefiller    │
   │ - tokenizer_*        │      │ - text_decoder_     │
   │ - text_decoder_      │      │   runner_           │
   │   runner_            │      │ - use_kv_cache_     │
   │ - eos_ids_           │      │ - enable_parallel_  │
   │ - use_kv_cache_      │      │   prefill_          │
   │ - stats_*            │      │                     │
   │ + generate()         │      │ + prefill()         │
   └──────────┬───────────┘      └─────────────────────┘
              │ uses
     ┌────────┴──────────────────┐
     ▼                           ▼
┌─────────────────┐   extends  ┌──────────────────┐
│TextDecoderRunner│◄───────────│MultimodalTextDe- │
│ - module_*      │            │coderRunner       │
│ - should_stop_  │            │ - module_*       │
│ + step()        │            │ - should_stop_   │
│ + logits_to_    │            │ + step()         │
│   token()       │            │ + logits_to_     │
└─────────────────┘            │   token()        │
                               └──────────────────┘

MultimodalRunner additionally consists of an ImagePrefiller:

┌─────────────────┐
│ ImagePrefiller  │
│ - module_*      │
│ + prefill()     │
│ + logits_to_    │
│   token()       │
└─────────────────┘
```

Differential Revision: D79231625
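For orientation, here is a minimal usage sketch of the API this diff introduces (`create_multimodal_runner` plus `MultimodalRunner::generate()`). The model and tokenizer paths, the tokenizer type, and the `make_text_input` call are illustrative assumptions, not part of this diff; field names follow the existing `GenerationConfig`.

```cpp
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_input.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <pytorch/tokenizers/hf_tokenizer.h>

#include <cstdio>
#include <functional>
#include <memory>
#include <vector>

using namespace executorch::extension::llm;

int main() {
  // Hypothetical artifact paths; load a tokenizer first, then the runner.
  auto tokenizer = std::make_unique<tokenizers::HFTokenizer>();
  if (tokenizer->load("tokenizer.json") != tokenizers::Error::Ok) {
    return 1;
  }
  auto runner = create_multimodal_runner("model.pte", std::move(tokenizer));
  if (!runner || runner->load() != executorch::runtime::Error::Ok) {
    return 1;
  }

  // Inputs are prefilled in order; an image input could precede the text.
  std::vector<MultimodalInput> inputs;
  inputs.emplace_back(make_text_input("What is in this image?"));

  GenerationConfig config;
  config.max_new_tokens = 64;

  // generate() takes the callbacks by reference, so they must be lvalues.
  std::function<void(const std::string&)> on_token =
      [](const std::string& piece) { printf("%s", piece.c_str()); };
  std::function<void(const Stats&)> on_stats = [](const Stats&) {};
  runner->generate(inputs, config, on_token, on_stats);
  return 0;
}
```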
1 parent bf2f52b commit 83749ae

File tree

10 files changed: +886 −110 lines changed


extension/llm/runner/README.md

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 68 additions & 7 deletions
```diff
@@ -5,10 +5,14 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-
+// @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
 // Implementation of helper utilities for creating and configuring LLM runners
 
+#include <executorch/extension/llm/runner/image_prefiller.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
@@ -19,9 +23,7 @@
 #include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/tiktoken.h>
 
-namespace executorch {
-namespace extension {
-namespace llm {
+namespace executorch::extension::llm {
 
 using ::executorch::extension::Module;
 using ::executorch::runtime::Error;
@@ -205,6 +207,65 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
       temperature);
 }
 
-} // namespace llm
-} // namespace extension
-} // namespace executorch
+std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path) {
+  // Sanity check tokenizer
+  if (!tokenizer || !tokenizer->is_loaded()) {
+    ET_LOG(Error, "Tokenizer is null or not loaded");
+    return nullptr;
+  }
+
+  // Create the Module
+  std::unique_ptr<Module> module;
+  if (data_path.has_value()) {
+    module = std::make_unique<Module>(
+        model_path, data_path.value(), Module::LoadMode::File);
+  } else {
+    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+  }
+
+  // Get metadata from Module
+  ET_LOG(Info, "Reading metadata from model");
+  auto metadata = get_llm_metadata(tokenizer.get(), module.get());
+
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
+      get_eos_ids(tokenizer.get(), module.get()));
+
+  // Create IOManager
+  std::unique_ptr<IOManager> io_manager = std::make_unique<IOManager>();
+
+  // Create text_decoder_runner
+  auto text_decoder_runner =
+      std::make_unique<MultimodalDecoderRunner>(module.get(), io_manager.get());
+
+  // Create multimodal_prefiller
+  auto multimodal_prefiller = std::make_unique<MultimodalPrefiller>(
+      module.get(),
+      text_decoder_runner.get(),
+      tokenizer.get(),
+      io_manager.get());
+
+  // Create text_token_generator with stats
+  auto stats = std::make_unique<Stats>();
+  auto text_token_generator = std::make_unique<TextTokenGenerator>(
+      tokenizer.get(),
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
+      std::move(eos_ids),
+      stats.get());
+
+  // Create and return the MultimodalRunner instance
+  return std::make_unique<MultimodalRunner>(
+      std::move(metadata),
+      std::move(tokenizer),
+      std::move(module),
+      std::move(text_decoder_runner),
+      std::move(multimodal_prefiller),
+      std::move(io_manager),
+      std::move(text_token_generator),
+      std::move(stats));
+}
+
+} // namespace executorch::extension::llm
```
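One detail worth noting in `create_multimodal_runner`: components receive non-owning raw pointers (`module.get()`, `stats.get()`, and so on) before the `unique_ptr`s are moved into the `MultimodalRunner`, which then keeps every dependency alive at least as long as its consumers. A standalone sketch of that pattern with hypothetical `Dep`/`Consumer`/`Owner` types (not ExecuTorch API):

```cpp
#include <memory>
#include <utility>

struct Dep { int value = 42; };

// Consumer keeps a non-owning pointer; it must not outlive the Dep.
struct Consumer {
  explicit Consumer(Dep* dep) : dep_(dep) {}
  Dep* dep_;
};

// Owner takes ownership of both, guaranteeing the Dep outlives the Consumer.
struct Owner {
  Owner(std::unique_ptr<Dep> dep, std::unique_ptr<Consumer> consumer)
      : dep_(std::move(dep)), consumer_(std::move(consumer)) {}
  std::unique_ptr<Dep> dep_;
  std::unique_ptr<Consumer> consumer_;
};

int main() {
  auto dep = std::make_unique<Dep>();
  // Hand out the raw pointer first, then move the unique_ptrs; the moves
  // transfer ownership but do not invalidate the raw pointer.
  auto consumer = std::make_unique<Consumer>(dep.get());
  Owner owner(std::move(dep), std::move(consumer));
  return owner.consumer_->dep_->value == 42 ? 0 : 1;
}
```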

extension/llm/runner/llm_runner_helper.h

Lines changed: 19 additions & 6 deletions
```diff
@@ -22,9 +22,7 @@
 #include <executorch/runtime/platform/compiler.h>
 #include <pytorch/tokenizers/tokenizer.h>
 
-namespace executorch {
-namespace extension {
-namespace llm {
+namespace executorch::extension::llm {
 
 // Forward declarations
 class TextLLMRunner;
@@ -103,6 +101,21 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
     std::optional<const std::string> data_path = std::nullopt,
     float temperature = -1.0f);
 
-} // namespace llm
-} // namespace extension
-} // namespace executorch
+/**
+ * @brief Creates a MultimodalRunner instance with dependency injection
+ *
+ * This factory function creates and initializes a MultimodalRunner with all
+ * necessary components for multimodal text generation.
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Initialized tokenizer instance
+ * @param data_path Optional path to additional .ptd required by the model
+ * @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
+ *     instance, or nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path = std::nullopt);
+
+} // namespace executorch::extension::llm
```
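As a usage note, the optional `data_path` parameter might be supplied as follows; this is a hedged sketch where "model.pte" and "model.ptd" are placeholder artifact names and `tokenizer` is assumed to be an already loaded `std::unique_ptr<::tokenizers::Tokenizer>`.

```cpp
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>

#include <memory>
#include <optional>
#include <string>
#include <utility>

using namespace executorch::extension::llm;

// Sketch: pass a separate .ptd weight/data file alongside the program file.
std::unique_ptr<MultimodalRunner> make_runner_with_data_file(
    std::unique_ptr<::tokenizers::Tokenizer> tokenizer) {
  return create_multimodal_runner(
      "model.pte", // placeholder program path
      std::move(tokenizer),
      std::make_optional<const std::string>("model.ptd")); // placeholder .ptd
}
```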

extension/llm/runner/multimodal_decoder_runner.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -5,6 +5,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#pragma once
 
 #include <executorch/extension/llm/runner/constants.h>
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
```

extension/llm/runner/multimodal_input.h

Lines changed: 2 additions & 6 deletions
```diff
@@ -16,9 +16,7 @@
 #include <string>
 #include <variant>
 
-namespace executorch {
-namespace extension {
-namespace llm {
+namespace executorch::extension::llm {
 
 /**
  * A generic class to hold either image or text data for multimodal inputs.
@@ -181,6 +179,4 @@ inline MultimodalInput make_image_input(Image&& image) noexcept {
   return MultimodalInput(std::move(image));
 }
 
-} // namespace llm
-} // namespace extension
-} // namespace executorch
+} // namespace executorch::extension::llm
```
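For illustration, a hedged sketch of composing mixed inputs with the helpers from this header; it assumes a `make_text_input` counterpart to the `make_image_input` shown in the diff, and the `Image` construction details are left to the caller since they are not part of this hunk.

```cpp
#include <executorch/extension/llm/runner/multimodal_input.h>

#include <utility>
#include <vector>

using executorch::extension::llm::Image;
using executorch::extension::llm::MultimodalInput;
using executorch::extension::llm::make_image_input;
using executorch::extension::llm::make_text_input; // assumed counterpart

// Build an image-then-prompt sequence; the runner prefills them in order.
std::vector<MultimodalInput> build_inputs(Image&& image) {
  std::vector<MultimodalInput> inputs;
  inputs.push_back(make_image_input(std::move(image)));
  inputs.push_back(make_text_input("Describe the image."));
  return inputs;
}
```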
extension/llm/runner/multimodal_runner.cpp

Lines changed: 174 additions & 0 deletions (new file)
```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Implementation of MultimodalRunner for multimodal input and text output LLMs

#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/extension/llm/runner/util.h>
#include <executorch/runtime/platform/runtime.h>
#include <pytorch/tokenizers/hf_tokenizer.h>
#include <pytorch/tokenizers/sentencepiece.h>

namespace executorch::extension::llm {

using ::executorch::extension::Module;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

MultimodalRunner::MultimodalRunner(
    std::unordered_map<std::string, int64_t> metadata,
    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
    std::unique_ptr<Module> module,
    std::unique_ptr<MultimodalDecoderRunner> text_decoder_runner,
    std::unique_ptr<MultimodalPrefiller> multimodal_prefiller,
    std::unique_ptr<IOManager> io_manager,
    std::unique_ptr<TextTokenGenerator> text_token_generator,
    std::unique_ptr<Stats> stats)
    : metadata_(std::move(metadata)),
      tokenizer_(std::move(tokenizer)),
      module_(std::move(module)),
      text_decoder_runner_(std::move(text_decoder_runner)),
      multimodal_prefiller_(std::move(multimodal_prefiller)),
      io_manager_(std::move(io_manager)),
      text_token_generator_(std::move(text_token_generator)),
      stats_(std::move(stats)),
      pos_(0) {}

bool MultimodalRunner::is_loaded() {
  return multimodal_prefiller_->is_method_loaded() &&
      text_token_generator_->is_loaded();
}

Error MultimodalRunner::load() {
  if (is_loaded()) {
    return Error::Ok;
  }
  ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
  ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
  return Error::Ok;
}

// Don't print with the same priority during warmup
#define RUNNER_ET_LOG(warmup, format, ...) \
  if (warmup) {                            \
    ET_LOG(Debug, format, __VA_ARGS__);    \
  } else {                                 \
    ET_LOG(Info, format, __VA_ARGS__);     \
  }

Error MultimodalRunner::generate(
    const std::vector<MultimodalInput>& inputs,
    const GenerationConfig& config,
    std::function<void(const std::string&)>& token_callback,
    std::function<void(const Stats&)>& stats_callback) {
  if (inputs.empty()) {
    ET_LOG(Error, "MultimodalInput vector cannot be empty");
    return Error::InvalidArgument;
  }

  if (!is_loaded()) {
    stats_->model_load_start_ms = time_in_ms();
    ET_CHECK_OK_OR_RETURN_ERROR(load());
    stats_->model_load_end_ms = time_in_ms();
  }

  if (config.warming) {
    ET_LOG(Info, "Doing a warmup run...");
  }

  RUNNER_ET_LOG(
      config.warming,
      "RSS after loading model: %f MiB (0 if unsupported)",
      get_rss_bytes() / 1024.0 / 1024.0);

  // Wrap the token_callback with print function
  std::function<void(const std::string&)> wrapped_callback =
      [token_callback, config](const std::string& piece) {
        if (!config.warming) {
          safe_printf(piece.c_str());
          fflush(stdout);
        }
        if (token_callback) {
          token_callback(piece);
        }
      };

  // Reset internal state and start inference
  stats_->inference_start_ms = time_in_ms();

  uint64_t prefill_next_token = 0;
  // Process multimodal inputs in order
  for (const MultimodalInput& input : inputs) {
    prefill_next_token = ET_UNWRAP(multimodal_prefiller_->prefill(input, pos_));
  }

  stats_->first_token_ms = time_in_ms();
  stats_->prompt_eval_end_ms = time_in_ms();
  stats_->num_prompt_tokens = pos_;

  wrapped_callback(ET_UNWRAP_TOKENIZER(
      tokenizer_->decode(prefill_next_token, prefill_next_token)));

  RUNNER_ET_LOG(
      config.warming,
      "RSS after multimodal input processing: %f MiB (0 if unsupported)",
      get_rss_bytes() / 1024.0 / 1024.0);

  // Resolve max_new_tokens based on config
  int64_t max_context_len =
      metadata_.at(kMaxContextLen) - 0; // No start_pos offset
  int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_);

  ET_LOG(
      Info,
      "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64,
      max_new_tokens,
      pos_,
      max_context_len);

  ET_CHECK_OR_RETURN_ERROR(
      max_new_tokens > 0,
      InvalidArgument,
      "Max new tokens %d is less than or equal to 0",
      max_new_tokens);

  // Generate tokens using the text token generator
  std::vector<uint64_t> prompt_tokens = {prefill_next_token};
  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
      /*tokens=*/prompt_tokens,
      /*start_pos=*/pos_,
      /*max_new_tokens=*/max_new_tokens -
          1, // Subtract 1 because prefill already generated 1 token
      /*temperature=*/config.temperature,
      /*token_callback=*/wrapped_callback));

  pos_ += num_generated_tokens;
  // Update stats
  stats_->num_generated_tokens = num_generated_tokens;
  // Finalize stats and call callback
  stats_->inference_end_ms = time_in_ms();
  if (!config.warming) {
    printf("\n");
  }

  if (config.warming) {
    ET_LOG(Info, "Warmup run finished!");
  } else {
    // Do not print report during warmup
    print_report(*stats_);
  }

  if (stats_callback) {
    stats_callback(*stats_);
  }

  return Error::Ok;
}

} // namespace executorch::extension::llm
```
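To make the token-budget arithmetic in `generate()` concrete, here is a worked sketch. The clamp-to-remaining-window rule is an assumption about what `resolve_max_new_tokens` does, not a quote of its implementation; the `- 1` matches the code above, since prefill already produced the first token.

```cpp
#include <algorithm>
#include <cstdint>

int main() {
  int64_t max_context_len = 2048; // metadata_.at(kMaxContextLen)
  int64_t pos = 600;              // positions consumed by multimodal prefill
  int32_t requested = 64;         // config.max_new_tokens

  // Assumed resolution rule: clamp the request to the remaining window.
  int32_t budget = static_cast<int32_t>(max_context_len - pos); // 1448
  int32_t max_new_tokens = std::min(requested, budget);         // 64

  // The generator is asked for one fewer token because prefill already
  // emitted the first one (prefill_next_token).
  int32_t generator_budget = max_new_tokens - 1; // 63
  return generator_budget == 63 ? 0 : 1;
}
```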
