Commit 8b11418

[multimodal] Let Audio take float data blob (#14427)
If the processed audio went through a Mel transform, the resulting spectrogram values are floats. The `Audio` class should be able to take this, since the multimodal runner pybind API will have to accept processed input. Once we have the pybind API we can do something like:

```python
model_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(model_id)

audio_url = "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav"
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "url": audio_url},
            {
                "type": "text",
                "text": "What can you tell me about this audio?",
            },
        ],
    },
]

inputs = processor.apply_chat_template(
    conversation, tokenize=True, return_dict=True, return_tensors="pt"
)

inputs_combined = [
    make_text_input("<s>[INST][BEGIN_AUDIO]"),
    make_audio_input(inputs["input_features"]),
    make_text_input("\nWhat can you tell me about this audio?[/INST]"),
]

runner = MultimodalRunner("voxtral.pte", "tekken.json", None)
config = GenerationConfig()
config.max_new_tokens = 100
runner.generate(inputs_combined, config)
```
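For reference, here is a minimal C++ sketch of the same idea using the APIs this commit touches: wrap float Mel-spectrogram data in `Audio` and hand it to `make_audio_input`. The `multimodal_input.h` include path and the shape values are assumptions, not taken from this diff.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

#include <executorch/extension/llm/runner/audio.h>
// Assumed location of MultimodalInput / make_audio_input.
#include <executorch/extension/llm/runner/multimodal_input.h>

using ::executorch::extension::llm::Audio;
using ::executorch::extension::llm::make_audio_input;
using ::executorch::extension::llm::MultimodalInput;

MultimodalInput wrap_spectrogram(std::vector<float>&& mel_features) {
  // Illustrative Whisper/Voxtral-style shape: 1 batch, 128 Mel bins, 3000 frames.
  // The Audio constructor checks that mel_features.size() == 1 * 128 * 3000.
  Audio audio(
      std::move(mel_features), /*batch_size=*/1, /*n_bins=*/128, /*n_frames=*/3000);
  return make_audio_input(std::move(audio));
}
```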
1 parent c780f05 commit 8b11418

3 files changed, +158 -47 lines changed

examples/models/voxtral/multimodal.cpp

Lines changed: 17 additions & 30 deletions
```diff
@@ -103,15 +103,13 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 
   ET_LOG(Info, "audio_data len = %zu", n_floats);
 
-  // Create Audio multimodal input
-  auto audio = std::make_unique<::executorch::extension::llm::Audio>();
-  audio->batch_size = batch_size;
-  audio->n_bins = n_bins;
-  audio->n_frames = n_frames;
-  audio->data.resize(n_floats * sizeof(float));
-  f.read(reinterpret_cast<char*>(audio->data.data()), n_floats * sizeof(float));
+  std::vector<float> audio_data(n_floats);
+  f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
   f.close();
-  return ::executorch::extension::llm::make_audio_input(std::move(*audio));
+
+  auto audio = ::executorch::extension::llm::Audio(
+      std::move(audio_data), batch_size, n_bins, n_frames);
+  return ::executorch::extension::llm::make_audio_input(std::move(audio));
 }
 
 /**
@@ -206,32 +204,21 @@ MultimodalInput processRawAudioFile(
       static_cast<int>(sizes[2]));
 
   // Create Audio multimodal input from processed features
-  auto processed_audio =
-      std::make_unique<::executorch::extension::llm::Audio>();
-  processed_audio->batch_size =
-      static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
-                                      // yet, so this will just be = 1.
-  processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
-  processed_audio->n_frames =
-      static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
-
-  size_t total_elements = processed_audio->batch_size *
-      processed_audio->n_bins * processed_audio->n_frames;
-  processed_audio->data.resize(total_elements * sizeof(float));
-  std::memcpy(
-      processed_audio->data.data(),
-      processed_data,
-      total_elements * sizeof(float));
-
+  int32_t batch_size = static_cast<int32_t>(sizes[0]);
+  int32_t n_bins = static_cast<int32_t>(sizes[1]);
+  int32_t n_frames = static_cast<int32_t>(sizes[2]);
+  size_t total_elements = batch_size * n_bins * n_frames;
+  std::vector<float> audio_vec(processed_data, processed_data + total_elements);
+  auto processed_audio = ::executorch::extension::llm::Audio(
+      std::move(audio_vec), batch_size, n_bins, n_frames);
   ET_LOG(
       Info,
       "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
-      processed_audio->batch_size,
-      processed_audio->n_bins,
-      processed_audio->n_frames);
-
+      batch_size,
+      n_bins,
+      n_frames);
   return ::executorch::extension::llm::make_audio_input(
-      std::move(*processed_audio));
+      std::move(processed_audio));
 }
 
 /**
```

extension/llm/runner/audio.h

Lines changed: 122 additions & 7 deletions
```diff
@@ -11,8 +11,11 @@
 #pragma once
 #include <executorch/runtime/platform/compiler.h>
 #include <cstdint>
+#include <variant>
 #include <vector>
 
+#include <executorch/extension/tensor/tensor.h>
+
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -29,14 +32,126 @@ struct ET_EXPERIMENTAL RawAudio {
 };
 
 /**
- * Pre-processed audio inputs, ready to feed directly into an audio
- * encoder.
+ * Pre-processed audio inputs, ready to feed directly into an audio encoder.
+ *
+ * The data can be either uint8_t or float. If the audio has gone through a Mel
+ * transform, we expect the data type to be float (i.e., std::vector<float>), as
+ * Mel spectrograms are typically represented as floating point values. For raw
+ * or quantized audio, uint8_t may be used instead.
  */
-struct ET_EXPERIMENTAL Audio {
-  std::vector<uint8_t> data;
-  int32_t batch_size;
-  int32_t n_bins;
-  int32_t n_frames;
+class ET_EXPERIMENTAL Audio final {
+ public:
+  // Default constructor
+  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}
+
+  // Constructor for uint8_t data
+  Audio(
+      std::vector<uint8_t>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 0 &&
+            std::get<std::vector<uint8_t>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<uint8_t>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Constructor for float data
+  Audio(
+      std::vector<float>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 1 &&
+            std::get<std::vector<float>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<float>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Type checkers
+  bool is_uint8() const {
+    return std::holds_alternative<std::vector<uint8_t>>(data_);
+  }
+
+  bool is_float() const {
+    return std::holds_alternative<std::vector<float>>(data_);
+  }
+
+  // Data access
+  const std::vector<uint8_t>& get_uint8_data() const& {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  std::vector<uint8_t>& get_uint8_data() & {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  const std::vector<float>& get_float_data() const& {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  std::vector<float>& get_float_data() & {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  int32_t get_batch_size() const {
+    return batch_size_;
+  }
+  int32_t get_n_bins() const {
+    return n_bins_;
+  }
+  int32_t get_n_frames() const {
+    return n_frames_;
+  }
+  /**
+   * Convert the audio data to a TensorPtr, with optional batch dimension.
+   * The tensor will have shape (batch_size, n_bins, n_frames) or (1,
+   * batch_size, n_bins, n_frames) if with_batch is true.
+   */
+  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+      bool with_batch = false) const {
+    std::vector<executorch::aten::SizesType> sizes = {
+        get_batch_size(), get_n_bins(), get_n_frames()};
+    if (with_batch) {
+      sizes.insert(sizes.begin(), 1);
+    }
+    if (is_float()) {
+      return executorch::extension::from_blob(
+          const_cast<float*>(get_float_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Float);
+    } else if (is_uint8()) {
+      return executorch::extension::from_blob(
+          const_cast<uint8_t*>(get_uint8_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Byte);
+    }
+    ET_LOG(
+        Error,
+        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
+    return ::executorch::runtime::Error::NotSupported;
+  }
+
+ private:
+  // Members
+  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
+  int32_t batch_size_;
+  int32_t n_bins_;
+  int32_t n_frames_;
 };
 
 } // namespace llm
```
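For illustration, a minimal sketch (assumed shape values, not part of the diff) of how the new class is meant to be used with float data, including the `toTensor()` conversion:

```cpp
#include <vector>

#include <executorch/extension/llm/runner/audio.h>

using ::executorch::extension::llm::Audio;

void audio_to_tensor_example() {
  // Illustrative shape only: batch_size=1, n_bins=128, n_frames=3000.
  std::vector<float> mel(1 * 128 * 3000, 0.0f);
  Audio audio(std::move(mel), /*batch_size=*/1, /*n_bins=*/128, /*n_frames=*/3000);

  // The float constructor stores the data in the variant, so is_float() is true.
  if (audio.is_float()) {
    // toTensor() wraps the owned buffer via from_blob; no copy is made, so
    // `audio` must outlive the returned TensorPtr.
    auto tensor = audio.toTensor(/*with_batch=*/false);
    if (tensor.ok()) {
      // (*tensor)->sizes() is {1, 128, 3000} and the dtype is Float.
    }
  }
}
```

Keeping the payload in a `std::variant` preserves the existing uint8_t path while letting Mel-spectrogram floats flow through without an extra copy or reinterpretation.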

extension/llm/runner/multimodal_prefiller.cpp

Lines changed: 19 additions & 10 deletions
```diff
@@ -47,21 +47,24 @@ Result<uint64_t> MultimodalPrefiller::prefill(
       "Failed to get method_meta for %s",
       kVisionEncoderMethod);
 
-  ET_CHECK_MSG(
+  ET_CHECK_OR_RETURN_ERROR(
       method_meta.num_inputs() > 0,
+      InvalidArgument,
       "Image encoder should have at least 1 input");
   auto input_meta = ET_UNWRAP(
       method_meta.input_tensor_meta(0),
       "Cannot get input tensor meta at index 0");
   auto expected_dtype = input_meta.scalar_type();
 
   if (expected_dtype == ::executorch::aten::ScalarType::Float) {
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
        image.is_float(),
+       InvalidArgument,
        "Model expects float image data, but image has uint8_t data.");
   } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
        image.is_uint8(),
+       InvalidArgument,
        "Model expects uint8_t image data, but image has float data.");
   } else {
     ET_LOG(
@@ -77,7 +80,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto image_tensor = ET_UNWRAP(
         image.toTensor(/*with_batch*/ expected_dims.size() == 4),
         "Failed to convert image to tensor");
-
+    ET_LOG(
+        Info,
+        "Image tensor dim: %zu, dtype: %s",
+        image_tensor->dim(),
+        ::executorch::runtime::toString(image_tensor->scalar_type()));
     // Run image encoder
     auto image_encoder_outputs =
         ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
@@ -86,12 +93,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   } else if (input.is_audio()) {
     Audio audio = input.get_audio();
 
-    // Use the original tensor shape as intended
-    auto audio_tensor = executorch::extension::from_blob(
-        audio.data.data(),
-        {audio.batch_size, audio.n_bins, audio.n_frames},
-        ::executorch::aten::ScalarType::Float);
-
+    // Use Audio::toTensor() for tensor creation
+    auto audio_tensor =
+        ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
+    ET_LOG(
+        Info,
+        "Audio tensor dim: %zu, dtype: %s",
+        audio_tensor->dim(),
+        ::executorch::runtime::toString(audio_tensor->scalar_type()));
     // Run audio encoder
     auto audio_encoder_result =
         module_->execute(kAudioEncoderMethod, audio_tensor);
```