Skip to content

Commit e104edf

Browse files
committed
helper
1 parent 2d7c116 commit e104edf

File tree

5 files changed

+71
-2
lines changed

5 files changed

+71
-2
lines changed

examples/models/voxtral/multimodal.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,11 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
104104
ET_LOG(Info, "audio_data len = %zu", n_floats);
105105

106106
std::vector<float> audio_data(n_floats);
107+
ET_LOG(Info, "audio_data size = %zu", audio_data.size());
107108
f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
109+
ET_LOG(Info, "First 5 floats in audio_data is %f, %f, %f, %f, %f",
110+
audio_data[10000], audio_data[10001], audio_data[10002], audio_data[10003], audio_data[10004]);
111+
108112
f.close();
109113

110114
auto audio = ::executorch::extension::llm::Audio(
@@ -331,6 +335,12 @@ int32_t main(int32_t argc, char** argv) {
331335

332336
// Generate
333337
ET_LOG(Info, "Starting generation...");
338+
for (const auto& input : inputs) {
339+
ET_LOG(
340+
Info,
341+
"Input : %s",
342+
input.to_string().c_str());
343+
}
334344
auto error = runner->generate(inputs, config);
335345
if (error != ::executorch::runtime::Error::Ok) {
336346
ET_LOG(Error, "Failed to generate with multimodal runner");

extension/android/jni/jni_layer_llama.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,15 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
222222
.echo = static_cast<bool>(echo),
223223
.seq_len = seq_len,
224224
.temperature = temperature_,
225+
.max_new_tokens = 256,
225226
};
227+
for (const auto& input : inputs) {
228+
ET_LOG(
229+
Error,
230+
"Prefill input: %s",
231+
input.to_string().c_str());
232+
}
233+
226234
multi_modal_runner_->generate(
227235
std::move(inputs),
228236
config,
@@ -342,6 +350,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
342350
for (int i = 0; i < data_size; i++) {
343351
data_f[i] = data_jfloat[i];
344352
}
353+
ET_LOG(Error, "First 5 elements of data_f: %f, %f, %f, %f, %f",
354+
data_f[10000], data_f[10001], data_f[10002], data_f[10003], data_f[
355+
10004]);
345356
llm::Audio audio{std::move(data_f), batch_size, n_bins, n_frames};
346357
prefill_inputs_.emplace_back(llm::MultimodalInput{std::move(audio)});
347358
}
@@ -412,8 +423,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
412423
makeNativeMethod(
413424
"appendAudioInput", ExecuTorchLlmJni::append_audio_input),
414425
makeNativeMethod(
415-
"appendAudioInputFloat",
416-
ExecuTorchLlmJni::append_audio_input_float),
426+
"appendAudioInputFloat", ExecuTorchLlmJni::append_audio_input_float),
417427
makeNativeMethod(
418428
"appendRawAudioInput", ExecuTorchLlmJni::append_raw_audio_input),
419429
makeNativeMethod(

extension/llm/runner/audio.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ struct ET_EXPERIMENTAL RawAudio {
2929
int32_t batch_size;
3030
int32_t n_channels; // For mono, use n_channels = 1.
3131
int32_t n_samples;
32+
33+
std::string to_string() const {
34+
return "RawAudio: " + std::to_string(batch_size) + "x" +
35+
std::to_string(n_channels) + "x" + std::to_string(n_samples);
36+
}
3237
};
3338

3439
/**
@@ -146,6 +151,21 @@ class ET_EXPERIMENTAL Audio final {
146151
return ::executorch::runtime::Error::NotSupported;
147152
}
148153

154+
std::string to_string() const {
155+
std::string result = "Audio: ";
156+
if (is_uint8()) {
157+
result += "uint8_t";
158+
} else if (is_float()) {
159+
result += "float";
160+
} else {
161+
result += "unknown";
162+
}
163+
result += " data, batch_size: " + std::to_string(get_batch_size()) +
164+
", n_bins: " + std::to_string(get_n_bins()) +
165+
", n_frames: " + std::to_string(get_n_frames());
166+
return result;
167+
}
168+
149169
private:
150170
// Members
151171
std::variant<std::vector<uint8_t>, std::vector<float>> data_;

extension/llm/runner/image.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,21 @@ class ET_EXPERIMENTAL Image {
110110
return ::executorch::runtime::Error::NotSupported;
111111
}
112112

113+
std::string to_string() const {
114+
std::string result = "Image: ";
115+
if (is_uint8()) {
116+
result += "uint8_t";
117+
} else if (is_float()) {
118+
result += "float";
119+
} else {
120+
result += "unknown";
121+
}
122+
result += "width: " + std::to_string(width_) + ", ";
123+
result += "height: " + std::to_string(height_) + ", ";
124+
result += "channels: " + std::to_string(channels_);
125+
return result;
126+
}
127+
113128
private:
114129
// Assuming NCHW format
115130
std::variant<std::vector<uint8_t>, std::vector<float>> data_;

extension/llm/runner/multimodal_input.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,20 @@ class ET_EXPERIMENTAL MultimodalInput {
5757
// Destructor
5858
~MultimodalInput() = default;
5959

60+
std::string to_string() const noexcept {
61+
if (is_text()) {
62+
return "Text: \"" + get_text() + "\"";
63+
} else if (is_image()) {
64+
return get_image().to_string();
65+
} else if (is_audio()) {
66+
return get_audio().to_string();
67+
} else if (is_raw_audio()) {
68+
return get_raw_audio().to_string();
69+
} else {
70+
return "Unsupported input type";
71+
}
72+
}
73+
6074
/**
6175
* Check if this input contains text data.
6276
* @return true if this input contains text, false otherwise.

0 commit comments

Comments
 (0)