
Commit e18ce85

Fix stuff and order
1 parent d35f5e9 commit e18ce85

4 files changed (+38 −40 lines)

extension/llm/runner/multimodal_decoder_runner.h (1 addition, 1 deletion)

@@ -48,7 +48,7 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner
         &start_pos, {1}, executorch::aten::ScalarType::Long);
     // run text model
     auto outputs_res = ET_UNWRAP(
-        module_->execute(kTextModelMethod, {start_pos_tensor, embeddings}));
+        module_->execute(kTextModelMethod, {embeddings, start_pos_tensor}));
 
     ET_CHECK_MSG(
         outputs_res.size() == 1,
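
The change above swaps the input order for the exported text model: token embeddings now come first and the cache position tensor second. Below is a minimal, hypothetical call-site sketch under the new convention; the .pte path, tensor shapes, and buffer contents are placeholders, and only the execute() pattern itself is taken from the commit.

// Hypothetical call site (not part of the commit) showing the new input
// order: token embeddings first, cache position tensor second.
#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <cstdint>
#include <vector>

using executorch::extension::Module;
using executorch::extension::from_blob;

int main() {
  Module module("multimodal_model.pte"); // placeholder model file

  // Toy embedding buffer shaped [1, seq_len, hidden_dim] = [1, 1, 4].
  std::vector<float> embedding_data(4, 0.0f);
  auto embeddings = from_blob(
      embedding_data.data(), {1, 1, 4}, executorch::aten::ScalarType::Float);

  // Scalar cache position, as in the numel == 1 branch of the runner.
  int64_t start_pos = 0;
  auto start_pos_tensor =
      from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);

  // New convention after this commit: {embeddings, start_pos_tensor}.
  auto outputs = module.execute(
      executorch::extension::llm::kTextModelMethod,
      {embeddings, start_pos_tensor});
  return outputs.error() == executorch::runtime::Error::Ok ? 0 : 1;
}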

extension/llm/runner/multimodal_prefiller.cpp (4 additions, 25 deletions)

@@ -92,39 +92,18 @@ Result<uint64_t> MultimodalPrefiller::prefill(
 
   // 2. Run decoder model for prefill.
 
-  // Get expected shape of cache position tensor, which should be the first (0th
-  // index) argument
-  auto method_meta = ET_UNWRAP(module_->method_meta(kTextModelMethod));
-  auto first_input_info = ET_UNWRAP(method_meta.input_tensor_meta(0));
-  auto first_input_sizes = first_input_info.sizes();
-  auto numel = first_input_sizes[0];
+
+  // Get expected shape of cache position tensor, which should be the second argument
 
   int64_t seq_len = encoder_output.toTensor().size(1);
   if (seq_len == 0) {
     ET_LOG(Error, "The encoder returned an empty output.");
     return ::executorch::runtime::Error::InvalidState;
   }
+  auto cache_position_tensor = ET_UNWRAP(populate_start_pos_tensor(module_, start_pos, seq_len));
 
-  executorch::extension::TensorPtr cache_position_tensor;
-  if (numel > 1) {
-    // `cache_position` goes from start_pos to start_pos +
-    // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1)
-    // = 5, cache_position_tensor should be [2, 3, 4, 5, 6].
-    std::vector<int64_t> cache_positions(seq_len);
-    for (int64_t i = 0; i < seq_len; ++i) {
-      cache_positions[i] = start_pos + i;
-    }
-    cache_position_tensor = ::executorch::extension::from_blob(
-        cache_positions.data(),
-        {static_cast<int>(seq_len)},
-        executorch::aten::ScalarType::Long);
-  } else {
-    // Cache position is size 1.
-    cache_position_tensor = ::executorch::extension::from_blob(
-        &start_pos, {1}, executorch::aten::ScalarType::Long);
-  }
   auto prefill_result = module_->execute(
-      kTextModelMethod, {cache_position_tensor, encoder_output});
+      kTextModelMethod, {encoder_output, cache_position_tensor});
   if (prefill_result.error() != ::executorch::runtime::Error::Ok) {
     return prefill_result.error();
   }
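
The comment carried over into the new helper spells out the prefill indexing: positions run from start_pos through start_pos + seq_len − 1. A standalone arithmetic check of that exact example (start_pos = 2, encoder sequence length 5) is below; it is plain C++ with no ExecuTorch dependency.

// Standalone sketch of the cache-position arithmetic the prefill path
// relies on, using the values from the comment in the diff.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  int64_t start_pos = 2;
  int64_t seq_len = 5; // encoder_output.toTensor().size(1)

  std::vector<int64_t> cache_positions(seq_len);
  for (int64_t i = 0; i < seq_len; ++i) {
    cache_positions[i] = start_pos + i;
  }

  for (int64_t p : cache_positions) {
    printf("%lld ", static_cast<long long>(p)); // prints: 2 3 4 5 6
  }
  printf("\n");
  return 0;
}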

extension/llm/runner/text_decoder_runner.cpp (1 addition, 14 deletions)

@@ -53,20 +53,7 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   auto numel = sizes[0];
   std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
 
-  TensorPtr start_pos_tensor;
-  if (numel > 1) {
-    // If we are here, model is exported with cache_positions, create a tensor
-    // with the same length as input_ids. Assuming the last dimension is the
-    // one with the variable token length, for example [1, S] or [1, 1, S]
-    sizes_vec[sizes_vec.size() - 1] = tokens->numel();
-    start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
-    torch::executor::native::arange_out_impl(
-        start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
-  } else {
-    // Assuming model is exported with input_pos, create a tensor with size 1
-    start_pos_tensor = from_blob(
-        &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
-  }
+  auto start_pos_tensor = ET_UNWRAP(populate_start_pos_tensor(module_, start_pos, tokens->numel()));
 
   std::vector<runtime::EValue> inputs;
   auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);

extension/llm/runner/util.h (32 additions, 0 deletions)

@@ -7,6 +7,9 @@
  */
 
 #pragma once
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/platform/compiler.h>
 #include <stdio.h>
 #include <time.h>
@@ -99,6 +102,35 @@ ET_EXPERIMENTAL size_t inline get_rss_bytes() {
   // when this changed.
   return 0;
 }
+
+inline runtime::Result<TensorPtr> populate_start_pos_tensor(Module* module, int64_t& start_pos, int seq_len) {
+  // Get expected shape of cache position tensor, which should be the second argument
+  auto method_meta = ET_UNWRAP(module->method_meta(kTextModelMethod));
+  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+  auto second_input_sizes = second_input_info.sizes();
+  auto numel = second_input_sizes[0];
+
+  TensorPtr start_pos_tensor;
+  std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
+  if (numel > 1) {
+    // `cache_position` goes from start_pos to start_pos +
+    // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1)
+    // = 5, cache_position_tensor should be [2, 3, 4, 5, 6].
+    std::vector<int64_t> cache_positions(seq_len);
+    for (int64_t i = 0; i < seq_len; ++i) {
+      cache_positions[i] = start_pos + i;
+    }
+    return ::executorch::extension::from_blob(
+        cache_positions.data(),
+        {static_cast<int>(seq_len)},
+        executorch::aten::ScalarType::Long);
+  } else {
+    // Cache position is size 1.
+    return ::executorch::extension::from_blob(
+        &start_pos, {1}, executorch::aten::ScalarType::Long);
+  }
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
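
The new helper consolidates the two export conventions behind one probe of the method metadata: if the second input's leading dimension is greater than 1, the model was exported with a `cache_positions` vector; otherwise it takes a single scalar `input_pos`. (The locals `start_pos_tensor` and `sizes_vec` are left unused by the early returns.) One lifetime caveat in the `numel > 1` branch: `from_blob` borrows memory rather than owning it, and `cache_positions` is local to the helper, so the returned TensorPtr can outlive its storage. Below is a hedged sketch of an owning variant, assuming the `make_tensor_ptr` overload in extension/tensor that takes sizes plus a data vector by value; this is a suggested adaptation, not code from the commit.

// Hypothetical owning variant (not from the commit): move the positions
// vector into the tensor so the buffer survives after the function returns.
inline runtime::Result<TensorPtr> populate_start_pos_tensor_owning(
    Module* module, int64_t& start_pos, int seq_len) {
  // Probe the second input of the text model method, as the helper does.
  auto method_meta = ET_UNWRAP(module->method_meta(kTextModelMethod));
  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
  auto numel = second_input_info.sizes()[0];

  if (numel > 1) {
    // cache_positions = [start_pos, ..., start_pos + seq_len - 1].
    std::vector<int64_t> cache_positions(seq_len);
    for (int64_t i = 0; i < seq_len; ++i) {
      cache_positions[i] = start_pos + i;
    }
    // make_tensor_ptr takes ownership of the vector (dtype deduced as Long),
    // so no dangling pointer is returned.
    return ::executorch::extension::make_tensor_ptr(
        {static_cast<executorch::aten::SizesType>(seq_len)},
        std::move(cache_positions));
  }
  // Scalar input_pos: borrowing the caller's start_pos is fine here, since
  // it must stay alive across the subsequent execute() call anyway.
  return ::executorch::extension::from_blob(
      &start_pos, {1}, executorch::aten::ScalarType::Long);
}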
