diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index f2650301a38..83a94fdfdf5 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -154,8 +154,9 @@ Error QnnManager::RegisterMem( const std::shared_ptr& tensor_wrapper) { SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); // Not enable shared buffer - if (!options_->shared_buffer()) + if (!options_->shared_buffer()) { return Error::Internal; + } if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) { QNN_EXECUTORCH_LOG_WARN( diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 0157ee58378..17294afbd88 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -145,7 +145,7 @@ class QnnManager { {Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_8, executorch::aten::ScalarType::Byte}, {Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_16, - executorch::aten::ScalarType::Bits16}, + executorch::aten::ScalarType::UInt16}, }; }; } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h index 664f717dc09..a0bdafab7b5 100644 --- a/backends/qualcomm/runtime/backends/QnnMemManager.h +++ b/backends/qualcomm/runtime/backends/QnnMemManager.h @@ -77,7 +77,7 @@ class QnnMemManager { Qnn_DataType_t::QNN_DATATYPE_SFIXED_POINT_16}, {executorch::aten::ScalarType::Byte, Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_8}, - {executorch::aten::ScalarType::Bits16, + {executorch::aten::ScalarType::UInt16, Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_16}, }; }; diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index c92711d9eb8..d7d355ee4dd 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -28,8 +28,8 @@ list( ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h - ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h + ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.h ) list( diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 0af0f55b88f..e80e0c2808a 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -72,12 +72,42 @@ logging.getLogger().setLevel(logging.INFO) +def smart_mask_updator(atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches): + for i, k_cache in enumerate(k_caches): + k_cache[:, :, pos] = new_k_caches[i][:, :, 0] + + for i, v_cache in enumerate(v_caches): + v_cache[:, pos, :] = new_v_caches[i] + + atten_mask[0][pos] = 0 + pos += 1 + return (atten_mask, pos, k_caches, v_caches) + + +def shift_pointer_updator( + atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches +): + k_caches = [ + torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1) + for i, k_cache in enumerate(k_caches) + ] + v_caches = [ + torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1) + for i, v_cache in enumerate(v_caches) + ] + + pos += 1 + atten_mask[0][-pos - 1] = 0 + return (atten_mask, pos, k_caches, v_caches) + + def _kv_calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, tokenizer, max_seq_len=512, + updator=smart_mask_updator, ): _, atten_mask, _, 
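For reviewers unfamiliar with the two strategies introduced above, here is a minimal toy sketch of how `smart_mask_updator` and `shift_pointer_updator` behave during calibration. It assumes the two functions defined in llama.py above are in scope; the shapes, batch size, and mask fill value are illustrative assumptions, not values taken from the model.

```python
import torch

# Hypothetical toy shapes: 1 layer, head_dim=4, cache_len=8
# (assumed layouts: k_cache [1, head_dim, cache_len], v_cache [1, cache_len, head_dim]).
head_dim, cache_len = 4, 8
k_caches = [torch.zeros(1, head_dim, cache_len)]
v_caches = [torch.zeros(1, cache_len, head_dim)]
atten_mask = torch.full((1, cache_len), -255.0)  # fill value is illustrative
new_k = [torch.ones(1, head_dim, 1)]
new_v = [torch.ones(1, 1, head_dim)]
pos = 3

# Smart mask: buffers stay in place; slot `pos` is overwritten and unmasked.
m, p, k, v = smart_mask_updator(
    atten_mask.clone(), pos, [t.clone() for t in k_caches],
    [t.clone() for t in v_caches], new_k, new_v)
assert torch.all(k[0][:, :, pos] == 1) and m[0][pos] == 0

# Shift pointer: the oldest entry is dropped, the new one is appended at the end,
# and the mask is opened up from the back.
m, p, k, v = shift_pointer_updator(
    atten_mask.clone(), pos, [t.clone() for t in k_caches],
    [t.clone() for t in v_caches], new_k, new_v)
assert torch.all(k[0][:, :, -1] == 1) and m[0][-pos - 2] == 0
```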
k_caches, v_caches = example_inputs @@ -105,17 +135,9 @@ def _kv_calibrate( *k_caches, *v_caches, ) - k_caches = [ - torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1) - for i, k_cache in enumerate(k_caches) - ] - v_caches = [ - torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1) - for i, v_cache in enumerate(v_caches) - ] - - pos += 1 - atten_mask[0][-pos - 1] = 0 + atten_mask, pos, k_caches, v_caches = updator( + atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches + ) if pos >= len(token_list): token_list.append(torch.argmax(logits[:, -1], dim=-1).item()) @@ -174,6 +196,7 @@ def calibrate( module: torch.fx.GraphModule, tokenizer, max_seq_len=512, + kv_updator=smart_mask_updator, ): if len(example_inputs) == 2: _prefill_calibrate( @@ -190,6 +213,7 @@ def calibrate( module, tokenizer, max_seq_len, + updator=kv_updator, ) else: raise RuntimeError("Get wrong inputs") @@ -319,6 +343,7 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): self.llama_model, self.inputs, strict=True ).module() fx_graph_module = prepare_pt2e(fx_graph_module, quantizer) + logging.info("Quantizing the model...") calibrate( self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), @@ -326,6 +351,7 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): fx_graph_module, tokenizer=tokenizer, max_seq_len=self.llama_meta["get_max_seq_len"], + kv_updator=args.kv_updator, ) self.llama_model = convert_pt2e(fx_graph_module) @@ -337,6 +363,7 @@ def lowering_modules( use_fp16=False, soc_model=QcomChipset.SM8650, num_sharding=0, + shared_buffer=False, ): executorch_config = ExecutorchBackendConfig( # For shared buffer, user must pass the memory address @@ -357,7 +384,7 @@ def lowering_modules( compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=soc_model, backend_options=backend_options, - shared_buffer=False, + shared_buffer=shared_buffer, ) skip_node_op_set = {"llama.fallback.default"} partitioner = QnnPartitioner( @@ -530,6 +557,7 @@ def compile(args, pte_filename, tokenizer): use_fp16=use_fp16, soc_model=get_soc_to_chipset_map()[args.model], num_sharding=args.num_sharding, + shared_buffer=args.shared_buffer, ) quant_attrs = llama_instance_list[0].get_quant_attrs() else: @@ -564,7 +592,7 @@ def compile(args, pte_filename, tokenizer): generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.model], backend_options=backend_options, - shared_buffer=True, + shared_buffer=args.shared_buffer, multiple_graphs=True, graph_name=graph_name, ) @@ -736,6 +764,7 @@ def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_p f"--system_prompt '{args.system_prompt}'", f"--logits_scale {quant_attrs['scale']}", f"--logits_offset {quant_attrs['zero_point']}", + f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}", ] ) runner_cmd = " ".join( @@ -907,6 +936,14 @@ def main(): type=int, ) + parser.add_argument( + "--kv_updator", + help="Choose how to update kv cache during runtime", + choices=["smart_mask", "shift_pointer"], + default="smart_mask", + type=str, + ) + args = parser.parse_args() if args.compile_only and args.pre_gen_pte: exit("Cannot set both compile_only and pre_gen_pte as true") @@ -941,6 +978,14 @@ def main(): else: raise RuntimeError(f"Unknown llama_model: {args.llama_model}.") + if args.kv_updator == "smart_mask": + args.shared_buffer = True + args.kv_updator = smart_mask_updator + elif args.kv_updator == "shift_pointer": + args.kv_updator = shift_pointer_updator + 
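A compact way to read the new argument plumbing above, as a hypothetical helper that is not part of the patch: the `--kv_updator` string maps to a calibration updator, `smart_mask` forces shared buffers on, and the runner later receives the matching `SmartMask`/`ShiftPointer` flag. The function and variable names below are illustrative.

```python
def resolve_kv_updator(name: str, user_shared_buffer: bool):
    """Illustrative only; mirrors the argparse handling in main() above."""
    if name == "smart_mask":
        # Smart mask addresses the KV cache through QNN shared buffers,
        # so shared_buffer is forced on regardless of the user's flag.
        return smart_mask_updator, True
    if name == "shift_pointer":
        return shift_pointer_updator, user_shared_buffer
    raise ValueError(f"Unknown kv updator: {name}")

updator, shared_buffer = resolve_kv_updator("smart_mask", user_shared_buffer=False)
runner_flag = "SmartMask" if updator is smart_mask_updator else "ShiftPointer"
```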
else: + exit(f"Using an unkown kv update {args.kv_updator}") + if args.pre_gen_pte: quant_attrs = json.load( open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt") diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp index 7660952ef0c..1bc90a11f9d 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp @@ -51,6 +51,10 @@ DEFINE_int32( "0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)"); DEFINE_double(logits_scale, 0.0, "Logits scale"); DEFINE_int32(logits_offset, 0, "Logits offset"); +DEFINE_string( + kv_updator, + "How to update kv cache. Choose between SmartMask and ShiftPointer", + "SmartMask"); int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -62,7 +66,8 @@ int main(int argc, char** argv) { FLAGS_logits_scale, FLAGS_logits_offset, FLAGS_temperature, - FLAGS_eval_mode); + FLAGS_eval_mode, + FLAGS_kv_updator); std::vector buf; buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char std::ofstream fout(FLAGS_output_path.c_str()); diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp new file mode 100644 index 00000000000..b2fcef91491 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp @@ -0,0 +1,1080 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::Module; +using executorch::runtime::Error; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { + +IoMgrBase::IoMgrBase(std::vector>& modules) + : data_ptr_(nullptr, [](void*) {}), modules_(modules) {} + +IoMgrBase::~IoMgrBase() {} + +void* IoMgrBase::get_mutable_ptr() { + return data_ptr_.get(); +} + +std::vector IoMgrBase::get_input_tensors( + int shard_index, + const std::string& method_name) { + std::vector ret; + ret.reserve(input_tensors_.size()); + for (TensorImpl* impl : input_tensors_[method_name][shard_index]) { + ret.emplace_back(Tensor(impl)); + } + return ret; +} + +std::vector IoMgrBase::get_output_tensors( + int shard_index, + const std::string& method_name) { + std::vector ret; + ret.reserve(output_tensors_[method_name][shard_index].size()); + for (TensorImpl* impl : output_tensors_[method_name][shard_index]) { + ret.emplace_back(Tensor(impl)); + } + return ret; +} + +ShiftPointerIoMgr::ShiftPointerIoMgr( + std::vector>& modules, + int32_t prefill_cache_len, + int32_t kv_cache_len, + int32_t vocab_size, + int32_t num_layers, + int32_t head_dim, + int32_t num_heads, + EvalMode eval_mode, + const std::string& prefill_forward_name, + const std::string& kv_forward_name) + : IoMgrBase(modules), + shard_layers_({num_layers}), + kv_cache_len_(kv_cache_len), + prefill_cache_len_(prefill_cache_len), + vocab_size_(vocab_size), + num_layers_(num_layers), + head_dim_(head_dim), + num_heads_(num_heads), + eval_mode_(eval_mode), + prefill_forward_name_(prefill_forward_name), + kv_forward_name_(kv_forward_name) { + if (!prefill_forward_name_.empty()) { + 
input_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + output_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + k_cache_in_[prefill_forward_name_] = + std::vector>(); + v_cache_in_[prefill_forward_name_] = + std::vector>(); + k_cache_out_[prefill_forward_name_] = + std::vector>(); + v_cache_out_[prefill_forward_name_] = + std::vector>(); + } + if (!kv_forward_name_.empty()) { + input_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + output_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + k_cache_in_[kv_forward_name_] = + std::vector>(); + v_cache_in_[kv_forward_name_] = + std::vector>(); + k_cache_out_[kv_forward_name_] = + std::vector>(); + v_cache_out_[kv_forward_name_] = + std::vector>(); + } + + data_ptr_ = std::unique_ptr( + new IO, [](void* ptr) { delete static_cast(ptr); }); +} + +void ShiftPointerIoMgr::init_io() { + IO* ptr = static_cast(data_ptr_.get()); + std::memset(ptr, 0, sizeof(IO)); + + int32_t max_cache_len = std::max(kv_cache_len_, prefill_cache_len_); + int32_t k_in_size = (head_dim_ + 1) * max_cache_len; + int32_t v_cache_size = (num_heads_ + 1) * max_cache_len * head_dim_; + int32_t k_cache_out_size = num_heads_ * head_dim_; + if (eval_mode_ == EvalMode::kHybrid || eval_mode_ == EvalMode::kPrefill) { + k_cache_out_size *= prefill_cache_len_; + } + + // Init kv vector shape, general enough to be shared across all 3 modes. + ptr->k_cache_out.reserve(num_layers_); + ptr->v_cache.reserve(num_layers_); + for (int layer = 0; layer < num_layers_; layer++) { + ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); + ptr->v_cache.emplace_back(std::vector(v_cache_size)); + } + + auto init_prefill = [&]() { + ptr->prefill_input_toks.resize(prefill_cache_len_); + ptr->prefill_atten_mask.resize(prefill_cache_len_ * prefill_cache_len_); + ptr->prefill_logits.resize(prefill_cache_len_ * vocab_size_); + }; + + auto init_kv = [&]() { + ptr->kv_logits.resize(vocab_size_); + ptr->kv_attention_mask.resize((kv_cache_len_ + 1), 0); + ptr->k_cache.reserve(num_layers_); + for (int layer = 0; layer < num_layers_; layer++) { + ptr->k_cache.emplace_back(); + ptr->k_cache[layer].reserve(num_heads_); + for (int head = 0; head < num_heads_; head++) { + ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); + } + } + }; + + switch (eval_mode_) { + case EvalMode::kPrefill: + init_prefill(); + break; + case EvalMode::kKVCached: + init_kv(); + break; + case EvalMode::kHybrid: + init_prefill(); + init_kv(); + break; + default: + break; + } +} + +void ShiftPointerIoMgr::prepare_kv_io( + const std::vector>& methods_meta) { + for (int i = 0; i < modules_.size(); ++i) { + ET_CHECK_MSG( + methods_meta[i].ok(), + "Failed to get method_meta 0x%x", + static_cast(methods_meta[i].error())); + } + + ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); + IO* ptr = static_cast(data_ptr_.get()); + + // [I]: input_tokens + Result input_tok = methods_meta[0]->input_tensor_meta(0); + input_tok_ = std::make_unique( + input_tok->scalar_type(), + input_tok->sizes().size(), + const_cast(input_tok->sizes().data()), + &ptr->input_tok, + const_cast(input_tok->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); + + // [I]: atten_mask + Result atten_mask = methods_meta[0]->input_tensor_meta(1); + attention_mask_ = std::make_unique( + atten_mask->scalar_type(), + atten_mask->sizes().size(), + const_cast(atten_mask->sizes().data()), + ptr->kv_attention_mask.data(), + 
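The buffer sizes computed in ShiftPointerIoMgr::init_io() above can be summarized with a small calculation. The hyperparameter values below are made-up examples, and the comments on the extra "+1" slack are my reading of the code, not the author's statement.

```python
# Hypothetical llama-like shapes; only the formulas mirror init_io() above.
num_layers, num_heads, head_dim = 28, 8, 128
kv_cache_len, prefill_cache_len = 512, 32
max_cache_len = max(kv_cache_len, prefill_cache_len)

# Per layer/head K input: head_dim rows of max_cache_len plus one row of slack,
# which appears to let the data pointer shift forward instead of copying the cache.
k_in_size = (head_dim + 1) * max_cache_len
# Per layer V buffer shared by all heads, again with one spare head-sized window.
v_cache_size = (num_heads + 1) * max_cache_len * head_dim
# One decode step of K output per layer; prefill/hybrid emits a whole sequence at once.
k_cache_out_size = num_heads * head_dim
k_cache_out_size_prefill = k_cache_out_size * prefill_cache_len
```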
const_cast(atten_mask->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); + + // [I]: input_pos + Result input_pos = methods_meta[0]->input_tensor_meta(2); + input_pos_ = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + &ptr->input_pos, + const_cast(input_pos->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); + + // [I] kv_cache + int index = 3; // bypass input_tokens, input_pos, atten_mask + for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; + shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->input_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_in_[kv_forward_name_] + : v_cache_in_[kv_forward_name_]); + void* cache_ptr = (cache_group == 0) + ? static_cast(ptr->k_cache[layer + offset][head].data()) + : static_cast( + ptr->v_cache[layer + offset].data() + head * v_stride); + + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } + + // [O]: logits + int logit_index = 0; + Result logits = + methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); + kv_logits_ = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + ptr->kv_logits.data(), + const_cast(logits->dim_order().data())); + output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( + kv_logits_.get()); + + // [O] kv_cache + index = 1; + // Iterate through all kv cache outputs. + // For k, we store it in k_cache_out and update to k_cache later. + // For v, we append the output to the end of v_cache, + // which serves as both input and output. + for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; + shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->output_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_out_[kv_forward_name_] + : v_cache_out_[kv_forward_name_]); + void* cache_ptr = (cache_group == 0) + ? 
static_cast( + ptr->k_cache_out[layer + offset].data() + + (head * head_dim_)) + : static_cast( + ptr->v_cache[layer + offset].data() + + (head + 1) * v_stride); + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + output_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } +} + +void ShiftPointerIoMgr::prepare_prefill_io( + const std::vector>& methods_meta) { + for (int i = 0; i < modules_.size(); ++i) { + ET_CHECK_MSG( + methods_meta[i].ok(), + "Failed to get method_meta 0x%x", + static_cast(methods_meta[i].error())); + } + + ET_CHECK_MSG( + !(prefill_forward_name_.empty()), "prefill forward name is empty"); + + IO* ptr = static_cast(data_ptr_.get()); + + // [I]: pre_input_tokens + Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); + prefill_input_toks_ = std::make_unique( + prefill_input_toks->scalar_type(), + prefill_input_toks->sizes().size(), + const_cast(prefill_input_toks->sizes().data()), + ptr->prefill_input_toks.data(), + const_cast( + prefill_input_toks->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); + // [I]: prefill_attn_mask + for (int i = 0; i < prefill_cache_len_; ++i) { + for (int j = 0; j < prefill_cache_len_; ++j) { + if (i < j) { + ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 0; + } else { + ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 65535; + } + } + } + Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); + prefill_attn_mask_ = std::make_unique( + prefill_atten_mask->scalar_type(), + prefill_atten_mask->sizes().size(), + const_cast(prefill_atten_mask->sizes().data()), + ptr->prefill_atten_mask.data(), + const_cast( + prefill_atten_mask->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); + // [O]: logits + int logit_index = 0; + Result logits = + methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); + prefill_logits_ = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + ptr->prefill_logits.data(), + const_cast(logits->dim_order().data())); + output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( + prefill_logits_.get()); + + // [O] kv_cache + int index = 1; + // prefill_k_stride should be equal to prefill_v_stride in prefill mode. + // In hybrid mode, we use kv mode cache len for v stride since we want to + // update prefill's result onto kv modes input. + int32_t prefill_k_stride = prefill_cache_len_ * head_dim_; + int32_t prefill_v_stride = + std::max(prefill_cache_len_, kv_cache_len_) * head_dim_; + + if (eval_mode_ == EvalMode::kPrefill) { + ET_CHECK_MSG( + prefill_k_stride == prefill_v_stride, + "prefill_k_stride should be equal to prefill_v_stride"); + } + for (int offset = 0, shard_index = 0; shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->output_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_out_[prefill_forward_name_] + : v_cache_out_[prefill_forward_name_]); + void* cache_ptr = (cache_group == 0) + ? 
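The pointer arithmetic in prepare_kv_io() above interleaves each head's V input window and its output slot in one flat per-layer buffer. The runnable check below is my reading of those offsets; the sizes are illustrative and `v_stride` matches the kv-mode stride used above.

```python
head_dim, kv_cache_len = 4, 8
v_stride = kv_cache_len * head_dim  # elements per head window

def v_offsets(head: int, step: int):
    in_off = head * v_stride + step * head_dim         # input window start (shifted per token)
    out_off = (head + 1) * v_stride + step * head_dim  # where the new V vector is written
    return in_off, out_off

in_off, out_off = v_offsets(head=0, step=3)
# The output lands exactly one full window past the input start, i.e. right after the
# newest attended entry, so shifting both pointers by head_dim absorbs it next step.
assert out_off == in_off + kv_cache_len * head_dim
```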
static_cast( + ptr->k_cache_out[layer + offset].data() + + head * prefill_k_stride) + : static_cast( + ptr->v_cache[layer + offset].data() + + (head + 1) * prefill_v_stride); + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + output_tensors_[prefill_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } +} + +void ShiftPointerIoMgr::update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + ET_CHECK_MSG(kv_cache_len_ != 0, "k_cache_len_ should not equal to 0"); + ET_CHECK_MSG( + prefill_cache_len_ != 0, "prefill_cache_len_ should not equal to 0"); + IO* ptr = static_cast(data_ptr_.get()); + + ptr->input_tok = static_cast(cur_token); + ptr->input_pos = static_cast(pos); + // If prompt len is 30, prefill will handle to pos = 30. + // At this point, pos should be 31. + for (int i = 0; i < pos + 1; i++) { + ptr->kv_attention_mask[kv_cache_len_ - i] = 65535; + } + + // update v_cache + std::vector>& v_cache_in = + v_cache_in_[kv_forward_name_]; + std::vector>& v_cache_out = + v_cache_out_[kv_forward_name_]; + for (int i = 0, v_cache_stride = head_dim_ * pos; i < v_cache_in.size(); + i++) { + v_cache_in[i]->set_data( + v_cache_in[i]->mutable_data() + v_cache_stride); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + v_cache_stride); + } + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + kv_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "Failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + std::vector>& k_cache_in = + k_cache_in_[kv_forward_name_]; + std::vector>& k_cache_out = + k_cache_out_[prefill_forward_name_]; + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; + ++j, offset += kv_cache_len_) { + for (int k = 0, k_stride = j * prefill_cache_len_; k < pos; k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + k_cache_in[i]->set_data(ptr_in + pos); + } +} + +void ShiftPointerIoMgr::update_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + IO* ptr = static_cast(data_ptr_.get()); + // update input_tok + ptr->input_tok = static_cast(cur_token); + // update position_ids + ptr->input_pos = static_cast(pos); + // update causal mask for next token + ptr->kv_attention_mask[kv_cache_len_ - pos] = 65535; + + // update v_cache + auto& v_cache_in = v_cache_in_[kv_forward_name_]; + auto& v_cache_out = v_cache_out_[kv_forward_name_]; + for (int i = 0; i < v_cache_in.size(); i++) { + v_cache_in[i]->set_data(v_cache_in[i]->mutable_data() + head_dim_); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + head_dim_); + } + + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + kv_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + auto& k_cache_in = k_cache_in_[kv_forward_name_]; + 
auto& k_cache_out = k_cache_out_[kv_forward_name_]; + // update k_cache by single thread, this part is cpu cache sensitive + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; + ++j, offset += kv_cache_len_) { + ptr_in[offset] = ptr_out[j]; + } + k_cache_in[i]->set_data(ptr_in + 1); + } +} + +void ShiftPointerIoMgr::update_prefill_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + (void)output_tensors; + IO* ptr = static_cast(data_ptr_.get()); + ptr->prefill_input_toks[pos] = static_cast(cur_token); +} + +void ShiftPointerIoMgr::fill_prefill_toks( + std::vector& prompt_tokens) { + IO* ptr = static_cast(get_mutable_ptr()); + for (int i = 0; i < prompt_tokens.size(); i++) { + ptr->prefill_input_toks[i] = static_cast(prompt_tokens[i]); + } +} + +void ShiftPointerIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { + IO* ptr = static_cast(get_mutable_ptr()); + ptr->input_tok = static_cast(cur_token); + ptr->kv_attention_mask[kv_cache_len_] = 65535; +} + +SmartMaskIoMgr::SmartMaskIoMgr( + std::vector>& modules, + int32_t prefill_cache_len, + int32_t kv_cache_len, + int32_t vocab_size, + int32_t num_layers, + int32_t head_dim, + int32_t num_heads, + EvalMode eval_mode, + const std::string& prefill_forward_name, + const std::string& kv_forward_name) + : IoMgrBase(modules), + shard_layers_({num_layers}), + prefill_cache_len_(prefill_cache_len), + kv_cache_len_(kv_cache_len), + vocab_size_(vocab_size), + num_layers_(num_layers), + head_dim_(head_dim), + num_heads_(num_heads), + eval_mode_(eval_mode), + prefill_forward_name_(prefill_forward_name), + kv_forward_name_(kv_forward_name) { + if (!prefill_forward_name_.empty()) { + input_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + output_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + k_cache_out_[prefill_forward_name_] = + std::vector>(); + v_cache_out_[prefill_forward_name_] = + std::vector>(); + } + if (!kv_forward_name_.empty()) { + input_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + output_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + k_cache_in_[kv_forward_name_] = + std::vector>(); + v_cache_in_[kv_forward_name_] = + std::vector>(); + k_cache_out_[kv_forward_name_] = + std::vector>(); + v_cache_out_[kv_forward_name_] = + std::vector>(); + } + + data_ptr_ = std::unique_ptr( + new IO, [](void* ptr) { delete static_cast(ptr); }); +} + +std::unordered_map SmartMaskIoMgr::get_io_elements() { + size_t cache_len = std::max(kv_cache_len_, prefill_cache_len_); + size_t cache_in_ele = num_layers_ * num_heads_ * head_dim_ * cache_len; + size_t cache_out_ele = num_layers_ * num_heads_ * head_dim_; + return std::unordered_map{ + {"input_tok_ele", 1}, + {"input_pos_ele", 1}, + {"cache_in_ele", cache_in_ele}, + {"cache_out_ele", cache_out_ele}, + // 1 for the input prompt + {"atten_mask_ele", cache_len + 1}, + {"kv_logits_ele", vocab_size_}, + {"prefill_input_toks_ele", prefill_cache_len_}, + {"prefill_atten_mask_ele", prefill_cache_len_ * prefill_cache_len_}, + {"prefill_logits_ele", prefill_cache_len_ * vocab_size_}}; +} + +std::unordered_map SmartMaskIoMgr::get_io_bytes() { + std::unordered_map element_map = get_io_elements(); + auto align = [](size_t byte) { + size_t alignment = MemoryAllocator::kDefaultAlignment; + return byte % alignment == 0 ? 
byte + : byte + + (static_cast(alignment) - + byte % static_cast(alignment)); + }; + return std::unordered_map{ + {"input_tok_bytes", + align(element_map["input_tok_ele"] * sizeof(int32_t))}, + {"input_pos_bytes", + align(element_map["input_pos_ele"] * sizeof(int32_t))}, + {"cache_in_bytes", align(element_map["cache_in_ele"] * sizeof(uint8_t))}, + {"cache_out_bytes", + align(element_map["cache_out_ele"] * sizeof(uint8_t))}, + {"atten_mask_bytes", + align(element_map["atten_mask_ele"] * sizeof(uint16_t))}, + {"kv_logits_bytes", + align(element_map["kv_logits_ele"] * sizeof(uint16_t))}, + {"prefill_input_toks_bytes", + align(element_map["prefill_input_toks_ele"] * sizeof(int32_t))}, + {"prefill_atten_mask_bytes", + align(element_map["prefill_atten_mask_ele"] * sizeof(uint16_t))}, + {"prefill_logits_bytes", + align(element_map["prefill_logits_ele"] * sizeof(uint16_t))}}; +} + +void SmartMaskIoMgr::IO::init_io_ptrs( + void* shared_buffer_ptr, + std::unordered_map& io_bytes_map) { + shared_buffer_base = shared_buffer_ptr; + std::byte* cur_ptr = reinterpret_cast(shared_buffer_base); + std::size_t cur_pos = 0; + size_t layered_head_count = num_layers_ * num_heads_; + + // Iterate map so that we don't need to care about which mode is used. + for (const auto& iter : io_bytes_map) { + std::string key = iter.first; + size_t size = iter.second; + if (key == "input_tok_bytes") { + input_tok = reinterpret_cast(cur_ptr); + } else if (key == "input_pos_bytes") { + input_pos = reinterpret_cast(cur_ptr); + } else if (key == "cache_in_bytes" || key == "cache_out_bytes") { + auto& k_cache_ref = (key == "cache_in_bytes") ? k_cache : k_cache_out; + auto& v_cache_ref = (key == "cache_in_bytes") ? v_cache : v_cache_out; + size_t single_head_size = size / layered_head_count; + k_cache_ref.reserve(num_layers_); + v_cache_ref.reserve(num_layers_); + for (int i = 0; i < num_layers_; ++i) { + k_cache_ref[i].reserve(num_heads_); + v_cache_ref[i].reserve(num_heads_); + for (int j = 0; j < num_heads_; ++j) { + k_cache_ref[i][j] = reinterpret_cast(cur_ptr); + io_pos_map[cur_ptr] = cur_pos; + cur_ptr += single_head_size; + cur_pos += single_head_size; + v_cache_ref[i][j] = reinterpret_cast(cur_ptr); + io_pos_map[cur_ptr] = cur_pos; + cur_ptr += single_head_size; + cur_pos += single_head_size; + } + } + continue; + } else if (key == "atten_mask_bytes") { + kv_attention_mask = reinterpret_cast(cur_ptr); + } else if (key == "kv_logits_bytes") { + kv_logits = reinterpret_cast(cur_ptr); + } else if (key == "prefill_input_toks_bytes") { + prefill_input_toks = reinterpret_cast(cur_ptr); + } else if (key == "prefill_atten_mask_bytes") { + prefill_atten_mask = reinterpret_cast(cur_ptr); + } else if (key == "prefill_logits_bytes") { + prefill_logits = reinterpret_cast(cur_ptr); + } else { + ET_LOG(Error, "Unknown pointer type: %s", key.c_str()); + } + + io_pos_map[cur_ptr] = cur_pos; + cur_ptr += size; + cur_pos += size; + } +} + +void SmartMaskIoMgr::IO::add_custom_mem_info( + void* ptr, + size_t nbytes, + executorch::aten::ScalarType scalar_type, + executorch::runtime::TensorInfo& tensor_info) { + if (auto it = io_pos_map.find(static_cast(ptr)); + it == io_pos_map.end()) { + ET_LOG(Error, "Shared buffer pointer %p is not found %p", ptr); + } + size_t pos = io_pos_map[static_cast(ptr)]; + uint32_t rank = tensor_info.sizes().size(); + uint32_t shape[rank]; + CustomMemTensorInfo info = { + shared_buffer_base, ptr, pos, nbytes, shape, rank, scalar_type}; + QnnExecuTorchAddCustomMemTensorInfo(info); +} + +void SmartMaskIoMgr::init_io() 
{ + std::unordered_map io_bytes_map = get_io_bytes(); + + switch (eval_mode_) { + case EvalMode::kPrefill: + io_bytes_map.erase("input_tok_bytes"); + io_bytes_map.erase("input_pos_bytes"); + io_bytes_map.erase("atten_mask_bytes"); + io_bytes_map.erase("kv_logits_bytes"); + break; + case EvalMode::kKVCached: + io_bytes_map.erase("prefill_input_toks_bytes"); + io_bytes_map.erase("prefill_atten_mask_bytes"); + io_bytes_map.erase("prefill_logits_bytes"); + break; + case EvalMode::kHybrid: + break; + default: + break; + } + + size_t total_bytes = 0; + for (const auto& iter : io_bytes_map) { + size_t size = iter.second; + if (iter.first == "cache_in_bytes" || iter.first == "cache_out_bytes") { + size = iter.second * 2; + } + total_bytes += size; + } + void* shared_ptr = QnnExecuTorchAllocCustomMem( + total_bytes, MemoryAllocator::kDefaultAlignment); + + ET_CHECK_MSG( + shared_ptr, + "Allocate Rpc mem falied, bytes=%zu, alignment=%zu", + total_bytes, + MemoryAllocator::kDefaultAlignment); + IO* ptr = static_cast(data_ptr_.get()); + ptr->num_heads_ = num_heads_; + ptr->num_layers_ = num_layers_; + ptr->head_dim_ = head_dim_; + ptr->init_io_ptrs(shared_ptr, io_bytes_map); +} + +void SmartMaskIoMgr::prepare_kv_io( + const std::vector>& methods_meta) { + for (int i = 0; i < modules_.size(); ++i) { + ET_CHECK_MSG( + methods_meta[i].ok(), + "Failed to get method_meta 0x%x", + static_cast(methods_meta[i].error())); + } + + ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); + IO* ptr = static_cast(data_ptr_.get()); + std::unordered_map io_bytes_map = get_io_bytes(); + + // [I]: input_tokens + Result input_tok = methods_meta[0]->input_tensor_meta(0); + input_tok_ = std::make_unique( + input_tok->scalar_type(), + input_tok->sizes().size(), + const_cast(input_tok->sizes().data()), + ptr->input_tok, + const_cast(input_tok->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); + ptr->add_custom_mem_info( + ptr->input_tok, + io_bytes_map["input_tok_bytes"], + input_tok->scalar_type(), + input_tok.get()); + + // [I]: atten_mask + Result atten_mask = methods_meta[0]->input_tensor_meta(1); + attention_mask_ = std::make_unique( + atten_mask->scalar_type(), + atten_mask->sizes().size(), + const_cast(atten_mask->sizes().data()), + ptr->kv_attention_mask, + const_cast(atten_mask->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); + ptr->add_custom_mem_info( + ptr->kv_attention_mask, + io_bytes_map["atten_mask_bytes"], + atten_mask->scalar_type(), + atten_mask.get()); + + // [I]: input_pos + Result input_pos = methods_meta[0]->input_tensor_meta(2); + input_pos_ = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + ptr->input_pos, + const_cast(input_pos->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); + ptr->add_custom_mem_info( + ptr->input_pos, + io_bytes_map["input_pos_bytes"], + input_pos->scalar_type(), + input_pos.get()); + + // [I] kv_cache + size_t layered_head_count = num_layers_ * num_heads_; + int index = 3; // bypass input_tokens, input_pos, atten_mask + for (int offset = 0, shard_index = 0; shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + 
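A rough budget for the single shared RpcMem allocation made in SmartMaskIoMgr::init_io() above, as a hypothetical calculation: the byte counts are invented and alignment is ignored; only the doubling of cache_in/cache_out (they each cover both K and V) mirrors the code.

```python
def total_shared_bytes(io_bytes: dict) -> int:
    total = 0
    for key, nbytes in io_bytes.items():
        # cache_in/cache_out regions each hold both K and V, hence the doubling.
        total += nbytes * 2 if key in ("cache_in_bytes", "cache_out_bytes") else nbytes
    return total

# Invented sizes for a kv-cached-only configuration:
io_bytes = {
    "input_tok_bytes": 16,
    "input_pos_bytes": 16,
    "cache_in_bytes": 28 * 8 * 128 * 512,  # layers * heads * head_dim * cache_len, uint8
    "cache_out_bytes": 28 * 8 * 128,
    "atten_mask_bytes": (512 + 1) * 2,
    "kv_logits_bytes": 32000 * 2,
}
print(f"{total_shared_bytes(io_bytes) / (1 << 20):.1f} MiB in one shared buffer")
```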
methods_meta[shard_index]->input_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_in_[kv_forward_name_] + : v_cache_in_[kv_forward_name_]); + uint8_t* cache_ptr = (cache_group == 0) + ? ptr->k_cache[layer + offset][head] + : ptr->v_cache[layer + offset][head]; + + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + ptr->add_custom_mem_info( + cache_ptr, + io_bytes_map["cache_in_bytes"] / layered_head_count, + kv_cache->scalar_type(), + kv_cache.get()); + input_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } + + // [O]: logits + int logit_index = 0; + Result logits = + methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); + kv_logits_ = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + ptr->kv_logits, + const_cast(logits->dim_order().data())); + + ptr->add_custom_mem_info( + ptr->kv_logits, + io_bytes_map["kv_logits_bytes"], + logits->scalar_type(), + logits.get()); + output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( + kv_logits_.get()); + + // [O] kv_cache + index = 1; + for (int offset = 0, shard_index = 0; shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->output_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_out_[kv_forward_name_] + : v_cache_out_[kv_forward_name_]); + uint8_t* cache_ptr = (cache_group == 0) + ? 
ptr->k_cache_out[layer + offset][head] + : ptr->v_cache_out[layer + offset][head]; + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + ptr->add_custom_mem_info( + cache_ptr, + io_bytes_map["cache_out_bytes"] / layered_head_count, + kv_cache->scalar_type(), + kv_cache.get()); + output_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } +} + +void SmartMaskIoMgr::update_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + IO* ptr = static_cast(data_ptr_.get()); + size_t cache_len = std::max(kv_cache_len_, prefill_cache_len_); + // update input_tok + *ptr->input_tok = static_cast(cur_token); + // update position_ids + *ptr->input_pos = static_cast(pos); + // update smart mask for previous cache + ptr->kv_attention_mask[pos] = 65535; + + // update v_cache + auto& v_cache_in = v_cache_in_[kv_forward_name_]; + auto& v_cache_out = v_cache_out_[kv_forward_name_]; + // update v_cache by single thread, this part is cpu cache sensitive + for (int i = 0; i < v_cache_in.size(); ++i) { + uint8_t* ptr_in = v_cache_in[i]->mutable_data() + pos * head_dim_; + const uint8_t* ptr_out = v_cache_out[i]->data(); + memcpy(ptr_in, ptr_out, head_dim_ * sizeof(uint8_t)); + } + + auto& k_cache_in = k_cache_in_[kv_forward_name_]; + auto& k_cache_out = k_cache_out_[kv_forward_name_]; + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data() + pos; + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = 0; j < head_dim_; ++j, offset += cache_len) { + ptr_in[offset] = ptr_out[j]; + } + } +} + +void SmartMaskIoMgr::prepare_prefill_io( + const std::vector>& methods_meta) { + for (int i = 0; i < modules_.size(); ++i) { + ET_CHECK_MSG( + methods_meta[i].ok(), + "Failed to get method_meta 0x%x", + static_cast(methods_meta[i].error())); + } + + ET_CHECK_MSG( + !(prefill_forward_name_.empty()), "prefill forward name is empty"); + + IO* ptr = static_cast(data_ptr_.get()); + std::unordered_map io_bytes_map = get_io_bytes(); + + int32_t cache_len = methods_meta[0]->input_tensor_meta(0)->sizes()[1]; + // [I]: pre_input_tokens + Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); + prefill_input_toks_ = std::make_unique( + prefill_input_toks->scalar_type(), + prefill_input_toks->sizes().size(), + const_cast(prefill_input_toks->sizes().data()), + ptr->prefill_input_toks, + const_cast( + prefill_input_toks->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); + ptr->add_custom_mem_info( + ptr->prefill_input_toks, + io_bytes_map["prefill_input_toks_bytes"], + executorch::aten::ScalarType::Int, + prefill_input_toks.get()); + + // [I]: prefill_attn_mask + for (int i = 0; i < cache_len; ++i) { + for (int j = 0; j < cache_len; ++j) { + if (i < j) { + ptr->prefill_atten_mask[i * cache_len + j] = 0; + } else { + ptr->prefill_atten_mask[i * cache_len + j] = 65535; + } + } + } + Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); + prefill_attn_mask_ = std::make_unique( + prefill_atten_mask->scalar_type(), + prefill_atten_mask->sizes().size(), + const_cast(prefill_atten_mask->sizes().data()), + ptr->prefill_atten_mask, + const_cast( + prefill_atten_mask->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); + ptr->add_custom_mem_info( + 
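SmartMaskIoMgr::update_kv_io() above reduces to two writes per layer/head plus unmasking slot `pos`: the new V vector is one contiguous row, the new K vector is a strided column. A toy numpy equivalent follows; the sizes are illustrative and the buffers are uint8 as in the quantized model.

```python
import numpy as np

head_dim, cache_len = 3, 5
k_cache = np.zeros((head_dim, cache_len), dtype=np.uint8)  # row-major [head_dim, cache_len]
v_cache = np.zeros((cache_len, head_dim), dtype=np.uint8)  # row-major [cache_len, head_dim]
atten_mask = np.zeros(cache_len + 1, dtype=np.uint16)

def smart_mask_update(pos: int, new_k: np.ndarray, new_v: np.ndarray) -> None:
    atten_mask[pos] = 65535  # open the slot that now holds valid history
    k_cache[:, pos] = new_k  # strided writes, one element per head_dim row
    v_cache[pos, :] = new_v  # one contiguous row (a memcpy in the C++ above)

smart_mask_update(2, np.array([1, 2, 3], dtype=np.uint8), np.array([4, 5, 6], dtype=np.uint8))
```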
ptr->prefill_atten_mask, + io_bytes_map["prefill_atten_mask_bytes"], + executorch::aten::ScalarType::Bits16, + prefill_atten_mask.get()); + + // [O]: logits + int logit_index = 0; + Result logits = methods_meta[0]->output_tensor_meta(0); + prefill_logits_ = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + ptr->prefill_logits, + const_cast(logits->dim_order().data())); + output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( + prefill_logits_.get()); + ptr->add_custom_mem_info( + ptr->prefill_logits, + io_bytes_map["prefill_logits_bytes"], + executorch::aten::ScalarType::Bits16, + logits.get()); + + // [O] kv_cache + int index = 1; + size_t layered_head_count = num_layers_ * num_heads_; + for (int offset = 0, shard_index = 0; shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->output_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_out_[prefill_forward_name_] + : v_cache_out_[prefill_forward_name_]); + void* cache_ptr = (cache_group == 0) + ? ptr->k_cache[layer + offset][head] + : ptr->v_cache[layer + offset][head]; + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + ptr->add_custom_mem_info( + cache_ptr, + io_bytes_map["cache_in_bytes"] / layered_head_count, + executorch::aten::ScalarType::Byte, + kv_cache.get()); + output_tensors_[prefill_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } +} + +void SmartMaskIoMgr::update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + IO* ptr = static_cast(data_ptr_.get()); + + *ptr->input_tok = static_cast(cur_token); + *ptr->input_pos = static_cast(pos); + // pos means the cur_token pos + for (int i = 0; i < pos; i++) { + ptr->kv_attention_mask[i] = 65535; + } + + // Update K is enough, copy from last to prevent from overwriting values + size_t copied_size = prefill_cache_len_ * sizeof(uint8_t); + for (int l = 0; l < num_layers_; l++) { + for (int h = 0; h < num_heads_; h++) { + uint8_t* k_cache = ptr->k_cache[l][h]; + for (int hd = head_dim_ - 1; hd > -1; hd--) { + memcpy( + k_cache + (kv_cache_len_ * hd), + k_cache + (prefill_cache_len_ * hd), + copied_size); + } + } + } +} + +void SmartMaskIoMgr::update_prefill_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + (void)output_tensors; + IO* ptr = static_cast(data_ptr_.get()); + ptr->prefill_input_toks[pos] = static_cast(cur_token); +} + +void SmartMaskIoMgr::fill_prefill_toks(std::vector& prompt_tokens) { + IO* ptr = static_cast(get_mutable_ptr()); + for (int i = 0; i < prompt_tokens.size(); i++) { + ptr->prefill_input_toks[i] = static_cast(prompt_tokens[i]); + } +} + +void SmartMaskIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { + IO* ptr = static_cast(get_mutable_ptr()); + *ptr->input_tok = static_cast(cur_token); + ptr->kv_attention_mask[kv_cache_len_] = 65535; +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_memory.h b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h similarity index 51% rename from 
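The trickiest part of SmartMaskIoMgr::update_prefill_to_kv_io() above is re-striding K in place: prefill wrote rows with stride prefill_cache_len, the kv graph reads them with stride kv_cache_len, and rows are moved last-to-first so nothing is overwritten before it is copied. A small numpy model of that in-place move, with assumed sizes:

```python
import numpy as np

head_dim, prefill_cache_len, kv_cache_len = 3, 2, 4
# Buffer as left behind by prefill: rows packed with stride prefill_cache_len.
k = np.zeros(head_dim * kv_cache_len, dtype=np.uint8)
k[: head_dim * prefill_cache_len] = np.arange(1, head_dim * prefill_cache_len + 1, dtype=np.uint8)

for hd in range(head_dim - 1, -1, -1):  # last row first, as in the C++ loop
    src = k[prefill_cache_len * hd : prefill_cache_len * (hd + 1)].copy()
    k[kv_cache_len * hd : kv_cache_len * hd + prefill_cache_len] = src

# Reading with stride kv_cache_len now yields each row's prefill values up front;
# the trailing slots of each row are stale but stay masked until they are filled.
assert list(k.reshape(head_dim, kv_cache_len)[:, :prefill_cache_len].ravel()) == [1, 2, 3, 4, 5, 6]
```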
examples/qualcomm/oss_scripts/llama/runner/io_memory.h rename to examples/qualcomm/oss_scripts/llama/runner/io_manager.h index bb107ffd77e..e86b2eab878 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/io_memory.h +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h @@ -8,9 +8,15 @@ #pragma once +#include #include +#include +#include +#include +#include #include +#include #include #include @@ -22,10 +28,11 @@ enum EvalMode { kHybrid, kUnsupported, }; -class Memory { +class IoMgrBase { public: - Memory(std::vector>& modules); - virtual ~Memory(); + IoMgrBase( + std::vector>& modules); + virtual ~IoMgrBase(); virtual void init_io() = 0; virtual void prepare_prefill_io( const std::vector< @@ -35,6 +42,8 @@ class Memory { const std::vector< executorch::runtime::Result>& methods_meta) = 0; + virtual void fill_prefill_toks(std::vector& prompt_tokens) = 0; + virtual void fill_kv_tok_mask(int64_t pos, int64_t cur_token) = 0; virtual void update_prefill_to_kv_io( int64_t cur_token, int64_t pos, @@ -68,9 +77,9 @@ class Memory { std::vector> modules_; }; -class HybridMemory : public Memory { +class ShiftPointerIoMgr : public IoMgrBase { public: - HybridMemory( + ShiftPointerIoMgr( std::vector>& modules, int32_t prefill_cache_len, int32_t kv_cache_len, @@ -91,6 +100,8 @@ class HybridMemory : public Memory { const std::vector< executorch::runtime::Result>& methods_meta) override; + void fill_prefill_toks(std::vector& prompt_tokens) override; + void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; void update_prefill_to_kv_io( int64_t cur_token, int64_t pos, @@ -156,4 +167,122 @@ class HybridMemory : public Memory { std::string kv_forward_name_; }; +class SmartMaskIoMgr : public IoMgrBase { + public: + SmartMaskIoMgr( + std::vector>& modules, + int32_t prefill_cache_len, + int32_t kv_cache_len, + int32_t vocab_size, + int32_t num_layers, + int32_t head_dim, + int32_t num_heads, + EvalMode eval_mode, + const std::string& prefill_forward_name, + const std::string& kv_forward_name); + + void init_io() override; + void prepare_prefill_io( + const std::vector< + executorch::runtime::Result>& + methods_meta) override; + void prepare_kv_io( + const std::vector< + executorch::runtime::Result>& + methods_meta) override; + void fill_prefill_toks(std::vector& prompt_tokens) override; + void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; + void update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) + override; + void update_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) + override; + void update_prefill_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) + override; + + std::unordered_map get_io_elements(); + std::unordered_map get_io_bytes(); + + struct IO { + void* shared_buffer_base; + int32_t* input_tok; + int32_t* input_pos; + // layer -> head -> head_dim * seq_len + std::vector> k_cache; + std::vector> v_cache; + // layer -> head -> head_dim + std::vector> k_cache_out; + std::vector> v_cache_out; + // max_seq_len + uint16_t* kv_attention_mask; + // vocab_size + uint16_t* kv_logits; + int32_t* prefill_input_toks; + // prefill_cache_len_ ^ 2 + uint16_t* prefill_atten_mask; + // vocab_size * prefill_cache_len_ + uint16_t* prefill_logits; + + size_t num_layers_; + size_t num_heads_; + size_t head_dim_; + std::unordered_map io_pos_map; + ~IO() { + QnnExecuTorchFreeCustomMem(shared_buffer_base); + } + void init_io_ptrs( + void* shared_buffer_ptr, + std::unordered_map& io_bytes_map); + void 
add_custom_mem_info( + void* ptr, + size_t nbytes, + executorch::aten::ScalarType scalar_type, + executorch::runtime::TensorInfo& tensor_info); + }; + + private: + std::unique_ptr input_tok_; + std::unique_ptr input_pos_; + std::unique_ptr hidden_state_; + std::unique_ptr attention_mask_; + std::unique_ptr prefill_input_toks_; + std::unique_ptr prefill_attn_mask_; + std::unique_ptr prefill_logits_; + std::unordered_map< + std::string, + std::vector>> + k_cache_in_; + std::unordered_map< + std::string, + std::vector>> + v_cache_in_; + std::unordered_map< + std::string, + std::vector>> + k_cache_out_; + std::unordered_map< + std::string, + std::vector>> + v_cache_out_; + std::unique_ptr kv_logits_; + std::vector shard_layers_; + int32_t kv_cache_len_{0}; + int32_t prefill_cache_len_{0}; + int32_t vocab_size_; + int32_t num_layers_; + int32_t head_dim_; + int32_t num_heads_; + EvalMode eval_mode_; + std::string prefill_forward_name_; + std::string kv_forward_name_; +}; + } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp deleted file mode 100644 index 22efd5a3344..00000000000 --- a/examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp +++ /dev/null @@ -1,508 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -using executorch::aten::Tensor; -using executorch::aten::TensorImpl; -using executorch::extension::Module; -using executorch::runtime::Error; -using executorch::runtime::MethodMeta; -using executorch::runtime::Result; -using executorch::runtime::TensorInfo; - -namespace example { - -Memory::Memory(std::vector>& modules) - : data_ptr_(nullptr, [](void*) {}), modules_(modules) {} - -Memory::~Memory() {} - -void* Memory::get_mutable_ptr() { - return data_ptr_.get(); -} - -std::vector Memory::get_input_tensors( - int shard_index, - const std::string& method_name) { - std::vector ret; - ret.reserve(input_tensors_.size()); - for (TensorImpl* impl : input_tensors_[method_name][shard_index]) { - ret.emplace_back(Tensor(impl)); - } - return ret; -} - -std::vector Memory::get_output_tensors( - int shard_index, - const std::string& method_name) { - std::vector ret; - ret.reserve(output_tensors_[method_name][shard_index].size()); - for (TensorImpl* impl : output_tensors_[method_name][shard_index]) { - ret.emplace_back(Tensor(impl)); - } - return ret; -} - -HybridMemory::HybridMemory( - std::vector>& modules, - int32_t prefill_cache_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name) - : Memory(modules), - shard_layers_({num_layers}), - kv_cache_len_(kv_cache_len), - prefill_cache_len_(prefill_cache_len), - vocab_size_(vocab_size), - num_layers_(num_layers), - head_dim_(head_dim), - num_heads_(num_heads), - eval_mode_(eval_mode), - prefill_forward_name_(prefill_forward_name), - kv_forward_name_(kv_forward_name) { - if (!prefill_forward_name_.empty()) { - input_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - output_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[prefill_forward_name_] = - std::vector>(); - v_cache_in_[prefill_forward_name_] = - std::vector>(); - 
k_cache_out_[prefill_forward_name_] = - std::vector>(); - v_cache_out_[prefill_forward_name_] = - std::vector>(); - } - if (!kv_forward_name_.empty()) { - input_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - output_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[kv_forward_name_] = - std::vector>(); - v_cache_in_[kv_forward_name_] = - std::vector>(); - k_cache_out_[kv_forward_name_] = - std::vector>(); - v_cache_out_[kv_forward_name_] = - std::vector>(); - } - - data_ptr_ = std::unique_ptr( - new IO, [](void* ptr) { delete static_cast(ptr); }); -} - -void HybridMemory::init_io() { - IO* ptr = static_cast(data_ptr_.get()); - std::memset(ptr, 0, sizeof(IO)); - - int32_t max_cache_len = std::max(kv_cache_len_, prefill_cache_len_); - int32_t k_in_size = (head_dim_ + 1) * max_cache_len; - int32_t v_cache_size = (num_heads_ + 1) * max_cache_len * head_dim_; - int32_t k_cache_out_size = num_heads_ * head_dim_; - if (eval_mode_ == EvalMode::kHybrid || eval_mode_ == EvalMode::kPrefill) { - k_cache_out_size *= prefill_cache_len_; - } - - // Init kv vector shape, general enough to be shared across all 3 modes. - ptr->k_cache_out.reserve(num_layers_); - ptr->v_cache.reserve(num_layers_); - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); - ptr->v_cache.emplace_back(std::vector(v_cache_size)); - } - - auto init_prefill = [&]() { - ptr->prefill_input_toks.resize(prefill_cache_len_); - ptr->prefill_atten_mask.resize(prefill_cache_len_ * prefill_cache_len_); - ptr->prefill_logits.resize(prefill_cache_len_ * vocab_size_); - }; - - auto init_kv = [&]() { - ptr->kv_logits.resize(vocab_size_); - ptr->kv_attention_mask.resize((kv_cache_len_ + 1), 0); - ptr->k_cache.reserve(num_layers_); - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache.emplace_back(); - ptr->k_cache[layer].reserve(num_heads_); - for (int head = 0; head < num_heads_; head++) { - ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); - } - } - }; - - switch (eval_mode_) { - case EvalMode::kPrefill: - init_prefill(); - break; - case EvalMode::kKVCached: - init_kv(); - break; - case EvalMode::kHybrid: - init_prefill(); - init_kv(); - break; - default: - break; - } -} - -void HybridMemory::prepare_kv_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); - IO* ptr = static_cast(data_ptr_.get()); - - // [I]: input_tokens - Result input_tok = methods_meta[0]->input_tensor_meta(0); - input_tok_ = std::make_unique( - input_tok->scalar_type(), - input_tok->sizes().size(), - const_cast(input_tok->sizes().data()), - &ptr->input_tok, - const_cast(input_tok->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); - - // [I]: atten_mask - Result atten_mask = methods_meta[0]->input_tensor_meta(1); - attention_mask_ = std::make_unique( - atten_mask->scalar_type(), - atten_mask->sizes().size(), - const_cast(atten_mask->sizes().data()), - ptr->kv_attention_mask.data(), - const_cast(atten_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); - - // [I]: input_pos - Result input_pos = methods_meta[0]->input_tensor_meta(2); - input_pos_ = std::make_unique( - input_pos->scalar_type(), - input_pos->sizes().size(), - 
const_cast(input_pos->sizes().data()), - &ptr->input_pos, - const_cast(input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); - - // [I] kv_cache - int index = 3; // bypass input_tokens, input_pos, atten_mask - for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[kv_forward_name_] - : v_cache_in_[kv_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast(ptr->k_cache[layer + offset][head].data()) - : static_cast( - ptr->v_cache[layer + offset].data() + head * v_stride); - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - input_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - kv_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->kv_logits.data(), - const_cast(logits->dim_order().data())); - output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( - kv_logits_.get()); - - // [O] kv_cache - index = 1; - // Iterate through all kv cache outputs. - // For k, we store it in k_cache_out and update to k_cache later. - // For v, we append the output to the end of v_cache, - // which serves as both input and output. - for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[kv_forward_name_] - : v_cache_out_[kv_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? 
static_cast( - ptr->k_cache_out[layer + offset].data() + - (head * head_dim_)) - : static_cast( - ptr->v_cache[layer + offset].data() + - (head + 1) * v_stride); - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - output_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void HybridMemory::prepare_prefill_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG( - !(prefill_forward_name_.empty()), "prefill forward name is empty"); - - IO* ptr = static_cast(data_ptr_.get()); - - // [I]: pre_input_tokens - Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); - prefill_input_toks_ = std::make_unique( - prefill_input_toks->scalar_type(), - prefill_input_toks->sizes().size(), - const_cast(prefill_input_toks->sizes().data()), - ptr->prefill_input_toks.data(), - const_cast( - prefill_input_toks->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); - // [I]: prefill_attn_mask - for (int i = 0; i < prefill_cache_len_; ++i) { - for (int j = 0; j < prefill_cache_len_; ++j) { - if (i < j) { - ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 0; - } else { - ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 65535; - } - } - } - Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); - prefill_attn_mask_ = std::make_unique( - prefill_atten_mask->scalar_type(), - prefill_atten_mask->sizes().size(), - const_cast(prefill_atten_mask->sizes().data()), - ptr->prefill_atten_mask.data(), - const_cast( - prefill_atten_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - prefill_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->prefill_logits.data(), - const_cast(logits->dim_order().data())); - output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( - prefill_logits_.get()); - - // [O] kv_cache - int index = 1; - // prefill_k_stride should be equal to prefill_v_stride in prefill mode. - // In hybrid mode, we use kv mode cache len for v stride since we want to - // update prefill's result onto kv modes input. - int32_t prefill_k_stride = prefill_cache_len_ * head_dim_; - int32_t prefill_v_stride = - std::max(prefill_cache_len_, kv_cache_len_) * head_dim_; - - if (eval_mode_ == EvalMode::kPrefill) { - ET_CHECK_MSG( - prefill_k_stride == prefill_v_stride, - "prefill_k_stride should be equal to prefill_v_stride"); - } - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[prefill_forward_name_] - : v_cache_out_[prefill_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? 
static_cast( - ptr->k_cache_out[layer + offset].data() + - head * prefill_k_stride) - : static_cast( - ptr->v_cache[layer + offset].data() + - (head + 1) * prefill_v_stride); - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - output_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void HybridMemory::update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - ET_CHECK_MSG(kv_cache_len_ != 0, "k_cache_len_ should not equal to 0"); - ET_CHECK_MSG( - prefill_cache_len_ != 0, "prefill_cache_len_ should not equal to 0"); - IO* ptr = static_cast(data_ptr_.get()); - - ptr->input_tok = static_cast(cur_token); - ptr->input_pos = static_cast(pos); - // If prompt len is 30, prefill will handle to pos = 30. - // At this point, pos should be 31. - for (int i = 0; i < pos + 1; i++) { - ptr->kv_attention_mask[kv_cache_len_ - i] = 65535; - } - - // update v_cache - std::vector>& v_cache_in = - v_cache_in_[kv_forward_name_]; - std::vector>& v_cache_out = - v_cache_out_[kv_forward_name_]; - for (int i = 0, v_cache_stride = head_dim_ * pos; i < v_cache_in.size(); - i++) { - v_cache_in[i]->set_data( - v_cache_in[i]->mutable_data() + v_cache_stride); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + v_cache_stride); - } - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - kv_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "Failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - std::vector>& k_cache_in = - k_cache_in_[kv_forward_name_]; - std::vector>& k_cache_out = - k_cache_out_[prefill_forward_name_]; - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; - ++j, offset += kv_cache_len_) { - for (int k = 0, k_stride = j * prefill_cache_len_; k < pos; k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - k_cache_in[i]->set_data(ptr_in + pos); - } -} - -void HybridMemory::update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - IO* ptr = static_cast(data_ptr_.get()); - // update input_tok - ptr->input_tok = static_cast(cur_token); - // update position_ids - ptr->input_pos = static_cast(pos); - // update causal mask for next token - ptr->kv_attention_mask[kv_cache_len_ - pos] = 65535; - - // update v_cache - auto& v_cache_in = v_cache_in_[kv_forward_name_]; - auto& v_cache_out = v_cache_out_[kv_forward_name_]; - for (int i = 0; i < v_cache_in.size(); i++) { - v_cache_in[i]->set_data(v_cache_in[i]->mutable_data() + head_dim_); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + head_dim_); - } - - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - kv_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - auto& k_cache_in = k_cache_in_[kv_forward_name_]; - auto& 
k_cache_out = k_cache_out_[kv_forward_name_];
-  // update k_cache by single thread, this part is cpu cache sensitive
-  for (int i = 0; i < k_cache_in.size(); ++i) {
-    uint8_t* ptr_in = k_cache_in[i]->mutable_data<uint8_t>();
-    const uint8_t* ptr_out = k_cache_out[i]->data<uint8_t>();
-    for (size_t j = 0, offset = kv_cache_len_; j < head_dim_;
-         ++j, offset += kv_cache_len_) {
-      ptr_in[offset] = ptr_out[j];
-    }
-    k_cache_in[i]->set_data(ptr_in + 1);
-  }
-}
-
-void HybridMemory::update_prefill_io(
-    int64_t cur_token,
-    int64_t pos,
-    std::vector<std::vector<Tensor>>& output_tensors) {
-  (void)output_tensors;
-  IO* ptr = static_cast<IO*>(data_ptr_.get());
-  ptr->prefill_input_toks[pos] = static_cast<int32_t>(cur_token);
-}
-
-} // namespace example
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index e06d52fbb37..158c6a13ca5 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -43,14 +43,16 @@ Runner::Runner(
     const float logits_scale,
     const int32_t logits_offset,
     const float temperature,
-    const int eval_mode)
+    const int eval_mode,
+    const std::string& kv_updator)
     : n_bos_(1),
       n_eos_(1),
       tokenizer_path_(tokenizer_path),
       logits_scale_(logits_scale),
       logits_offset_(logits_offset),
       temperature_(temperature),
-      eval_mode_(static_cast<EvalMode>(eval_mode)) {
+      eval_mode_(static_cast<EvalMode>(eval_mode)),
+      kv_updator_(kv_updator) {
   for (size_t i = 0; i < models_path.size(); ++i) {
     modules_.push_back(std::make_shared<Module>(
         models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors));
@@ -125,31 +127,47 @@ Error Runner::load() {
   vocab_size_ = method_meta.output_tensor_meta(0)->sizes()[2]; // logit_tensor
   ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers");
-  io_mem_ = std::make_unique<HybridMemory>(
-      modules_,
-      prefill_cache_len_,
-      kv_cache_len_,
-      vocab_size_,
-      num_layers,
-      head_dim,
-      num_heads,
-      eval_mode_,
-      prefill_forward_name_,
-      kv_forward_name_);
+  if (kv_updator_ == "SmartMask") {
+    io_mgr_ = std::make_unique<SmartMaskIoMgr>(
+        modules_,
+        prefill_cache_len_,
+        kv_cache_len_,
+        vocab_size_,
+        num_layers,
+        head_dim,
+        num_heads,
+        eval_mode_,
+        prefill_forward_name_,
+        kv_forward_name_);
+  } else if (kv_updator_ == "ShiftPointer") {
+    io_mgr_ = std::make_unique<ShiftPointerIoMgr>(
+        modules_,
+        prefill_cache_len_,
+        kv_cache_len_,
+        vocab_size_,
+        num_layers,
+        head_dim,
+        num_heads,
+        eval_mode_,
+        prefill_forward_name_,
+        kv_forward_name_);
+  } else {
+    ET_LOG(Error, "Using an unknown updator %s", kv_updator_.c_str());
+  }
   ET_LOG(Info, "creating io_memory");
   // prepare io
-  io_mem_->init_io();
+  io_mgr_->init_io();
   switch (eval_mode_) {
     case EvalMode::kPrefill:
-      io_mem_->prepare_prefill_io(get_methods_meta(prefill_forward_name_));
+      io_mgr_->prepare_prefill_io(get_methods_meta(prefill_forward_name_));
       break;
     case EvalMode::kKVCached:
-      io_mem_->prepare_kv_io(get_methods_meta(kv_forward_name_));
+      io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_));
       break;
     case EvalMode::kHybrid:
-      io_mem_->prepare_prefill_io(get_methods_meta(prefill_forward_name_));
-      io_mem_->prepare_kv_io(get_methods_meta(kv_forward_name_));
+      io_mgr_->prepare_prefill_io(get_methods_meta(prefill_forward_name_));
+      io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_));
       break;
     case EvalMode::kUnsupported:
       ET_CHECK_MSG(false, "unsupported mode");
@@ -255,9 +273,9 @@ Error Runner::generate(
   for (auto method_name : method_names_) {
     for (int i = 0; i < modules_.size(); ++i) {
       input_tensors[method_name].emplace_back(
-          io_mem_->get_input_tensors(i, method_name));
+          io_mgr_->get_input_tensors(i, method_name));
       output_tensors[method_name].emplace_back(
-          io_mem_->get_output_tensors(i, method_name));
+          io_mgr_->get_output_tensors(i, method_name));
       for (size_t j = 0; j < output_tensors[method_name][i].size(); ++j) {
         ET_CHECK_MSG(
             modules_[i]->set_output(
@@ -327,16 +345,11 @@ Error Runner::generate(
   }
   int64_t pos = 0, prev_token, cur_token = prompt_tokens[0];
-  HybridMemory::IO* ptr =
-      static_cast<HybridMemory::IO*>(io_mem_->get_mutable_ptr());
   if (token_callback) {
     token_callback(prompt_);
   }
-
   auto prefill_execute = [&](const std::string& method_name) {
-    for (int i = 0; i < num_prompt_tokens; i++) {
-      ptr->prefill_input_toks[i] = static_cast<int32_t>(prompt_tokens[i]);
-    }
+    io_mgr_->fill_prefill_toks(prompt_tokens);
     pos = num_prompt_tokens - 1;
     cur_token = prompt_tokens[pos];
@@ -349,7 +362,7 @@ Error Runner::generate(
       cur_token = logitsToToken(logits_tensor, pos);
       stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms;
-      io_mem_->update_prefill_io(cur_token, ++pos, output_tensors[method_name]);
+      io_mgr_->update_prefill_io(cur_token, ++pos, output_tensors[method_name]);
       auto piece_res = tokenizer_->decode(prev_token, cur_token);
       ET_CHECK(piece_res.ok());
       if (token_callback) {
@@ -373,8 +386,7 @@ Error Runner::generate(
   };
   auto kv_execute = [&](const std::string& method_name) {
-    ptr->input_tok = static_cast<int32_t>(cur_token);
-    ptr->kv_attention_mask[kv_cache_len_] = 65535;
+    io_mgr_->fill_kv_tok_mask(pos, cur_token);
     while (pos < seq_len - 1) {
       // inference
       run_model_step(method_name, inputs[method_name]);
@@ -396,7 +408,7 @@
       if (pos < num_prompt_tokens - 1) {
         cur_token = prompt_tokens[pos + 1];
       }
-      io_mem_->update_kv_io(cur_token, ++pos, output_tensors[method_name]);
+      io_mgr_->update_kv_io(cur_token, ++pos, output_tensors[method_name]);
       auto piece_res = tokenizer_->decode(prev_token, cur_token);
       ET_CHECK(piece_res.ok());
@@ -420,7 +432,7 @@
       break;
     case EvalMode::kHybrid:
       prefill_execute(prefill_forward_name_);
-      io_mem_->update_prefill_to_kv_io(
+      io_mgr_->update_prefill_to_kv_io(
           cur_token, pos, output_tensors[kv_forward_name_]);
       kv_execute(kv_forward_name_);
       break;
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index aaf79360bdb..844ea322907 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -17,7 +17,7 @@
 #include
 #include
-#include <executorch/examples/qualcomm/oss_scripts/llama/runner/io_memory.h>
+#include <executorch/examples/qualcomm/oss_scripts/llama/runner/io_manager.h>
 #include
 #include
 #include
@@ -32,7 +32,8 @@ class Runner {
       const float logits_scale,
       const int32_t logits_offset,
       const float temperature,
-      const int eval_mode);
+      const int eval_mode,
+      const std::string& kv_updator);
   struct Stats {
     // Scaling factor for timestamps - in this case, we use ms.
@@ -103,12 +104,13 @@
   std::unique_ptr tokenizer_;
   std::unique_ptr sampler_;
   Stats stats_;
-  std::unique_ptr<HybridMemory> io_mem_;
+  std::unique_ptr<IoMgrBase> io_mgr_;
   EvalMode eval_mode_;
   std::string prefill_forward_name_;
   std::string kv_forward_name_;
   std::vector<std::string> method_names_;
   LlamaVersion llama_version_;
+  std::string kv_updator_;
 };
 } // namespace example
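
The renamed io_manager backs two KV-cache update strategies, selected by the runner's kv_updator string. The sketch below is a minimal, single-head illustration of the two styles on plain buffers; ShiftPointerCache and SmartMaskCache are hypothetical names, not classes from this patch. The shift-pointer style appends one column just past the current window and advances the window start (as the deleted update_kv_io does before re-binding outputs with set_output), while the smart-mask style writes in place and opens one attention slot, so registered buffers and output bindings stay fixed.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical, simplified single-head K caches: head_dim rows, cache_len
// columns, stored row-major as uint8 like the runner's quantized K cache.
struct ShiftPointerCache {
  std::vector<uint8_t> storage; // [head_dim, cache_len] plus headroom per row
  size_t head_dim, cache_len;
  size_t base = 0; // start of the valid window, advanced one column per token

  // Shift-pointer style: scatter the new per-row value just past the current
  // window, then slide the window start by one so the oldest column drops out.
  void update(const uint8_t* new_k) {
    for (size_t j = 0, offset = cache_len; j < head_dim;
         ++j, offset += cache_len) {
      storage[base + offset] = new_k[j];
    }
    ++base; // analogue of k_cache_in[i]->set_data(ptr_in + 1)
  }
};

struct SmartMaskCache {
  std::vector<uint8_t> k;     // [head_dim, cache_len], data pointer never moves
  std::vector<uint16_t> mask; // quantized attention mask, 0 = masked
  size_t head_dim, cache_len;

  // Smart-mask style: write the new column in place at `pos` and unmask that
  // slot; no pointer arithmetic and no re-binding of graph outputs.
  void update(const uint8_t* new_k, size_t pos) {
    for (size_t j = 0; j < head_dim; ++j) {
      k[j * cache_len + pos] = new_k[j];
    }
    mask[pos] = 65535; // the quantized "attend" value used by the runner
  }
};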
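The shift-pointer KV mask is right-aligned: the deleted update_kv_io unmasks position pos by writing 65535 at kv_cache_len_ - pos, and update_prefill_to_kv_io opens pos + 1 slots in one pass after prefill. A minimal sketch, assuming a mask of length kv_cache_len + 1 (the real mask spans the full sequence width):

#include <cstddef>
#include <cstdint>
#include <vector>

// Build the right-aligned KV attention mask after `pos` tokens have been
// consumed: slots are enabled from the right end of the window inward.
std::vector<uint16_t> make_kv_mask(size_t kv_cache_len, size_t pos) {
  std::vector<uint16_t> mask(kv_cache_len + 1, 0); // everything masked at start
  for (size_t i = 0; i < pos + 1; ++i) {
    mask[kv_cache_len - i] = 65535; // quantized "attend"
  }
  return mask;
}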
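update_prefill_to_kv_io also re-lays-out the prefill K output (rows of length prefill_cache_len) into the KV graph's K input (rows of length kv_cache_len), copying only the first pos generated positions. A simplified single-head sketch of that strided copy; the real loop additionally offsets the destination so it lines up with the shifted window:

#include <cstddef>
#include <cstdint>

// Copy prefill K results into the KV-mode layout, row by row, remapping the
// per-row stride from prefill_cache_len to kv_cache_len.
void copy_prefill_k_to_kv(
    const uint8_t* k_prefill, // row stride = prefill_cache_len
    uint8_t* k_kv,            // row stride = kv_cache_len
    size_t head_dim,
    size_t prefill_cache_len,
    size_t kv_cache_len,
    size_t pos) {
  for (size_t j = 0; j < head_dim; ++j) {
    for (size_t k = 0; k < pos; ++k) {
      k_kv[j * kv_cache_len + k] = k_prefill[j * prefill_cache_len + k];
    }
  }
}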
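On the runner side, Runner::load() now picks the IO manager implementation from the kv_updator string and drives it through a common interface, so the generate loop only talks to io_mgr_. The class and method names below (IoMgrBase, SmartMaskIoMgr, ShiftPointerIoMgr) are assumptions reconstructed from the call sites in the patch, which shows only the usage, not the declarations:

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>

// Assumed shape of the IO-manager interface; signatures are illustrative.
class IoMgrBase {
 public:
  virtual ~IoMgrBase() = default;
  virtual void init_io() = 0;
  virtual void update_kv_io(int64_t cur_token, int64_t pos) = 0;
};

class SmartMaskIoMgr : public IoMgrBase {
 public:
  void init_io() override {}
  void update_kv_io(int64_t /*cur_token*/, int64_t /*pos*/) override {
    // write new K/V entries in place and unmask one attention slot
  }
};

class ShiftPointerIoMgr : public IoMgrBase {
 public:
  void init_io() override {}
  void update_kv_io(int64_t /*cur_token*/, int64_t /*pos*/) override {
    // advance cache data pointers and re-bind graph outputs
  }
};

// Mirrors the string dispatch in Runner::load().
std::unique_ptr<IoMgrBase> make_io_mgr(const std::string& kv_updator) {
  if (kv_updator == "SmartMask") {
    return std::make_unique<SmartMaskIoMgr>();
  }
  if (kv_updator == "ShiftPointer") {
    return std::make_unique<ShiftPointerIoMgr>();
  }
  throw std::invalid_argument("unknown kv updator: " + kv_updator);
}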