diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp
index 12968fa28c9..99dee7c9a7b 100644
--- a/backends/qualcomm/runtime/SharedBuffer.cpp
+++ b/backends/qualcomm/runtime/SharedBuffer.cpp
@@ -22,7 +22,7 @@ std::size_t std::hash::operator()(
   hash_val ^= std::hash()(info.pos);
   hash_val ^= std::hash()(info.tensor_bytes);
   for (int i = 0; i < info.rank; ++i) {
-    hash_val ^= info.shape[i];
+    hash_val ^= std::hash()(info.shape[i]);
   }
   hash_val ^= std::hash()(info.rank);
   hash_val ^= std::hash()(info.dtype);
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
index 29e6686740b..f488800441b 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
+++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -78,7 +78,9 @@ std::unique_ptr QnnBackendFactory::Create(
           options->soc_info(),
           htp_options);
       backend_params->qnn_mem_manager_ptr_ = std::make_unique(
-          implementation, backend_params->qnn_context_ptr_.get());
+          implementation,
+          backend_params->qnn_context_ptr_.get(),
+          options->log_level());
       backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED;
     } break;
     case QnnExecuTorchBackendType::kGpuBackend:
diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.cpp b/backends/qualcomm/runtime/backends/QnnMemManager.cpp
index 5f13e9b3ba6..e09d071075b 100644
--- a/backends/qualcomm/runtime/backends/QnnMemManager.cpp
+++ b/backends/qualcomm/runtime/backends/QnnMemManager.cpp
@@ -47,9 +47,12 @@ Error QnnMemManager::RegisterIonMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to ION shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to ION shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
+
   return Error::Ok;
 }
@@ -92,9 +95,11 @@ Error QnnMemManager::RegisterCustomMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to custom shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to custom shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
   return Error::Ok;
 }
diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h
index a0bdafab7b5..30bb64d78ad 100644
--- a/backends/qualcomm/runtime/backends/QnnMemManager.h
+++ b/backends/qualcomm/runtime/backends/QnnMemManager.h
@@ -21,8 +21,11 @@ class QnnMemManager {
  public:
   explicit QnnMemManager(
       const QnnImplementation& implementation,
-      QnnContext* context)
-      : implementation_(implementation), context_(context) {}
+      QnnContext* context,
+      QnnExecuTorchLogLevel log_level)
+      : implementation_(implementation),
+        context_(context),
+        log_level_(log_level) {}
   ~QnnMemManager() {
     DeRegisterMem();
   }
@@ -63,6 +66,7 @@ class QnnMemManager {
   const QnnImplementation& implementation_;
   QnnContext* context_;
+  QnnExecuTorchLogLevel log_level_;
   std::unordered_map registered_map_;
   std::unordered_map pre_registered_handles_;
   std::unordered_map
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 7d097fd45bf..20a9479897b 100644
---
a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3463,7 +3463,7 @@ def test_llama3_2_1b(self): if self.pre_gen_pte: cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) - golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + golden_start_with = "<|start_header_id|>user<|end_header_id|>" p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: conn = listener.accept() diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 4d4f1c2e39d..a691cda44d3 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -28,8 +28,18 @@ list( ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h - ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.h + ${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.h + ${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.h + ${CMAKE_CURRENT_LIST_DIR}/runner/imem_alloc.h + ${CMAKE_CURRENT_LIST_DIR}/runner/client_mem.h + ${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.h + ${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.h ) list( @@ -42,7 +52,7 @@ list( # build qnn llama runner add_executable(qnn_llama_runner ${_llama_runner__srcs}) target_include_directories( - qnn_llama_runner PUBLIC ${_common_include_directories} + qnn_llama_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include ) target_link_options_shared_lib(quantized_ops_lib) diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 375edf9fb6c..86de35a4c99 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -403,7 +403,7 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): logging.info("Quantizing the model...") calibrate( self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), - args.prompt, + args.prompt[0], fx_graph_module, tokenizer=tokenizer, ar_len=self.llama_meta["get_ar_len"], @@ -828,7 +828,7 @@ def permute(w, heads): return quant_attrs -def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): +def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" if args.model_mode == "kv": @@ -854,14 +854,13 @@ def post_process(): outputs.append(f.read()) seq_len = args.max_seq_len + multi_prompts = " ".join([f'--prompt "{prompt}"' for prompt in args.prompt]) runner_args = " ".join( [ - f'--prompt "{args.prompt}"', + multi_prompts, f"--eval_mode {eval_mode}", f"--temperature {args.temperature}", f"--system_prompt '{args.system_prompt}'", - f"--logits_scale {quant_attrs['scale']}", - f"--logits_offset {quant_attrs['zero_point']}", ] ) @@ -1004,9 +1003,10 @@ def _build_parser(): parser.add_argument( "--prompt", - help="User prompts for llama.", + help="User prompts for Llama. 
When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.", required=True, type=str, + nargs="+", ) parser.add_argument( @@ -1090,7 +1090,7 @@ def _build_parser(): def export_llama(args) -> None: if args.compile_only and args.pre_gen_pte: - exit("Cannot set both compile_only and pre_gen_pte as true") + raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") if args.model_mode == "kv": pte_filename = "kv_llama_qnn" @@ -1126,29 +1126,15 @@ def export_llama(args) -> None: elif args.kv_updater == "shift_pointer": args.kv_updater = shift_pointer_updater else: - exit(f"Using an unkown kv update {args.kv_updater}") + raise RuntimeError(f"Using an unknown kv update {args.kv_updater}") if args.pre_gen_pte: - quant_attrs = json.load( - open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt") - ) - inference( - args, quant_attrs, pte_filename, runtime_tokenizer_path, args.pre_gen_pte - ) - exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") + inference(args, pte_filename, runtime_tokenizer_path, args.pre_gen_pte) + print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") + return if args.compile_only: - quant_attrs = compile(args, pte_filename, tokenizer) - if quant_attrs: - json.dump( - { - "scale": quant_attrs["scale"], - "zero_point": quant_attrs["zero_point"], - }, - open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"), - ) - else: - logging.warning("Quant attributes of the logit is None.") + compile(args, pte_filename, tokenizer) if args.ip and args.port != -1: pte_path = f"{args.artifact}/{pte_filename}.pte" @@ -1161,24 +1147,18 @@ def export_llama(args) -> None: } ) ) - exit(f"Finish compile_only and save to {args.artifact}") + print(f"Finish compile_only and save to {args.artifact}") + return + + compile(args, pte_filename, tokenizer) + inference(args, pte_filename, runtime_tokenizer_path) + +def main(): + parser = _build_parser() + args = parser.parse_args() try: - quant_attrs = compile(args, pte_filename, tokenizer) - if quant_attrs: - logging.info( - f"Logit scale: {quant_attrs['scale']}; Logit offset: {quant_attrs['zero_point']}" - ) - json.dump( - { - "scale": quant_attrs["scale"], - "zero_point": quant_attrs["zero_point"], - }, - open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"), - ) - else: - logging.warning("Quant attributes of the logit is None.") - inference(args, quant_attrs, pte_filename, runtime_tokenizer_path) + export_llama(args) except Exception as e: if args.ip and args.port != -1: with Client((args.ip, args.port)) as conn: @@ -1187,12 +1167,6 @@ def export_llama(args) -> None: raise Exception(e) -def main(): - parser = _build_parser() - args = parser.parse_args() - export_llama(args) - - # flake8: noqa: C901 if __name__ == "__main__": main() diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp index f23cf2ec44a..938d298d077 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp @@ -34,7 +34,10 @@ DEFINE_string( "inference_speed.txt", "Records inference speed. For CI purpose."); DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); -DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); +DEFINE_string( + prompt, + "The answer to the ultimate question is", + "User prompts for Llama. 
When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only."); DEFINE_string( system_prompt, "", @@ -49,10 +52,8 @@ DEFINE_int32( "Total number of tokens to generate (prompt + output)."); DEFINE_int32( eval_mode, - 1, + 0, "0: TokenGenerator(kv) / 1: HybridMode (prefill+kv)"); -DEFINE_double(logits_scale, 0.0, "Logits scale"); -DEFINE_int32(logits_offset, 0, "Logits offset"); DEFINE_string( kv_updater, "How to update kv cache. Choose between SmartMask and ShiftPointer", @@ -72,20 +73,46 @@ std::vector CollectPrompts(int argc, char** argv) { return prompts; } +std::string get_formatted_prompt( + const std::string& prompt, + const std::string& system_prompt, + example::LlamaVersion llama_version) { + std::string formatted_prompt; + switch (llama_version) { + case example::LlamaVersion::kLlama2: + formatted_prompt.append(prompt); + break; + case example::LlamaVersion::kLlama3: + if (!system_prompt.empty()) { + formatted_prompt.append( + "<|start_header_id|>system<|end_header_id|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|eot_id|>"); + } + formatted_prompt.append("<|start_header_id|>user<|end_header_id|>\n\n"); + formatted_prompt.append(prompt); + formatted_prompt.append( + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + break; + default: + ET_CHECK_MSG(false, "unsupported llama version"); + break; + } + return formatted_prompt; +} + int main(int argc, char** argv) { std::vector prompts = CollectPrompts(argc, argv); gflags::ParseCommandLineFlags(&argc, &argv, true); // create llama runner example::Runner runner( - {FLAGS_model_path}, + FLAGS_model_path.c_str(), FLAGS_tokenizer_path.c_str(), FLAGS_performance_output_path.c_str(), - FLAGS_logits_scale, - FLAGS_logits_offset, FLAGS_temperature, FLAGS_eval_mode, - FLAGS_kv_updater, - FLAGS_num_iters); + FLAGS_kv_updater); + auto llama_version = runner.get_llama_version(); std::vector buf; buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char std::ofstream fout(FLAGS_output_path.c_str()); @@ -97,8 +124,10 @@ int main(int argc, char** argv) { // generate tokens & store inference output for (int i = 0; i < FLAGS_num_iters; i++) { for (const auto& prompt : prompts) { - runner.generate( - FLAGS_seq_len, prompt.c_str(), FLAGS_system_prompt.c_str(), callback); + std::string formatted_prompt; + formatted_prompt = get_formatted_prompt( + prompt, FLAGS_system_prompt, llama_version.get()); + runner.generate(formatted_prompt.c_str(), FLAGS_seq_len, callback); } } fout.write(buf.data(), buf.size()); diff --git a/examples/qualcomm/oss_scripts/llama/runner/client_mem.h b/examples/qualcomm/oss_scripts/llama/runner/client_mem.h new file mode 100644 index 00000000000..0fd535796de --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/client_mem.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace example { +/** + * @class ClientMem + * @brief Final class for client buffer allocation, implementing IBufferAlloc + * interface. Used for SHIFT_POINTER mode. 
+ */ +class ClientMem final : public IMemAlloc { + public: + ClientMem(){}; + // Disable copy constructors, r-value referencing, etc + ClientMem(const ClientMem&) = delete; + ClientMem& operator=(const ClientMem&) = delete; + ClientMem(ClientMem&&) = delete; + ClientMem& operator=(ClientMem&&) = delete; + virtual ~ClientMem(){}; + /** + * @brief Allocate buffer of specified size with vector. + * @param data_size Size of the data to allocate. + * @return Pointer to the allocated buffer. + */ + std::byte* allocate(size_t data_size) override { + allocated_buffers_.push_back(std::vector(data_size)); + return allocated_buffers_.back().data(); + }; + // Only used for SMART_MASK mode + void add_memory_info( + void* data_ptr, + size_t data_size, + executorch::runtime::TensorInfo tensor_info) override {}; + + private: + std::vector> allocated_buffers_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.cpp new file mode 100644 index 00000000000..ec5d9746daa --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Given inputs, run a text decoder and return logits. + +#include + +#include +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::extension::llm::Sampler; +using executorch::llm::kTopp; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; + +namespace example { + +DecoderRunner::DecoderRunner( + Module* module, + int32_t vocab_size, + float temperature) + : module_(module), + sampler_(std::make_unique( + vocab_size, + temperature, + kTopp, + static_cast(std::time(nullptr)))) {} + +Error DecoderRunner::set_outputs( + const std::string& method_name, + std::vector output_values) { + for (size_t i = 0; i < output_values.size(); ++i) { + ET_CHECK_OK_OR_RETURN_ERROR( + module_->set_output(method_name, output_values[i], i)); + } + return Error::Ok; +} + +Error DecoderRunner::load(const std::vector& method_names) { + if (is_method_loaded(method_names)) { + return Error::Ok; + } + for (const std::string& method_name : method_names) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(method_name)); + } + return Error::Ok; +} + +bool DecoderRunner::is_method_loaded( + const std::vector& method_names) { + bool method_loaded = true; + for (const std::string& method_name : method_names) { + method_loaded &= module_->is_method_loaded(method_name); + } + return method_loaded; +} + +// This function is functional, meaning it shouldn't modify any state of the +// input. It should be safe to call multiple times with the same inputs. The +// outer loop (call site) is responsible for managing state. 
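[Editorial note, not part of the patch] The comment above describes a "functional step, stateful caller" split. Below is a minimal, self-contained sketch of that calling pattern under assumed names (step, the greedy argmax loop); it is not the ExecuTorch API, only an illustration of why keeping step() pure lets the outer loop own every piece of mutable state.

// Standalone sketch: the step function is pure; the caller owns the state.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

using Logits = std::vector<float>;

// Stateless "decode step": same inputs always yield the same logits.
Logits step(const std::vector<int32_t>& tokens) {
  Logits logits(8, 0.0f);
  logits[tokens.back() % 8] = 1.0f; // stand-in for real model inference
  return logits;
}

int main() {
  std::vector<int32_t> tokens = {1}; // all mutable state lives at the call site
  for (int pos = 0; pos < 4; ++pos) {
    Logits logits = step(tokens);
    int32_t next = static_cast<int32_t>(
        std::max_element(logits.begin(), logits.end()) - logits.begin());
    tokens.push_back(next); // state update happens outside step()
  }
  for (int32_t t : tokens) std::cout << t << ' ';
  std::cout << '\n';
  return 0;
}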
+Result DecoderRunner::step( + const std::string& method_name, + std::vector& inputs) { + Result> outputs_res = + module_->execute(method_name, inputs); + ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); + ET_CHECK_MSG( + outputs_res.get()[0].isTensor(), + "Non Tensor Output returned from executing LLM"); + + // Return the logits tensor + return outputs_res.get()[0].toTensor(); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h new file mode 100644 index 00000000000..888e9acd421 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace example { +class DecoderRunner { + public: + DecoderRunner( + executorch::extension::Module* module, + int32_t vocab_size, + float temperature); + /** + * Run LLM text decoder with inputs to generate next token. + * @param inputs The inputs to the LLM Module. + * @return The output of the LLM Module. This will be a tensor of logits. + */ + executorch::runtime::Result step( + const std::string& method_name, + std::vector& inputs); + + /** + * Once KV Cache output data pointer change, need to set + * the output for specify method name in the module. + * @return The error code. + */ + executorch::runtime::Error set_outputs( + const std::string& method_name, + std::vector output_values); + + /** + * Load the Module for text decode purpose. + * @return The error code. + */ + executorch::runtime::Error load(const std::vector& method_names); + /** + * Check if the required methods in the Module is loaded. + * @return True if the Module is loaded, false otherwise. + */ + bool is_method_loaded(const std::vector& method_names); + + /** + * Sample the next token from the logits tensor. + * @param logits_tensor The logits tensor. + * @return The next token. + */ + inline int32_t logits_to_token( + const executorch::aten::Tensor& logits_tensor, + int64_t pos) { + auto* logits = logits_tensor.mutable_data_ptr(); + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + static std::vector logits_f(vocab_size); + auto* logits_last = logits; + // offset to the meaningful logit we want for prefill model. + if (num_tokens > 1) { + logits_last += pos * vocab_size; + } + // Discard dequantization (converting uint16_t to float) because the + // relative order of elements remains the same without conversion + for (int i = 0; i < vocab_size; i++) { + logits_f[i] = logits_last[i]; + } + return sampler_->sample(logits_f.data()); + } + + protected: + executorch::extension::Module* module_; + std::unique_ptr sampler_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h b/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h new file mode 100644 index 00000000000..59680256a29 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once +#include +#include + +namespace example { +/** + * @class IMemAlloc + * @brief Interface for buffer allocation. + */ +class IMemAlloc { + public: + IMemAlloc(){}; + virtual ~IMemAlloc(){}; + virtual std::byte* allocate(size_t data_size) = 0; + virtual void add_memory_info( + void* data_ptr, + size_t data_size, + executorch::runtime::TensorInfo tensor_info) = 0; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp deleted file mode 100644 index c2bf7b04fbb..00000000000 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp +++ /dev/null @@ -1,1435 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -using executorch::aten::Tensor; -using executorch::aten::TensorImpl; -using executorch::extension::Module; -using executorch::runtime::Error; -using executorch::runtime::MemoryAllocator; -using executorch::runtime::MethodMeta; -using executorch::runtime::Result; -using executorch::runtime::TensorInfo; - -namespace example { - -IoMgrBase::IoMgrBase(std::vector>& modules) - : data_ptr_(nullptr, [](void*) {}), modules_(modules) {} - -IoMgrBase::~IoMgrBase() {} - -void* IoMgrBase::get_mutable_ptr() { - return data_ptr_.get(); -} - -std::vector IoMgrBase::get_input_tensors( - int shard_index, - const std::string& method_name) { - std::vector ret; - ret.reserve(input_tensors_.size()); - for (TensorImpl* impl : input_tensors_[method_name][shard_index]) { - ret.emplace_back(Tensor(impl)); - } - return ret; -} - -std::vector IoMgrBase::get_output_tensors( - int shard_index, - const std::string& method_name) { - std::vector ret; - ret.reserve(output_tensors_[method_name][shard_index].size()); - for (TensorImpl* impl : output_tensors_[method_name][shard_index]) { - ret.emplace_back(Tensor(impl)); - } - return ret; -} - -ShiftPointerIoMgr::ShiftPointerIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token) - : IoMgrBase(modules), - shard_layers_({num_layers}), - context_len_(context_len), - kv_ar_len_(kv_ar_len), - kv_cache_len_(kv_cache_len), - prefill_ar_len_(prefill_ar_len), - prefill_cache_len_(prefill_cache_len), - vocab_size_(vocab_size), - num_layers_(num_layers), - head_dim_(head_dim), - num_heads_(num_heads), - eval_mode_(eval_mode), - prefill_forward_name_(prefill_forward_name), - kv_forward_name_(kv_forward_name), - use_int64_token_(use_int64_token) { - if (!prefill_forward_name_.empty()) { - input_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - output_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[prefill_forward_name_] = - std::vector>(); - v_cache_in_[prefill_forward_name_] = - std::vector>(); - k_cache_out_[prefill_forward_name_] = - std::vector>(); - v_cache_out_[prefill_forward_name_] = - std::vector>(); - } - if (!kv_forward_name_.empty()) { - input_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - output_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - 
k_cache_in_[kv_forward_name_] = - std::vector>(); - v_cache_in_[kv_forward_name_] = - std::vector>(); - k_cache_out_[kv_forward_name_] = - std::vector>(); - v_cache_out_[kv_forward_name_] = - std::vector>(); - } - - data_ptr_ = std::unique_ptr( - new IO, [](void* ptr) { delete static_cast(ptr); }); -} - -void ShiftPointerIoMgr::init_io() { - IO* ptr = static_cast(data_ptr_.get()); - std::memset(ptr, 0, sizeof(IO)); - - int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_); - int32_t k_in_size = (head_dim_ + 1) * kv_cache_len_; - // Use context length to prevent exceeding the range when the AR-N model - // updates the last block in hybrid mode. - int32_t v_cache_size = (num_heads_ + 1) * context_len_ * head_dim_; - int32_t k_cache_out_size = num_heads_ * max_ar_len * head_dim_; - - // Init kv vector shape, general enough to be shared across all modes. - ptr->k_cache_out.reserve(num_layers_); - ptr->v_cache.reserve(num_layers_); - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); - ptr->v_cache.emplace_back(std::vector(v_cache_size)); - } - - auto init_prefill = [&]() { - ptr->prefill_input_toks.resize(prefill_ar_len_, 0); - ptr->prefill_input_pos.resize(prefill_ar_len_, 0); - ptr->prefill_attention_mask.resize((prefill_ar_len_ * context_len_), 0); - ptr->prefill_logits.resize(prefill_ar_len_ * vocab_size_); - }; - - auto init_kv = [&]() { - ptr->kv_logits.resize(kv_ar_len_ * vocab_size_); - ptr->kv_attention_mask.resize((kv_ar_len_ * context_len_), 0); - ptr->k_cache.reserve(num_layers_); - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache.emplace_back(); - ptr->k_cache[layer].reserve(num_heads_); - for (int head = 0; head < num_heads_; head++) { - ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); - } - } - }; - - switch (eval_mode_) { - case EvalMode::kKVCached: - init_kv(); - break; - case EvalMode::kHybrid: - init_prefill(); - init_kv(); - break; - default: - break; - } -} - -void ShiftPointerIoMgr::reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) { - IO* ptr = static_cast(data_ptr_.get()); - std::fill(ptr->prefill_input_pos.begin(), ptr->prefill_input_pos.end(), 0); - ptr->kv_input_pos = 0; - std::fill( - ptr->prefill_attention_mask.begin(), - ptr->prefill_attention_mask.end(), - 0); - std::fill(ptr->kv_attention_mask.begin(), ptr->kv_attention_mask.end(), 0); - - input_tensors_[kv_forward_name_].clear(); - input_tensors_[kv_forward_name_].resize(modules_.size()); - output_tensors_[kv_forward_name_].clear(); - output_tensors_[kv_forward_name_].resize(modules_.size()); - - k_cache_in_[kv_forward_name_].clear(); - v_cache_in_[kv_forward_name_].clear(); - k_cache_out_[kv_forward_name_].clear(); - v_cache_out_[kv_forward_name_].clear(); - - input_tensors_[prefill_forward_name_].clear(); - input_tensors_[prefill_forward_name_].resize(modules_.size()); - output_tensors_[prefill_forward_name_].clear(); - output_tensors_[prefill_forward_name_].resize(modules_.size()); - - k_cache_in_[prefill_forward_name_].clear(); - v_cache_in_[prefill_forward_name_].clear(); - k_cache_out_[prefill_forward_name_].clear(); - v_cache_out_[prefill_forward_name_].clear(); - - switch (eval_mode_) { - case EvalMode::kKVCached: - prepare_kv_io(kv_methods_meta); - break; - case EvalMode::kHybrid: - prepare_prefill_io(prefill_methods_meta); - prepare_kv_io(kv_methods_meta); - break; - default: - ET_CHECK_MSG(false, "unsupported mode"); - break; - } 
-} -void ShiftPointerIoMgr::prepare_kv_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); - IO* ptr = static_cast(data_ptr_.get()); - - // [I]: input_tokens - Result kv_input_toks = methods_meta[0]->input_tensor_meta(0); - kv_input_toks_ = std::make_unique( - kv_input_toks->scalar_type(), - kv_input_toks->sizes().size(), - const_cast(kv_input_toks->sizes().data()), - &ptr->kv_input_toks, - const_cast(kv_input_toks->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get()); - - // [I]: atten_mask - Result kv_attention_mask = methods_meta[0]->input_tensor_meta(1); - kv_attention_mask_ = std::make_unique( - kv_attention_mask->scalar_type(), - kv_attention_mask->sizes().size(), - const_cast(kv_attention_mask->sizes().data()), - ptr->kv_attention_mask.data(), - const_cast( - kv_attention_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get()); - - // [I]: input_pos - Result kv_input_pos = methods_meta[0]->input_tensor_meta(2); - kv_input_pos_ = std::make_unique( - kv_input_pos->scalar_type(), - kv_input_pos->sizes().size(), - const_cast(kv_input_pos->sizes().data()), - &ptr->kv_input_pos, - const_cast(kv_input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get()); - - // [I] kv_cache - int index = 3; // bypass input_tokens, atten_mask, input_pos - for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[kv_forward_name_] - : v_cache_in_[kv_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast(ptr->k_cache[layer + offset][head].data()) - : static_cast( - ptr->v_cache[layer + offset].data() + head * v_stride); - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - input_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - kv_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->kv_logits.data(), - const_cast(logits->dim_order().data())); - output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( - kv_logits_.get()); - - // [O] kv_cache - index = 1; - // Iterate through all kv cache outputs. - // For k, we store it in k_cache_out and update to k_cache later. - // For v, we append the output to the end of v_cache, - // which serves as both input and output. 
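[Editorial note, not part of the patch] A minimal sketch of the shift-pointer V-cache layout the comment above relies on, using hypothetical sizes: each head's write window sits one stride past its read window in the same flat buffer, so "appending" a step's output means advancing both pointers rather than copying the cache.

// Standalone sketch of the shift-pointer V-cache layout (hypothetical sizes).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  constexpr int num_heads = 2, head_dim = 4, cache_len = 3;
  constexpr int stride = cache_len * head_dim;
  // One extra stride so every head's output region follows its input region.
  std::vector<uint8_t> v_cache((num_heads + 1) * stride, 0);

  for (int head = 0; head < num_heads; ++head) {
    uint8_t* in = v_cache.data() + head * stride;       // read window
    uint8_t* out = v_cache.data() + (head + 1) * stride; // write window
    // After one decode step the freshly written row already sits right past
    // the read window, so the "update" is just shifting both pointers.
    in += head_dim;
    out += head_dim;
    std::printf("head %d: in offset %ld, out offset %ld\n", head,
                static_cast<long>(in - v_cache.data()),
                static_cast<long>(out - v_cache.data()));
  }
  return 0;
}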
- for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[kv_forward_name_] - : v_cache_out_[kv_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast( - ptr->k_cache_out[layer + offset].data() + - (head * head_dim_)) - : static_cast( - ptr->v_cache[layer + offset].data() + - (head + 1) * v_stride); - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - output_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void ShiftPointerIoMgr::prepare_prefill_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG( - !(prefill_forward_name_.empty()), "prefill forward name is empty"); - - IO* ptr = static_cast(data_ptr_.get()); - - // [I]: prefill_input_tokens - Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); - prefill_input_toks_ = std::make_unique( - prefill_input_toks->scalar_type(), - prefill_input_toks->sizes().size(), - const_cast(prefill_input_toks->sizes().data()), - ptr->prefill_input_toks.data(), - const_cast( - prefill_input_toks->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); - // [I]: prefill_attention_mask - for (int i = 0; i < prefill_ar_len_; ++i) { - for (int j = 0, - offset = i * context_len_ + (context_len_ - prefill_ar_len_); - j < prefill_ar_len_; - ++j) { - if (i >= j) { - ptr->prefill_attention_mask[j + offset] = 65535; - } - } - } - Result prefill_attention_mask = - methods_meta[0]->input_tensor_meta(1); - prefill_attention_mask_ = std::make_unique( - prefill_attention_mask->scalar_type(), - prefill_attention_mask->sizes().size(), - const_cast( - prefill_attention_mask->sizes().data()), - ptr->prefill_attention_mask.data(), - const_cast( - prefill_attention_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_attention_mask_.get()); - - if (!is_bert()) { - // [I]: prefill_input_pos - Result prefill_input_pos = - methods_meta[0]->input_tensor_meta(2); - prefill_input_pos_ = std::make_unique( - prefill_input_pos->scalar_type(), - prefill_input_pos->sizes().size(), - const_cast(prefill_input_pos->sizes().data()), - ptr->prefill_input_pos.data(), - const_cast( - prefill_input_pos->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_input_pos_.get()); - - // [I] kv_cache - int index = 3; // bypass input_tokens, atten_mask, input_pos - // Add prefill offset to align the v_out pointer with the decode model. 
- for (int offset = 0, - shard_index = 0, - v_stride = kv_cache_len_ * head_dim_, - prefill_offset = (kv_cache_len_ - prefill_cache_len_) * head_dim_; - shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[prefill_forward_name_] - : v_cache_in_[prefill_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast(ptr->k_cache[layer + offset][head].data()) - : static_cast( - ptr->v_cache[layer + offset].data() + head * v_stride + - prefill_offset); - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - input_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - } - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - prefill_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->prefill_logits.data(), - const_cast(logits->dim_order().data())); - output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( - prefill_logits_.get()); - - // [O] kv_cache - int index = 1; - // In hybrid mode, we use kv mode cache len for v stride since we want to - // update prefill's result onto kv modes input. - int32_t prefill_k_stride = prefill_ar_len_ * head_dim_; - int32_t prefill_v_stride = kv_cache_len_ * head_dim_; - - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[prefill_forward_name_] - : v_cache_out_[prefill_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? static_cast( - ptr->k_cache_out[layer + offset].data() + - head * prefill_k_stride) - : static_cast( - ptr->v_cache[layer + offset].data() + - (head + 1) * prefill_v_stride); - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - output_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void ShiftPointerIoMgr::update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - ET_CHECK_MSG(kv_cache_len_ != 0, "k_cache_len_ should not equal to 0"); - IO* ptr = static_cast(data_ptr_.get()); - - ptr->kv_input_toks = - use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->kv_input_pos = static_cast(pos); - // If prompt len is 30, prefill will handle to pos = 30. - // At this point, pos should be 31. 
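[Editorial note, not part of the patch] A small sketch of how the mask update in the loop below behaves, with hypothetical sizes: 65535 marks a position as attendable, and because the shift-pointer cache keeps valid history right-aligned, entries are switched on from the end of the row backwards.

// Standalone sketch of the right-aligned attention-mask update.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  constexpr int kv_cache_len = 8;
  std::vector<uint16_t> mask(kv_cache_len + 1, 0); // one row of the mask
  int64_t pos = 3; // number of tokens already consumed by prefill

  for (int i = 0; i < pos + 1; i++) {
    mask[kv_cache_len - i] = 65535; // enable the history plus the current token
  }
  for (uint16_t m : mask) std::printf("%d ", m ? 1 : 0);
  std::printf("\n"); // prints: 0 0 0 0 0 1 1 1 1
  return 0;
}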
- for (int i = 0; i < pos + 1; i++) { - ptr->kv_attention_mask[kv_cache_len_ - i] = 65535; - } - - // update v_cache - std::vector>& v_cache_in = - v_cache_in_[kv_forward_name_]; - std::vector>& v_cache_out = - v_cache_out_[kv_forward_name_]; - for (int i = 0, v_cache_stride = head_dim_ * pos; i < v_cache_in.size(); - i++) { - v_cache_in[i]->set_data( - v_cache_in[i]->mutable_data() + v_cache_stride); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + v_cache_stride); - } - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - kv_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "Failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - // Update k_cache - std::vector>& k_cache_in = - k_cache_in_[kv_forward_name_]; - std::vector>& k_cache_out = - k_cache_out_[prefill_forward_name_]; - // copy from last to prevent from overwriting values - size_t copied_size = pos * sizeof(uint8_t); - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - if (is_bert()) { - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; - ++j, offset += kv_cache_len_) { - for (int k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - } else { - for (int j = head_dim_; j > -1; --j) { - memcpy( - ptr_in + j * kv_cache_len_, - ptr_in + j * prefill_cache_len_, - copied_size); - } - } - k_cache_in[i]->set_data(ptr_in + pos); - } -} - -void ShiftPointerIoMgr::update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - IO* ptr = static_cast(data_ptr_.get()); - // update input_tok - ptr->kv_input_toks = - use_int64_token_ ? 
cur_token : static_cast(cur_token); - // update position_ids - ptr->kv_input_pos = static_cast(pos); - // update causal mask for next token - ptr->kv_attention_mask[kv_cache_len_ - pos] = 65535; - - // update v_cache - auto& v_cache_in = v_cache_in_[kv_forward_name_]; - auto& v_cache_out = v_cache_out_[kv_forward_name_]; - for (int i = 0; i < v_cache_in.size(); i++) { - v_cache_in[i]->set_data(v_cache_in[i]->mutable_data() + head_dim_); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + head_dim_); - } - - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - kv_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - auto& k_cache_in = k_cache_in_[kv_forward_name_]; - auto& k_cache_out = k_cache_out_[kv_forward_name_]; - // update k_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; - ++j, offset += kv_cache_len_) { - ptr_in[offset] = ptr_out[j]; - } - k_cache_in[i]->set_data(ptr_in + 1); - } -} - -void ShiftPointerIoMgr::update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - (void)cur_token; - (void)output_tensors; - - if (!is_bert()) { - // update v_cache - auto& v_cache_in = v_cache_in_[prefill_forward_name_]; - auto& v_cache_out = v_cache_out_[prefill_forward_name_]; - for (int i = 0; i < v_cache_in.size(); i++) { - v_cache_in[i]->set_data( - v_cache_in[i]->mutable_data() + prefill_ar_len_ * head_dim_); - v_cache_out[i]->set_data( - v_cache_out[i]->mutable_data() + - prefill_ar_len_ * head_dim_); - } - - for (int shard = 0; shard < output_tensors.size(); shard++) { - for (int index = 0; index < output_tensors[shard].size(); index++) { - ET_CHECK_MSG( - modules_[shard]->set_output( - prefill_forward_name_, output_tensors[shard][index], index) == - Error::Ok, - "failed to set output tensor for module %d's %d'th output " - "while updating kv_cache output tensors", - shard, - index); - } - } - - auto& k_cache_in = k_cache_in_[prefill_forward_name_]; - auto& k_cache_out = k_cache_out_[prefill_forward_name_]; - // update k_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = prefill_cache_len_; j < head_dim_; - ++j, offset += prefill_cache_len_) { - for (int k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_; - k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - k_cache_in[i]->set_data(ptr_in + prefill_ar_len_); - } - } -} - -void ShiftPointerIoMgr::fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) { - IO* ptr = static_cast(get_mutable_ptr()); - for (int i = 0; i < prefill_ar_len_; i++) { - if (!is_bert()) { - ptr->prefill_input_pos[i] = start_pos + i; - } - - if (start_pos + i < prompt_tokens.size()) { - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. 
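[Editorial note, not part of the patch] A minimal sketch of the dual-dtype token write the comment above describes, with hypothetical sizes and flag: the same backing buffer is filled as int64 for the CPU 4-bit embedding path, or reinterpreted and packed as int32 for the QNN embedding path.

// Standalone sketch of writing tokens as either int64 or packed int32.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> input_toks(4, 0);            // backing store
  std::vector<uint64_t> prompt_tokens = {11, 22, 33, 44};
  bool use_int64_token = false;                     // hypothetical flag

  for (size_t i = 0; i < prompt_tokens.size(); ++i) {
    if (use_int64_token) {
      input_toks[i] = static_cast<int64_t>(prompt_tokens[i]);
    } else {
      // Pack int32 values into the front of the same buffer.
      int32_t* toks32 = reinterpret_cast<int32_t*>(input_toks.data());
      toks32[i] = static_cast<int32_t>(prompt_tokens[i]);
    }
  }
  std::printf("first word as int32: %d\n",
              reinterpret_cast<int32_t*>(input_toks.data())[0]);
  return 0;
}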
- if (use_int64_token_) { - ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i]; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks.data()); - prefill_input_toks_ptr[i] = - static_cast(prompt_tokens[start_pos + i]); - } - } - if (start_pos >= prefill_ar_len_) { - for (int j = 0, - offset = i * context_len_ + - (context_len_ - prefill_ar_len_ - start_pos); - j < prefill_ar_len_; - ++j) { - ptr->prefill_attention_mask[offset + j] = 65535; - } - } - } -} - -void ShiftPointerIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { - IO* ptr = static_cast(get_mutable_ptr()); - ptr->kv_input_toks = - use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->kv_input_pos = static_cast(pos); - ; - ptr->kv_attention_mask[kv_cache_len_] = 65535; -} - -SmartMaskIoMgr::SmartMaskIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token) - : IoMgrBase(modules), - shard_layers_({num_layers}), - context_len_(context_len), - kv_ar_len_(kv_ar_len), - kv_cache_len_(kv_cache_len), - prefill_ar_len_(prefill_ar_len), - prefill_cache_len_(prefill_cache_len), - vocab_size_(vocab_size), - num_layers_(num_layers), - head_dim_(head_dim), - num_heads_(num_heads), - eval_mode_(eval_mode), - prefill_forward_name_(prefill_forward_name), - kv_forward_name_(kv_forward_name), - use_int64_token_(use_int64_token) { - if (!prefill_forward_name_.empty()) { - input_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - output_tensors_[prefill_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[prefill_forward_name_] = - std::vector>(); - v_cache_in_[prefill_forward_name_] = - std::vector>(); - k_cache_out_[prefill_forward_name_] = - std::vector>(); - v_cache_out_[prefill_forward_name_] = - std::vector>(); - } - if (!kv_forward_name_.empty()) { - input_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - output_tensors_[kv_forward_name_] = - std::vector>(modules.size()); - k_cache_in_[kv_forward_name_] = - std::vector>(); - v_cache_in_[kv_forward_name_] = - std::vector>(); - k_cache_out_[kv_forward_name_] = - std::vector>(); - v_cache_out_[kv_forward_name_] = - std::vector>(); - } - - data_ptr_ = std::unique_ptr( - new IO, [](void* ptr) { delete static_cast(ptr); }); -} - -std::unordered_map SmartMaskIoMgr::get_io_elements() { - int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_); - size_t cache_in_ele = num_layers_ * num_heads_ * head_dim_ * kv_cache_len_; - size_t cache_out_ele = num_layers_ * num_heads_ * head_dim_ * max_ar_len; - return std::unordered_map{ - {"kv_input_toks_ele", kv_ar_len_}, - {"kv_input_pos_ele", kv_ar_len_}, - {"cache_in_ele", cache_in_ele}, - {"cache_out_ele", cache_out_ele}, - {"kv_attention_mask_ele", kv_ar_len_ * context_len_}, - {"kv_logits_ele", kv_ar_len_ * vocab_size_}, - {"prefill_input_toks_ele", prefill_ar_len_}, - {"prefill_input_pos_ele", prefill_ar_len_}, - {"prefill_attention_mask_ele", prefill_ar_len_ * context_len_}, - {"prefill_logits_ele", prefill_ar_len_ * vocab_size_}}; -} - -std::unordered_map SmartMaskIoMgr::get_io_bytes() { - std::unordered_map element_map = get_io_elements(); - auto align = [](size_t byte) { - size_t alignment = MemoryAllocator::kDefaultAlignment; - return byte % 
alignment == 0 ? byte - : byte + - (static_cast(alignment) - - byte % static_cast(alignment)); - }; - return std::unordered_map{ - {"kv_input_toks_bytes", - align(element_map["kv_input_toks_ele"] * sizeof(int32_t))}, - {"kv_input_pos_bytes", - align(element_map["kv_input_pos_ele"] * sizeof(int32_t))}, - {"cache_in_bytes", align(element_map["cache_in_ele"] * sizeof(uint8_t))}, - {"cache_out_bytes", - align(element_map["cache_out_ele"] * sizeof(uint8_t))}, - {"kv_attention_mask_bytes", - align(element_map["kv_attention_mask_ele"] * sizeof(uint16_t))}, - {"kv_logits_bytes", - align(element_map["kv_logits_ele"] * sizeof(uint16_t))}, - {"prefill_input_toks_bytes", - align(element_map["prefill_input_toks_ele"] * sizeof(int32_t))}, - {"prefill_input_pos_bytes", - align(element_map["prefill_input_pos_ele"] * sizeof(int32_t))}, - {"prefill_attention_mask_bytes", - align(element_map["prefill_attention_mask_ele"] * sizeof(uint16_t))}, - {"prefill_logits_bytes", - align(element_map["prefill_logits_ele"] * sizeof(uint16_t))}}; -} - -void SmartMaskIoMgr::IO::init_io_ptrs( - void* shared_buffer_ptr, - std::unordered_map& io_bytes_map) { - shared_buffer_base = shared_buffer_ptr; - std::byte* cur_ptr = reinterpret_cast(shared_buffer_base); - std::size_t cur_pos = 0; - size_t layered_head_count = num_layers_ * num_heads_; - - // Iterate map so that we don't need to care about which mode is used. - for (const auto& iter : io_bytes_map) { - std::string key = iter.first; - size_t size = iter.second; - if (key == "kv_input_toks_bytes") { - kv_input_toks = reinterpret_cast(cur_ptr); - } else if (key == "kv_input_pos_bytes") { - kv_input_pos = reinterpret_cast(cur_ptr); - } else if (key == "cache_in_bytes" || key == "cache_out_bytes") { - auto& k_cache_ref = (key == "cache_in_bytes") ? k_cache : k_cache_out; - auto& v_cache_ref = (key == "cache_in_bytes") ? 
v_cache : v_cache_out; - size_t single_head_size = size / layered_head_count; - k_cache_ref.reserve(num_layers_); - v_cache_ref.reserve(num_layers_); - for (int i = 0; i < num_layers_; ++i) { - k_cache_ref[i].reserve(num_heads_); - v_cache_ref[i].reserve(num_heads_); - for (int j = 0; j < num_heads_; ++j) { - k_cache_ref[i][j] = reinterpret_cast(cur_ptr); - io_pos_map[cur_ptr] = cur_pos; - cur_ptr += single_head_size; - cur_pos += single_head_size; - v_cache_ref[i][j] = reinterpret_cast(cur_ptr); - io_pos_map[cur_ptr] = cur_pos; - cur_ptr += single_head_size; - cur_pos += single_head_size; - } - } - continue; - } else if (key == "kv_attention_mask_bytes") { - kv_attention_mask = reinterpret_cast(cur_ptr); - } else if (key == "kv_logits_bytes") { - kv_logits = reinterpret_cast(cur_ptr); - } else if (key == "prefill_input_toks_bytes") { - prefill_input_toks = reinterpret_cast(cur_ptr); - } else if (key == "prefill_input_pos_bytes") { - prefill_input_pos = reinterpret_cast(cur_ptr); - } else if (key == "prefill_attention_mask_bytes") { - prefill_attention_mask = reinterpret_cast(cur_ptr); - } else if (key == "prefill_logits_bytes") { - prefill_logits = reinterpret_cast(cur_ptr); - } else { - ET_LOG(Error, "Unknown pointer type: %s", key.c_str()); - } - - io_pos_map[cur_ptr] = cur_pos; - cur_ptr += size; - cur_pos += size; - } -} - -void SmartMaskIoMgr::IO::add_custom_mem_info( - void* ptr, - size_t nbytes, - executorch::aten::ScalarType scalar_type, - executorch::runtime::TensorInfo& tensor_info) { - if (auto it = io_pos_map.find(static_cast(ptr)); - it == io_pos_map.end()) { - ET_LOG(Error, "Shared buffer pointer %p is not found", ptr); - } - size_t pos = io_pos_map[static_cast(ptr)]; - uint32_t rank = tensor_info.sizes().size(); - uint32_t shape[rank]; - CustomMemTensorInfo info = { - shared_buffer_base, ptr, pos, nbytes, shape, rank, scalar_type}; - QnnExecuTorchAddCustomMemTensorInfo(info); -} - -void SmartMaskIoMgr::init_io() { - std::unordered_map io_bytes_map = get_io_bytes(); - - switch (eval_mode_) { - case EvalMode::kKVCached: - io_bytes_map.erase("prefill_input_toks_bytes"); - io_bytes_map.erase("prefill_input_pos_bytes"); - io_bytes_map.erase("prefill_attention_mask_bytes"); - io_bytes_map.erase("prefill_logits_bytes"); - break; - case EvalMode::kHybrid: - break; - default: - break; - } - - size_t total_bytes = 0; - for (const auto& iter : io_bytes_map) { - size_t size = iter.second; - if (iter.first == "cache_in_bytes" || iter.first == "cache_out_bytes") { - size = iter.second * 2; - } - total_bytes += size; - } - void* shared_ptr = QnnExecuTorchAllocCustomMem( - total_bytes, MemoryAllocator::kDefaultAlignment); - - ET_CHECK_MSG( - shared_ptr, - "Allocate Rpc mem falied, bytes=%zu, alignment=%zu", - total_bytes, - MemoryAllocator::kDefaultAlignment); - IO* ptr = static_cast(data_ptr_.get()); - ptr->num_heads_ = num_heads_; - ptr->num_layers_ = num_layers_; - ptr->head_dim_ = head_dim_; - ptr->init_io_ptrs(shared_ptr, io_bytes_map); -} - -void SmartMaskIoMgr::reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) { - IO* ptr = static_cast(data_ptr_.get()); - int32_t prefill_attn_size = prefill_ar_len_ * context_len_; - int32_t kv_attn_size = kv_ar_len_ * context_len_; - std::fill( - ptr->prefill_attention_mask, - ptr->prefill_attention_mask + prefill_attn_size, - 0); - std::fill(ptr->kv_attention_mask, ptr->kv_attention_mask + kv_attn_size, 0); -} - -void SmartMaskIoMgr::prepare_kv_io( - const std::vector>& 
methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); - IO* ptr = static_cast(data_ptr_.get()); - std::unordered_map io_bytes_map = get_io_bytes(); - - // [I]: input_tokens - Result kv_input_toks = methods_meta[0]->input_tensor_meta(0); - kv_input_toks_ = std::make_unique( - kv_input_toks->scalar_type(), - kv_input_toks->sizes().size(), - const_cast(kv_input_toks->sizes().data()), - ptr->kv_input_toks, - const_cast(kv_input_toks->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get()); - ptr->add_custom_mem_info( - ptr->kv_input_toks, - io_bytes_map["kv_input_toks_bytes"], - kv_input_toks->scalar_type(), - kv_input_toks.get()); - - // [I]: atten_mask - std::fill_n(ptr->kv_attention_mask, kv_ar_len_ * context_len_, 0); - Result kv_attention_mask = methods_meta[0]->input_tensor_meta(1); - kv_attention_mask_ = std::make_unique( - kv_attention_mask->scalar_type(), - kv_attention_mask->sizes().size(), - const_cast(kv_attention_mask->sizes().data()), - ptr->kv_attention_mask, - const_cast( - kv_attention_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get()); - ptr->add_custom_mem_info( - ptr->kv_attention_mask, - io_bytes_map["kv_attention_mask_bytes"], - kv_attention_mask->scalar_type(), - kv_attention_mask.get()); - - // [I]: input_pos - Result kv_input_pos = methods_meta[0]->input_tensor_meta(2); - kv_input_pos_ = std::make_unique( - kv_input_pos->scalar_type(), - kv_input_pos->sizes().size(), - const_cast(kv_input_pos->sizes().data()), - ptr->kv_input_pos, - const_cast(kv_input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get()); - ptr->add_custom_mem_info( - ptr->kv_input_pos, - io_bytes_map["kv_input_pos_bytes"], - kv_input_pos->scalar_type(), - kv_input_pos.get()); - - // [I] kv_cache - size_t layered_head_count = num_layers_ * num_heads_; - int index = 3; // bypass input_tokens, atten_mask, input_pos - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[kv_forward_name_] - : v_cache_in_[kv_forward_name_]); - uint8_t* cache_ptr = (cache_group == 0) - ? 
ptr->k_cache[layer + offset][head] - : ptr->v_cache[layer + offset][head]; - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_in_bytes"] / layered_head_count, - kv_cache->scalar_type(), - kv_cache.get()); - input_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - - // [O]: logits - int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - kv_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->kv_logits, - const_cast(logits->dim_order().data())); - - ptr->add_custom_mem_info( - ptr->kv_logits, - io_bytes_map["kv_logits_bytes"], - logits->scalar_type(), - logits.get()); - output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( - kv_logits_.get()); - - // [O] kv_cache - index = 1; - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[kv_forward_name_] - : v_cache_out_[kv_forward_name_]); - uint8_t* cache_ptr = (cache_group == 0) - ? ptr->k_cache_out[layer + offset][head] - : ptr->v_cache_out[layer + offset][head]; - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_out_bytes"] / layered_head_count, - kv_cache->scalar_type(), - kv_cache.get()); - output_tensors_[kv_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void SmartMaskIoMgr::update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - IO* ptr = static_cast(data_ptr_.get()); - // update input_tok - *ptr->kv_input_toks = - use_int64_token_ ? 
cur_token : static_cast(cur_token); - // update position_ids - *ptr->kv_input_pos = static_cast(pos); - // update smart mask for previous cache - ptr->kv_attention_mask[pos] = 65535; - - // update v_cache - auto& v_cache_in = v_cache_in_[kv_forward_name_]; - auto& v_cache_out = v_cache_out_[kv_forward_name_]; - // update v_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < v_cache_in.size(); ++i) { - uint8_t* ptr_in = v_cache_in[i]->mutable_data() + pos * head_dim_; - const uint8_t* ptr_out = v_cache_out[i]->data(); - memcpy(ptr_in, ptr_out, head_dim_ * sizeof(uint8_t)); - } - - auto& k_cache_in = k_cache_in_[kv_forward_name_]; - auto& k_cache_out = k_cache_out_[kv_forward_name_]; - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data() + pos; - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = 0; j < head_dim_; - ++j, offset += kv_cache_len_) { - ptr_in[offset] = ptr_out[j]; - } - } -} - -void SmartMaskIoMgr::prepare_prefill_io( - const std::vector>& methods_meta) { - for (int i = 0; i < modules_.size(); ++i) { - ET_CHECK_MSG( - methods_meta[i].ok(), - "Failed to get method_meta 0x%x", - static_cast(methods_meta[i].error())); - } - - ET_CHECK_MSG( - !(prefill_forward_name_.empty()), "prefill forward name is empty"); - - IO* ptr = static_cast(data_ptr_.get()); - std::unordered_map io_bytes_map = get_io_bytes(); - - // [I]: pre_input_tokens - Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); - prefill_input_toks_ = std::make_unique( - prefill_input_toks->scalar_type(), - prefill_input_toks->sizes().size(), - const_cast(prefill_input_toks->sizes().data()), - ptr->prefill_input_toks, - const_cast( - prefill_input_toks->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); - ptr->add_custom_mem_info( - ptr->prefill_input_toks, - io_bytes_map["prefill_input_toks_bytes"], - executorch::aten::ScalarType::Int, - prefill_input_toks.get()); - - // [I]: prefill_attention_mask - for (int i = 0; i < prefill_ar_len_; ++i) { - for (int j = 0, - offset = i * context_len_ + (context_len_ - prefill_ar_len_); - j < prefill_ar_len_; - ++j) { - if (i < j) { - ptr->prefill_attention_mask[j + offset] = 0; - } else { - ptr->prefill_attention_mask[j + offset] = 65535; - } - } - } - Result prefill_attention_mask = - methods_meta[0]->input_tensor_meta(1); - prefill_attention_mask_ = std::make_unique( - prefill_attention_mask->scalar_type(), - prefill_attention_mask->sizes().size(), - const_cast( - prefill_attention_mask->sizes().data()), - ptr->prefill_attention_mask, - const_cast( - prefill_attention_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_attention_mask_.get()); - ptr->add_custom_mem_info( - ptr->prefill_attention_mask, - io_bytes_map["prefill_attention_mask_bytes"], - executorch::aten::ScalarType::Bits16, - prefill_attention_mask.get()); - - if (!is_bert()) { - // [I]: prefill_input_pos - Result prefill_input_pos = - methods_meta[0]->input_tensor_meta(2); - prefill_input_pos_ = std::make_unique( - prefill_input_pos->scalar_type(), - prefill_input_pos->sizes().size(), - const_cast(prefill_input_pos->sizes().data()), - ptr->prefill_input_pos, - const_cast( - prefill_input_pos->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back( - prefill_input_pos_.get()); - ptr->add_custom_mem_info( - ptr->prefill_input_pos, - io_bytes_map["prefill_input_pos_bytes"], - 
prefill_input_pos->scalar_type(), - prefill_input_pos.get()); - - // [I] kv_cache - size_t layered_head_count = num_layers_ * num_heads_; - int index = 3; // bypass input_tokens, atten_mask, input_pos - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->input_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_in_[prefill_forward_name_] - : v_cache_in_[prefill_forward_name_]); - uint8_t* cache_ptr = (cache_group == 0) - ? ptr->k_cache[layer + offset][head] - : ptr->v_cache[layer + offset][head]; - - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_in_bytes"] / layered_head_count, - kv_cache->scalar_type(), - kv_cache.get()); - input_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } - } - - // [O]: logits - int logit_index = 0; - Result logits = methods_meta[0]->output_tensor_meta(logit_index); - prefill_logits_ = std::make_unique( - logits->scalar_type(), - logits->sizes().size(), - const_cast(logits->sizes().data()), - ptr->prefill_logits, - const_cast(logits->dim_order().data())); - output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( - prefill_logits_.get()); - ptr->add_custom_mem_info( - ptr->prefill_logits, - io_bytes_map["prefill_logits_bytes"], - executorch::aten::ScalarType::Bits16, - logits.get()); - - // [O] kv_cache - int index = 1; - size_t layered_head_count = num_layers_ * num_heads_; - for (int offset = 0, shard_index = 0; shard_index < modules_.size(); - offset += shard_layers_[shard_index], shard_index++) { - for (int cache_group = 0; cache_group < 2; ++cache_group) { - for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { - for (int head = 0; head < num_heads_; ++head, ++index) { - Result kv_cache = - methods_meta[shard_index]->output_tensor_meta(index); - std::vector>& cache = - (cache_group == 0 ? k_cache_out_[prefill_forward_name_] - : v_cache_out_[prefill_forward_name_]); - void* cache_ptr = (cache_group == 0) - ? ptr->k_cache_out[layer + offset][head] - : ptr->v_cache_out[layer + offset][head]; - cache.emplace_back(std::make_unique( - kv_cache->scalar_type(), - kv_cache->sizes().size(), - const_cast(kv_cache->sizes().data()), - cache_ptr, - const_cast( - kv_cache->dim_order().data()))); - ptr->add_custom_mem_info( - cache_ptr, - io_bytes_map["cache_out_bytes"] / layered_head_count, - executorch::aten::ScalarType::Byte, - kv_cache.get()); - output_tensors_[prefill_forward_name_][shard_index].push_back( - cache.back().get()); - } - } - } - } -} - -void SmartMaskIoMgr::update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - IO* ptr = static_cast(data_ptr_.get()); - - *ptr->kv_input_toks = - use_int64_token_ ? 
cur_token : static_cast(cur_token); - *ptr->kv_input_pos = static_cast(pos); - // pos means the cur_token pos - for (int i = 0; i < pos; i++) { - ptr->kv_attention_mask[i] = 65535; - } - - if (is_bert()) { - // update v_cache - auto& v_cache_in = v_cache_in_[kv_forward_name_]; - auto& v_cache_out = v_cache_out_[prefill_forward_name_]; - // update v_cache by single thread, this part is cpu cache sensitive - size_t copied_size = kv_cache_len_ * head_dim_ * sizeof(uint8_t); - for (int i = 0; i < v_cache_in.size(); ++i) { - uint8_t* ptr_in = v_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = v_cache_out[i]->data(); - memcpy(ptr_in, ptr_out, copied_size); - } - - auto& k_cache_in = k_cache_in_[kv_forward_name_]; - auto& k_cache_out = k_cache_out_[prefill_forward_name_]; - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = 0; j < head_dim_; - ++j, offset += kv_cache_len_) { - for (size_t k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - } - } else { - // Update K is enough, copy from last to prevent from overwriting values - size_t copied_size = pos * sizeof(uint8_t); - for (int l = 0; l < num_layers_; l++) { - for (int h = 0; h < num_heads_; h++) { - uint8_t* k_cache = ptr->k_cache[l][h]; - for (int hd = head_dim_ - 1; hd > -1; hd--) { - memcpy( - k_cache + (kv_cache_len_ * hd), - k_cache + (prefill_cache_len_ * hd), - copied_size); - } - } - } - } -} - -void SmartMaskIoMgr::update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) { - (void)output_tensors; - - if (!is_bert()) { - // update v_cache - auto& v_cache_in = v_cache_in_[prefill_forward_name_]; - auto& v_cache_out = v_cache_out_[prefill_forward_name_]; - // update v_cache by single thread, this part is cpu cache sensitive - size_t copied_size = prefill_ar_len_ * head_dim_ * sizeof(uint8_t); - for (int i = 0; i < v_cache_in.size(); ++i) { - uint8_t* ptr_in = - v_cache_in[i]->mutable_data() + pos * head_dim_; - const uint8_t* ptr_out = v_cache_out[i]->data(); - memcpy(ptr_in, ptr_out, copied_size); - } - - auto& k_cache_in = k_cache_in_[prefill_forward_name_]; - auto& k_cache_out = k_cache_out_[prefill_forward_name_]; - for (int i = 0; i < k_cache_in.size(); ++i) { - uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = pos; j < head_dim_; - ++j, offset += prefill_cache_len_) { - for (size_t k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_; - k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; - } - } - } - } -} - -void SmartMaskIoMgr::fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) { - IO* ptr = static_cast(get_mutable_ptr()); - for (int i = 0; i < prefill_ar_len_; i++) { - if (!is_bert()) { - ptr->prefill_input_pos[i] = start_pos + i; - } - - if (start_pos + i < prompt_tokens.size()) { - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. 
- if (use_int64_token_) { - ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i]; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks); - prefill_input_toks_ptr[i] = - static_cast(prompt_tokens[start_pos + i]); - } - } - if (start_pos >= prefill_ar_len_) { - for (int j = 0, offset = i * context_len_ + (start_pos - prefill_ar_len_); - j < prefill_ar_len_; - ++j) { - ptr->prefill_attention_mask[offset + j] = 65535; - } - } - } -} - -void SmartMaskIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { - IO* ptr = static_cast(get_mutable_ptr()); - *ptr->kv_input_toks = - use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->kv_attention_mask[kv_cache_len_] = 65535; -} - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h deleted file mode 100644 index 0f10eef8ddc..00000000000 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace example { - -enum EvalMode { - kKVCached = 0, - kHybrid, - kUnsupported, -}; -class IoMgrBase { - public: - IoMgrBase( - std::vector>& modules); - virtual ~IoMgrBase(); - virtual void init_io() = 0; - virtual void reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) = 0; - virtual void prepare_prefill_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) = 0; - virtual void prepare_kv_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) = 0; - virtual void fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) = 0; - virtual void fill_kv_tok_mask(int64_t pos, int64_t cur_token) = 0; - virtual void update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) = 0; - virtual void update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) = 0; - virtual void update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) = 0; - void* get_mutable_ptr(); - std::vector get_input_tensors( - int shard_index, - const std::string& method_name); - std::vector get_output_tensors( - int shard_index, - const std::string& method_name); - - protected: - std::unique_ptr data_ptr_; - std::unordered_map< - std::string, - std::vector>> - input_tensors_; - std::unordered_map< - std::string, - std::vector>> - output_tensors_; - std::vector> modules_; -}; - -class ShiftPointerIoMgr : public IoMgrBase { - public: - ShiftPointerIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token); - - void init_io() override; - void reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) override; - void prepare_prefill_io( - const std::vector< - executorch::runtime::Result>& - 
methods_meta) override; - void prepare_kv_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) override; - void fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) override; - void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; - void update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - struct IO { - int64_t kv_input_toks; - int32_t kv_input_pos; - std::vector>> k_cache; - std::vector> v_cache; - std::vector> k_cache_out; - std::vector kv_attention_mask; - std::vector kv_logits; - std::vector prefill_input_toks; - std::vector prefill_input_pos; - std::vector prefill_attention_mask; - std::vector prefill_logits; - }; - - private: - // If the cache length is zero, it indicates a BERT model, which does not use - // position ids or KV cache inputs. - bool is_bert() const { - return prefill_cache_len_ == 0; - } - std::unique_ptr kv_input_toks_; - std::unique_ptr kv_input_pos_; - std::unique_ptr kv_attention_mask_; - std::unique_ptr prefill_input_toks_; - std::unique_ptr prefill_input_pos_; - std::unique_ptr prefill_attention_mask_; - std::unique_ptr prefill_logits_; - std::unordered_map< - std::string, - std::vector>> - k_cache_in_; - std::unordered_map< - std::string, - std::vector>> - v_cache_in_; - std::unordered_map< - std::string, - std::vector>> - k_cache_out_; - std::unordered_map< - std::string, - std::vector>> - v_cache_out_; - std::unique_ptr kv_logits_; - std::vector shard_layers_; - int32_t context_len_{0}; - int32_t kv_ar_len_{0}; - int32_t kv_cache_len_{0}; - int32_t prefill_ar_len_{0}; - int32_t prefill_cache_len_{0}; - int32_t vocab_size_; - int32_t num_layers_; - int32_t head_dim_; - int32_t num_heads_; - EvalMode eval_mode_; - std::string prefill_forward_name_; - std::string kv_forward_name_; - const bool use_int64_token_{false}; -}; - -class SmartMaskIoMgr : public IoMgrBase { - public: - SmartMaskIoMgr( - std::vector>& modules, - int32_t context_len, - int32_t prefill_ar_len, - int32_t prefill_cache_len, - int32_t kv_ar_len, - int32_t kv_cache_len, - int32_t vocab_size, - int32_t num_layers, - int32_t head_dim, - int32_t num_heads, - EvalMode eval_mode, - const std::string& prefill_forward_name, - const std::string& kv_forward_name, - const bool use_int64_token); - - void init_io() override; - void reset_io( - const std::vector>& prefill_methods_meta, - const std::vector< - executorch::runtime::Result>& - kv_methods_meta) override; - void prepare_prefill_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) override; - void prepare_kv_io( - const std::vector< - executorch::runtime::Result>& - methods_meta) override; - void fill_prefill_toks( - int64_t start_pos, - std::vector& prompt_tokens) override; - void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; - void update_prefill_to_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_kv_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - void update_prefill_io( - int64_t cur_token, - int64_t pos, - std::vector>& output_tensors) - override; - - std::unordered_map get_io_elements(); - std::unordered_map get_io_bytes(); - - struct IO { - void* shared_buffer_base; - int64_t* kv_input_toks; - int32_t* kv_input_pos; - 
// layer -> head -> head_dim * seq_len - std::vector> k_cache; - std::vector> v_cache; - // layer -> head -> head_dim - std::vector> k_cache_out; - std::vector> v_cache_out; - // kv_ar_len_ * context_len_ - uint16_t* kv_attention_mask; - // kv_ar_len_ * vocab_size - uint16_t* kv_logits; - // prefill_ar_len_ - int64_t* prefill_input_toks; - int32_t* prefill_input_pos; - // prefill_ar_len_ * context_len_ - uint16_t* prefill_attention_mask; - // vocab_size * prefill_ar_len_ - uint16_t* prefill_logits; - - size_t num_layers_; - size_t num_heads_; - size_t head_dim_; - std::unordered_map io_pos_map; - ~IO() { - QnnExecuTorchFreeCustomMem(shared_buffer_base); - } - void init_io_ptrs( - void* shared_buffer_ptr, - std::unordered_map& io_bytes_map); - void add_custom_mem_info( - void* ptr, - size_t nbytes, - executorch::aten::ScalarType scalar_type, - executorch::runtime::TensorInfo& tensor_info); - }; - - private: - // If the cache length is zero, it indicates a BERT model, which does not use - // position ids or KV cache inputs. - bool is_bert() const { - return prefill_cache_len_ == 0; - } - std::unique_ptr kv_input_toks_; - std::unique_ptr kv_input_pos_; - std::unique_ptr kv_attention_mask_; - std::unique_ptr prefill_input_toks_; - std::unique_ptr prefill_input_pos_; - std::unique_ptr prefill_attention_mask_; - std::unique_ptr prefill_logits_; - std::unordered_map< - std::string, - std::vector>> - k_cache_in_; - std::unordered_map< - std::string, - std::vector>> - v_cache_in_; - std::unordered_map< - std::string, - std::vector>> - k_cache_out_; - std::unordered_map< - std::string, - std::vector>> - v_cache_out_; - std::unique_ptr kv_logits_; - std::vector shard_layers_; - int32_t context_len_{0}; - int32_t kv_ar_len_{0}; - int32_t kv_cache_len_{0}; - int32_t prefill_ar_len_{0}; - int32_t prefill_cache_len_{0}; - int32_t vocab_size_; - int32_t num_layers_; - int32_t head_dim_; - int32_t num_heads_; - EvalMode eval_mode_; - std::string prefill_forward_name_; - std::string kv_forward_name_; - const bool use_int64_token_{false}; -}; - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp new file mode 100644 index 00000000000..ca155204dee --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +namespace example { +KVManager::KVManager(KVManagerMode kv_updater, Metadata metadata) + : kv_updater_(kv_updater), metadata_(metadata) { + k_cache_.resize( + metadata_.num_layers, std::vector(metadata_.num_heads)); + v_cache_.resize( + metadata_.num_layers, std::vector(metadata_.num_heads)); + + // Calculate cache size + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { + size_t cache_in_bytes = metadata_.num_layers * metadata_.num_heads * + metadata_.head_dim * metadata_.max_cache_len * sizeof(uint8_t); + size_t cache_out_bytes = metadata_.num_layers * metadata_.num_heads * + metadata_.head_dim * metadata_.max_ar_len * sizeof(uint8_t); + total_cache_size_ = 2 * (cache_in_bytes + cache_out_bytes); + break; + } + case KVManagerMode::SHIFT_POINTER: { + size_t k_cache_in_bytes = metadata_.num_layers * metadata_.num_heads * + (metadata_.head_dim + 1) * metadata_.max_cache_len * sizeof(uint8_t); + size_t k_cache_out_bytes = metadata_.num_layers * metadata_.num_heads * + metadata_.head_dim * metadata_.max_ar_len * sizeof(uint8_t); + // Use the same memory for input and output of value cache in shift + // pointer mode. Note that using context length to prevent exceeding the + // range when the AR-N model updates the last block in shift pointer + // mode. + size_t v_cache_bytes = metadata_.num_layers * (metadata_.num_heads + 1) * + metadata_.head_dim * metadata_.context_len * sizeof(uint8_t); + total_cache_size_ = k_cache_in_bytes + k_cache_out_bytes + v_cache_bytes; + break; + } + default: + break; + } +}; + +void KVManager::init_attention_mask( + uint16_t* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past) { + ET_CHECK_MSG( + attention_map.size() == ar_len, + "The size of attention_map (%zu) doesn't match with ar_len (%d)", + attention_map.size(), + ar_len); + uint16_t neg_val = 0; + uint16_t pos_val = 65535; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { + uint16_t* past_ptr = attention_mask; + uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len + if (attention_map[i] < 0) { + // If negative, attend to only past tokens + std::fill_n(past_ptr, n_past, pos_val); + } else { + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; + uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::memcpy( + past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); + } + // Attend to itself + new_ptr[i] = pos_val; + past_ptr += metadata_.context_len; + new_ptr += metadata_.context_len; + } + break; + } + case KVManagerMode::SHIFT_POINTER: { + // Only fill in ar_len. 
Rest will be padding + const size_t attn_row_start = metadata_.context_len - n_past - ar_len; + for (int i = 0; i < ar_len; i++) { + uint16_t* cur_ptr = + attention_mask + i * metadata_.context_len + attn_row_start; + // Attend to itself + cur_ptr[n_past + i] = pos_val; + if (attention_map[i] < 0) { + // If negative, attend to only past tokens + std::fill_n(cur_ptr, n_past, pos_val); + } else { + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; + uint16_t* parent_ptr = + attention_mask + pidx * metadata_.context_len + attn_row_start; + std::memcpy( + cur_ptr, parent_ptr, (n_past + pidx + 1) * sizeof(uint16_t)); + } + } + break; + } + default: + break; + } +} + +void KVManager::update_attention_mask( + uint16_t* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update) { + uint16_t pos_val = 65535; + uint16_t* cur_ptr = attention_mask; + if (kv_updater_ == KVManagerMode::SMART_MASK) + cur_ptr += n_past; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) + cur_ptr += metadata_.context_len - n_past - ar_len - n_update; + + for (int i = 0; i < ar_len; i++) { + std::fill_n(cur_ptr, n_update, pos_val); + cur_ptr += metadata_.context_len; + } +} + +void KVManager::init_cache(IMemAlloc* buffer_manager, int32_t ar_len) { + cur_ar_len_ = ar_len; + const size_t max_in_cache_block_in_bytes = + metadata_.max_cache_len * sizeof(uint8_t); + const size_t max_out_cache_block_in_bytes = + metadata_.max_ar_len * sizeof(uint8_t); + + switch (kv_updater_) { + case KVManagerMode::SMART_MASK: { + const size_t cache_in_bytes = + metadata_.head_dim * max_in_cache_block_in_bytes; + const size_t cache_out_bytes = + metadata_.head_dim * max_out_cache_block_in_bytes; + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + // Allocate buffer for key cache and value cache + uint8_t* single_layer_k_cache_in = reinterpret_cast( + buffer_manager->allocate(cache_in_bytes)); + uint8_t* single_layer_k_cache_out = reinterpret_cast( + buffer_manager->allocate(cache_out_bytes)); + uint8_t* single_layer_v_cache_in = reinterpret_cast( + buffer_manager->allocate(cache_in_bytes)); + uint8_t* single_layer_v_cache_out = reinterpret_cast( + buffer_manager->allocate(cache_out_bytes)); + + k_cache_[layer][head].buffer = single_layer_k_cache_in; + k_cache_[layer][head].output_buffer = single_layer_k_cache_out; + v_cache_[layer][head].buffer = single_layer_v_cache_in; + v_cache_[layer][head].output_buffer = single_layer_v_cache_out; + } + } + break; + } + case KVManagerMode::SHIFT_POINTER: { + const size_t k_cache_in_size_in_bytes = metadata_.num_heads * + (metadata_.head_dim + 1) * max_in_cache_block_in_bytes; + const size_t k_cache_out_size_in_bytes = metadata_.num_heads * + metadata_.head_dim * max_out_cache_block_in_bytes; + const size_t v_cache_size_in_bytes = (metadata_.num_heads + 1) * + metadata_.head_dim * metadata_.context_len * sizeof(uint8_t); + const int32_t single_head_size_in = + metadata_.head_dim * metadata_.max_cache_len; + const int32_t single_head_size_out = + metadata_.head_dim * metadata_.max_ar_len; + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + // Allocate buffer for key cache and value cache + uint8_t* single_layer_k_cache_in = reinterpret_cast( + buffer_manager->allocate(k_cache_in_size_in_bytes)); + uint8_t* single_layer_k_cache_out = reinterpret_cast( + buffer_manager->allocate(k_cache_out_size_in_bytes)); + // Note that using 
context length to prevent exceeding the range when + // the AR-N model updates the last block in shift pointer mode. + uint8_t* single_layer_v_cache = reinterpret_cast( + buffer_manager->allocate(v_cache_size_in_bytes)); + for (int head = 0; head < metadata_.num_heads; ++head) { + k_cache_[layer][head].buffer = single_layer_k_cache_in + + head * (metadata_.head_dim + 1) * metadata_.max_cache_len; + k_cache_[layer][head].output_buffer = + single_layer_k_cache_out + head * single_head_size_out; + // v_cache: + // |cache_gap|h1_v_in_ptr|cache_len|h1_v_out_ptr|cache_gap|h2_v_in_ptr|cache_len|h2_v_out_ptr|...| + const int32_t cache_gap = (cur_ar_len_ == metadata_.context_len) + ? 0 + : metadata_.max_cache_len - (metadata_.context_len - cur_ar_len_); + v_cache_[layer][head].buffer = single_layer_v_cache + + head * single_head_size_in + cache_gap * metadata_.head_dim; + v_cache_[layer][head].output_buffer = + single_layer_v_cache + (head + 1) * single_head_size_in; + } + } + break; + } + default: + break; + } +} + +void KVManager::rearrange_cache(int32_t ar_len_dst) { + // Don't need to rearrange if cur_ar_len_ is equal to target ar_len + if (cur_ar_len_ == ar_len_dst) + return; + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + rearrange_key(k_cache_[layer][head], ar_len_dst); + rearrange_value(v_cache_[layer][head], ar_len_dst); + } + } + // rearrange done. + cur_ar_len_ = ar_len_dst; +} + +void KVManager::rearrange_key(KVCache& k_cache, int32_t ar_len_dst) { + // The output of key cache doesn't need to rearrange for both of SMART_MASK + // and SHIFT_POINTER + const int32_t src_cache_num = (cur_ar_len_ == metadata_.context_len) + ? metadata_.context_len + : metadata_.context_len - cur_ar_len_; + const int32_t dst_cache_num = metadata_.context_len - ar_len_dst; + uint8_t* k_cache_in_read_ptr = k_cache.buffer; + uint8_t* k_cache_in_write_ptr = k_cache.buffer; + + if (src_cache_num > dst_cache_num) { + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) { + // Left padded KV$ + k_cache_in_read_ptr += src_cache_num; + k_cache_in_write_ptr += dst_cache_num; + } + // copy from first dimension + for (int i = 0; i < metadata_.head_dim; i++) { + std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, dst_cache_num); + k_cache_in_read_ptr += src_cache_num; + k_cache_in_write_ptr += dst_cache_num; + } + } else { + k_cache_in_read_ptr += (metadata_.head_dim - 1) * src_cache_num; + k_cache_in_write_ptr += (metadata_.head_dim - 1) * dst_cache_num; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) { + k_cache_in_read_ptr += src_cache_num; + k_cache_in_write_ptr += dst_cache_num; + } + // copy from last dimension + for (int i = 0; i < metadata_.head_dim; i++) { + std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, src_cache_num); + k_cache_in_read_ptr -= src_cache_num; + k_cache_in_write_ptr -= dst_cache_num; + } + } +} + +void KVManager::rearrange_value(KVCache& v_cache, int32_t ar_len_dst) { + // The input and output of the value cache don't need to rearrange for both + // SMART_MASK and SHIFT_POINTER. However, the input pointer of the value cache + // needs to be reset by ar_len_dst in SHIFT_POINTER mode. The output pointer + // of the value cache remains unchanged regardless of ar_len. + const int32_t ar_gap = (cur_ar_len_ == metadata_.context_len) + ? 
ar_len_dst + : ar_len_dst - cur_ar_len_; + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) { + v_cache.buffer = v_cache.buffer + ar_gap * metadata_.head_dim; + } +} + +bool KVManager::update_cache_tensor( + std::vector>>& + k_cache_in, + std::vector>>& + k_cache_out, + std::vector>>& + v_cache_in, + std::vector>>& + v_cache_out, + int32_t ar_len, + int32_t n_past) { + ET_CHECK_MSG( + cur_ar_len_ == ar_len, + "Current AR length (%d) is not matched with target AR length (%d). Please rearrange cache first.", + cur_ar_len_, + ar_len); + bool updated = false; + // Data pointer in the tensors need to update only for SHIFT_POINTER mode + // The BERT model does not update the cache tensor because it does not use KV + // cache inputs. + if (kv_updater_ == KVManagerMode::SHIFT_POINTER && + metadata_.context_len != cur_ar_len_) { + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + k_cache_in[layer][head]->set_data( + k_cache_[layer][head].buffer + n_past); + v_cache_in[layer][head]->set_data( + v_cache_[layer][head].buffer + n_past * metadata_.head_dim); + v_cache_out[layer][head]->set_data( + v_cache_[layer][head].output_buffer + n_past * metadata_.head_dim); + } + } + updated = true; + } + return updated; +} + +void KVManager::update_cache(int32_t ar_len, int32_t n_past, int32_t n_update) { + ET_CHECK_MSG( + cur_ar_len_ == ar_len, + "Current AR length (%d) is not matched with target AR length (%d). Please rearrange cache first.", + cur_ar_len_, + ar_len); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head) { + update_key(k_cache_[layer][head], n_past, n_update); + update_value(v_cache_[layer][head], n_past, n_update); + } + } +} + +void KVManager::update_key(KVCache& k_cache, int32_t n_past, int32_t n_update) { + uint8_t* write_ptr = k_cache.buffer; + uint8_t* read_ptr = k_cache.output_buffer; + const int32_t copy_size = n_update * sizeof(uint8_t); + const int32_t iter_size = (cur_ar_len_ == metadata_.context_len) + ? metadata_.context_len + : metadata_.context_len - cur_ar_len_; + const int32_t out_size = cur_ar_len_; + const int32_t past_size = n_past; + const int32_t n_iter = metadata_.head_dim; + + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) + write_ptr += iter_size + past_size; + if (kv_updater_ == KVManagerMode::SMART_MASK) + write_ptr += past_size; + + for (int i = 0; i < n_iter; ++i) { + std::memcpy(write_ptr, read_ptr, copy_size); + write_ptr += iter_size; + read_ptr += out_size; + } +} + +void KVManager::update_value( + KVCache& v_cache, + int32_t n_past, + int32_t n_update) { + // Value cache doesn't need to copy for SHIFT_POINTER mode + if (kv_updater_ == KVManagerMode::SHIFT_POINTER) + return; + + uint8_t* write_ptr = v_cache.buffer; + uint8_t* read_ptr = v_cache.output_buffer; + const int32_t copy_size = n_update * metadata_.head_dim * sizeof(uint8_t); + const int32_t past_size = n_past * metadata_.head_dim; + + if (kv_updater_ == KVManagerMode::SMART_MASK) + write_ptr += past_size; + + std::memcpy(write_ptr, read_ptr, copy_size); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h new file mode 100644 index 00000000000..1a3beb35f97 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include + +namespace example { + +// Structure to hold key-value cache buffers +struct KVCache { + uint8_t* buffer; + uint8_t* output_buffer; +}; + +// Enumeration for key-value manager modes +enum KVManagerMode { SMART_MASK = 0x0, SHIFT_POINTER = 0x1 }; +/** + * @class KVManager + * @brief Class for kv cache update, rearrangement, and buffer allocatation. + */ +class KVManager { + public: + struct Metadata { + int32_t context_len; + int64_t head_dim; + int32_t max_ar_len; + int32_t max_cache_len; + int64_t num_heads; + int64_t num_layers; + }; + KVManager(KVManagerMode kv_updater, Metadata metadata); + + /** + * @brief Allocate buffer for KV cache and set the cur_ar_len_. + * @param buffer_manager Pointer to IMemAlloc instance which depends on + * kv_updater. + * @param ar_len Length of input tokens. + */ + void init_cache(IMemAlloc* buffer_manager, int32_t ar_len); + + /** + * @brief Switch key and value cache from AR-cur to AR-dst. + * @param ar_len_dst Target length of input tokens. + */ + void rearrange_cache(int32_t ar_len_dst); + + /** + * @brief Initialize attention mask based on kv manager mode, and attention + * map. + * For example, + * ar_len = 4, CL = 6, n_past = 0, + * attention map: {-1, 0, 1, 2} and SMART_MASK. + * Attention_mask will be: + * [ 0 0 65535 0 0 0 ] + * [ 0 0 65535 65535 0 0 ] + * [ 0 0 65535 65535 65535 0 ] + * [ 0 0 65535 65535 65535 65535 ] + * @param attention_mask Pointer to the attention mask array to be + * initialized. + * @param attention_map Vector containing the attention map values. The shape + * of attention map should be [ar_len]. + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + */ + void init_attention_mask( + uint16_t* attention_mask, + const std::vector& attention_map, + int32_t ar_len, + int32_t n_past); + + /** + * @brief Update attention mask based on kv manager mode, and n_update. + * @param attention_mask Pointer to the attention mask array to be + * initialized. + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + * @param n_update Number of elements to be updated. + */ + void update_attention_mask( + uint16_t* attention_mask, + int32_t ar_len, + int32_t n_past, + int32_t n_update); + + /** + * @brief Reset the data pointer of the I/O cache tensor based on number of + * past cache, kv manager mode, current ar length and KV cache data pointer + * for SHIFT_POINTER mode. + * @param k_cache_in Reference to the input key cache TensorImpl vector. + * @param k_cache_out Reference to the output key cache TensorImpl vector. + * @param v_cache_in Reference to the input value cache TensorImpl vector. + * @param v_cache_out Reference to the output value cache TensorImpl vector. + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + * @return Returns true if the data pointer is updated; otherwise, returns + * false. + */ + bool update_cache_tensor( + std::vector>>& + k_cache_in, + std::vector>>& + k_cache_out, + std::vector>>& + v_cache_in, + std::vector>>& + v_cache_out, + int32_t ar_len, + int32_t n_past); + + /** + * @brief Based on cur_ar_len_ to update cache + * @param ar_len Length of input tokens. + * @param n_past Number of past elements in the cache. + * @param n_update Number of elements to be updated. 
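 * Illustration (hypothetical call sequence): after the prompt processor
 * consumes a 32-token chunk with ar_len = 32 into an empty cache it would
 * call update_cache(32, 0, 32); a token generator running with ar_len = 1
 * would then typically call update_cache(1, n_past, 1) after every decoded
 * token.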
+ */ + void update_cache(int32_t ar_len, int32_t n_past, int32_t n_update); + + const std::vector>& get_k_cache_() const { + return k_cache_; + } + const std::vector>& get_v_cache_() const { + return v_cache_; + } + + inline const size_t total_cache_size_in_bytes() const { + return total_cache_size_; + } + + private: + // Helper functions to rearrange and update key and value caches + void rearrange_key(KVCache& k_cache, int32_t ar_len_dst); + void rearrange_value(KVCache& v_cache, int32_t ar_len_dst); + void update_key(KVCache& k_cache, int32_t n_past, int32_t n_update); + void update_value(KVCache& v_cache, int32_t n_past, int32_t n_update); + KVManagerMode kv_updater_; + + // metadata + Metadata metadata_; + size_t total_cache_size_; + int32_t cur_ar_len_; + // Store start pointer of k and v cache for input and output + // input: layer -> head -> head_dim * max_cache_len + // output: layer -> head -> head_dim * max_ar_len + std::vector> k_cache_; + std::vector> v_cache_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp new file mode 100644 index 00000000000..37dce8f06c4 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { +PromptProcessor::PromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata) + : decoder_runner_(decoder_runner), + kv_manager_(kv_manager), + method_name_(method_name), + metadata_(metadata) { + k_cache_in_.resize(metadata_.num_layers); + v_cache_in_.resize(metadata_.num_layers); + k_cache_out_.resize(metadata_.num_layers); + v_cache_out_.resize(metadata_.num_layers); + // Calculate I/O size + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + if (is_bert()) + input_pos_.size = 0; + else + input_pos_.size = metadata_.ar_len * sizeof(int32_t); + attention_mask_.size = + metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); + logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); +}; +void PromptProcessor::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + // [I]: input_tokens + Result input_toks = method_meta->input_tensor_meta(0); + input_toks_.data = + reinterpret_cast(buffer_manager->allocate(input_toks_.size)); + input_toks_.tensor = std::make_unique( + input_toks->scalar_type(), + input_toks->sizes().size(), + const_cast(input_toks->sizes().data()), + input_toks_.data, + const_cast(input_toks->dim_order().data())); + input_tensors_.emplace_back(input_toks_.tensor.get()); + buffer_manager->add_memory_info( + input_toks_.data, input_toks_.size, input_toks.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(1); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + 
const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + if (!is_bert()) { + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(2); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, input_pos.get()); + + // [I] kv_cache + int index = 3; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + uint8_t* cache_ptr = cache_ptrs[layer][head].buffer; + + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + int index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? 
kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + uint8_t* cache_ptr = cache_ptrs[layer][head].output_buffer; + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + output_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +void PromptProcessor::prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos, + int64_t start_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (!is_bert()) { + // Prepare pos data + input_pos_.data[i] = start_pos + i; + } + + // Prepare input token data + if (prompt_pos + i < prompt_tokens.size()) { + // Support CPU 4-bit embedding, which requires int64 input. + // However, for QNN embedding, only int32 input is needed. + // Therefore, we need to cast to the correct type to write the data. + if (metadata_.use_int64_token) { + input_toks_.data[i] = prompt_tokens[prompt_pos + i]; + } else { + int32_t* input_toks_ptr = reinterpret_cast(input_toks_.data); + input_toks_ptr[i] = static_cast(prompt_tokens[prompt_pos + i]); + } + } + } +} + +Result PromptProcessor::prefill( + std::vector prompt_tokens, + int64_t start_pos) { + ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); + + // Calculate number of blocks + int32_t num_prompt_tokens = prompt_tokens.size(); + if (!is_bert()) { + ET_CHECK_MSG( + (start_pos + num_prompt_tokens) <= + (metadata_.context_len - metadata_.ar_len), + "The sequence length exceeds the maximum limit that the prompt processor can handle."); + } else { + ET_CHECK_MSG( + start_pos == 0, "Bert model doesn't support multi-turn conversation."); + } + + // store the token + int64_t cur_token; + int64_t prompt_pos = 0; + int64_t pos = start_pos; + int32_t n_update = metadata_.ar_len; + int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); + ET_LOG( + Info, + "Prompt Processor: total %d prompt tokens (AR-%d * %d iters)", + num_prompt_tokens, + metadata_.ar_len, + num_iters); + + // Rearrange KV cache first + kv_manager_->rearrange_cache(metadata_.ar_len); + std::vector attention_map(metadata_.ar_len); + std::iota(attention_map.begin(), attention_map.end(), -1); + // Initialize attention mask with current position + kv_manager_->init_attention_mask( + attention_mask_.data, attention_map, metadata_.ar_len, pos); + // Initialize the output of the module + ET_CHECK_MSG( + decoder_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + for (int i = 0; i < num_iters; ++i) { + // Fill in the token and position data + prepare_io(prompt_tokens, prompt_pos, pos); + // Only update data pointer of the cache to the tensor for SHIFT_POINTER + // mode + bool updated = kv_manager_->update_cache_tensor( + k_cache_in_, + k_cache_out_, + v_cache_in_, + v_cache_out_, + metadata_.ar_len, + pos); + // Only update the output of module for SHIFT_POINTER mode + if (updated) { + // Update 
the output of the module + ET_CHECK_MSG( + decoder_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + } + // Run inference + decoder_runner_->step(method_name_, inputs_); + // In the last run, offset to the meaningful logits. + if (i == num_iters - 1) { + n_update = 1 + ((num_prompt_tokens - 1) % metadata_.ar_len); + } + // Update KV Cache with the output results + kv_manager_->update_cache(metadata_.ar_len, pos, n_update); + // Update attention mask with current position + kv_manager_->update_attention_mask( + attention_mask_.data, metadata_.ar_len, pos, n_update); + prompt_pos += metadata_.ar_len; + pos += metadata_.ar_len; + } + + cur_token = decoder_runner_->logits_to_token( + output_tensors_[0], + (num_prompt_tokens + metadata_.ar_len - 1) % metadata_.ar_len); + return cur_token; +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h new file mode 100644 index 00000000000..a9991a6c79a --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace example { +/** + * @class PromptProcessor + * @brief Class for processing prompts using decoder and key-value manager. + */ +class PromptProcessor { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + }; + PromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata); + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer. + * @param buffer_manager Pointer to IMemAlloc instance which depends on + * kv_updater. + * @param method_meta Method metadata. + */ + void init_io( + IMemAlloc* buffer_manager, + executorch::runtime::Result method_meta); + + /** + * Prefill an LLM Module with the given text input. + * @param prompt_tokens The text prompt tokens to the LLM Module. Encoded by + * tokenizer. + * @param start_pos The starting position in KV cache of the input in the LLM + * Module. + * @return The next token of the LLM Module after prefill. + */ + executorch::runtime::Result prefill( + std::vector prompt_tokens, + int64_t start_pos); + /** + * @brief Get total I/O size in bytes (excluding the KV cache size) + * @return Total I/O size in bytes. + */ + inline const size_t total_prompt_processor_io_size_in_bytes() const { + return input_toks_.size + input_pos_.size + attention_mask_.size + + logits_.size; + } + + private: + // If the cache length is zero, it indicates a BERT model, which does not use + // position ids or KV cache inputs. + bool is_bert() const { + return metadata_.context_len == metadata_.ar_len; + } + /** + * @brief Fill in I/O buffers with prompt token and position. + * @param prompt_tokens Vector of prompt tokens. + * @param prompt_pos Position of the prompt. + * @param start_pos Starting position. 
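 * Illustration (hypothetical numbers): with ar_len = 32, a 50-token prompt
 * and start_pos = 0, prefill() runs two iterations, calling
 * prepare_io(prompt_tokens, 0, 0) and then prepare_io(prompt_tokens, 32, 32);
 * the second call only fills the first 50 - 32 = 18 token slots.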
+ */ + void prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos, + int64_t start_pos); + DecoderRunner* decoder_runner_; + KVManager* kv_manager_; + std::string method_name_; + + // metadata + Metadata metadata_; + + // inputs and outputs + TensorStruct input_toks_; + TensorStruct input_pos_; + TensorStruct attention_mask_; + TensorStruct logits_; + + // layer -> head -> TensorImpl + std::vector>> + k_cache_in_; + std::vector>> + v_cache_in_; + std::vector>> + k_cache_out_; + std::vector>> + v_cache_out_; + + std::vector inputs_; + std::vector input_tensors_; + std::vector output_tensors_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp new file mode 100644 index 00000000000..f0cc6d9a7a2 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +using executorch::runtime::MemoryAllocator; +using executorch::runtime::TensorInfo; + +namespace example { +RpcMem::RpcMem( + const size_t total_cache_size, + const size_t total_prompt_processor_io_size, + const size_t total_token_generator_io_size) + : calculated_offsets_(0) { + size_t total_bytes = total_cache_size + total_prompt_processor_io_size + + total_token_generator_io_size; + shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem( + total_bytes, MemoryAllocator::kDefaultAlignment); +} +RpcMem::~RpcMem() { + QnnExecuTorchFreeCustomMem(shared_buffer_base_ptr_); +} + +std::byte* RpcMem::allocate(size_t data_size) { + std::byte* data_ptr = static_cast(shared_buffer_base_ptr_); + data_ptr += calculated_offsets_; + // Record the position of the data pointer + io_pos_map_[data_ptr] = calculated_offsets_; + calculated_offsets_ += data_size; + return data_ptr; +} + +void RpcMem::add_memory_info( + void* data_ptr, + size_t data_size, + TensorInfo tensor_info) { + if (auto it = io_pos_map_.find(static_cast(data_ptr)); + it == io_pos_map_.end()) { + ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr); + } + size_t pos = io_pos_map_[static_cast(data_ptr)]; + uint32_t* shape = const_cast( + reinterpret_cast(tensor_info.sizes().data())); + uint32_t rank = static_cast(tensor_info.sizes().size()); + executorch::aten::ScalarType scalar_type = tensor_info.scalar_type(); + CustomMemTensorInfo info = { + shared_buffer_base_ptr_, + data_ptr, + pos, + data_size, + shape, + rank, + scalar_type}; + QnnExecuTorchAddCustomMemTensorInfo(info); +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h new file mode 100644 index 00000000000..d8da945cb96 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include + +namespace example { +/** + * @class RpcMem + * @brief Final class for rpc memory allocation, implementing IMemAlloc + * interface. Used for SMART_MASK mode. 
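 * Rough usage sketch (sizes are illustrative):
 *   RpcMem rpc_mem(cache_bytes, prompt_io_bytes, token_io_bytes);
 *   std::byte* data = rpc_mem.allocate(nbytes);         // sub-allocate from the shared buffer
 *   rpc_mem.add_memory_info(data, nbytes, tensor_info); // register the region with QNN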
+ */ +class RpcMem final : public IMemAlloc { + public: + /** +   * @brief Constructor to allocate RpcMem with total sizes. +   * @param total_cache_size Total size of the cache. +   * @param total_prompt_processor_io_size Total size for prompt processor I/O. +   * @param total_token_generator_io_size Total size for token generator I/O. +   */ + RpcMem( + const size_t total_cache_size, + const size_t total_prompt_processor_io_size, + const size_t total_token_generator_io_size); + // Disable copy constructors, r-value referencing, etc + RpcMem(const RpcMem&) = delete; + RpcMem& operator=(const RpcMem&) = delete; + RpcMem(RpcMem&&) = delete; + RpcMem& operator=(RpcMem&&) = delete; + virtual ~RpcMem(); + /** + * @brief Allocate buffer of specified size with shared_buffer_base_ptr_. + * @param data_size Size of the data to allocate. + * @return Pointer to the allocated buffer. + */ + std::byte* allocate(size_t size) override; + + /** +   * @brief Add memory information into QNN Backend to register RpcMem to the +tensor. + * @param data_ptr Pointer to the data. + * @param data_size Size of the data. + * @param tensor_info Information about the tensor. + */ + void add_memory_info( + void* data_ptr, + size_t data_size, + executorch::runtime::TensorInfo tensor_info) override; + + private: + // shared buffer + void* shared_buffer_base_ptr_; + size_t calculated_offsets_; + std::unordered_map io_pos_map_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index dafc911a172..d348878294a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -10,72 +10,76 @@ // logic. The module takes in a string as input and emits a string as output. #include +#include +#include #include -#include #include #include #include #include #include -#include +#include #include -#include -using executorch::aten::Tensor; using executorch::extension::Module; -using executorch::extension::llm::Sampler; +using executorch::extension::llm::get_rss_bytes; +using executorch::extension::llm::print_report; +using executorch::extension::llm::Stats; using executorch::extension::llm::time_in_ms; using executorch::runtime::Error; -using executorch::runtime::EValue; using executorch::runtime::MethodMeta; using executorch::runtime::Result; namespace example { - namespace { -static constexpr auto kTopp = 0.9f; -void printReport( - const Runner::Stats& stats, - const std::string& performance_output_path); -std::string statsToJsonString(const Runner::Stats& stats); +void print_performance_report( + const Stats& stats, + const std::string& performance_output_path) { + // For now, we just print the total inference time for CI, can save more info + // in future if needed. 
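  // Worked example (hypothetical numbers, assuming the scaling factor is 1000
  // units per second): 256 generated tokens over a 4000 ms window are written
  // out as 256 / 4000 * 1000 = 64 tokens per second.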
+ std::ofstream outfile(performance_output_path.c_str()); + if (outfile.is_open()) { + double num_tok = (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + outfile << num_tok; + outfile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the inference speed file"); + } +} } // namespace Runner::Runner( - const std::vector& models_path, + const std::string& model_path, const std::string& tokenizer_path, const std::string& performance_output_path, - const float logits_scale, - const int32_t logits_offset, const float temperature, const int eval_mode, - const std::string& kv_updater, - const int num_iters) - : n_bos_(1), - n_eos_(1), - tokenizer_path_(tokenizer_path), + const std::string& kv_updater) + : tokenizer_path_(tokenizer_path), performance_output_path_(performance_output_path), - logits_scale_(logits_scale), - logits_offset_(logits_offset), temperature_(temperature), - eval_mode_(static_cast(eval_mode)), - kv_updater_(kv_updater), - num_iters_(num_iters) { - for (size_t i = 0; i < models_path.size(); ++i) { - modules_.push_back(std::make_shared( - models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); - ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); + eval_mode_(static_cast(eval_mode)) { + module_ = std::make_unique( + model_path, Module::LoadMode::MmapUseMlockIgnoreErrors); + if (kv_updater == "SmartMask") { + kv_updater_ = KVManagerMode::SMART_MASK; + } else if (kv_updater == "ShiftPointer") { + kv_updater_ = KVManagerMode::SHIFT_POINTER; + } else { + ET_CHECK_MSG(false, "kv updater (%s) not found", kv_updater.c_str()); } + ET_LOG(Info, "creating module: model_path=%s", model_path.c_str()); ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); ET_LOG(Info, "eval mode=%d", eval_mode_); + ET_LOG(Info, "kv updater=%s", kv_updater.c_str()); } bool Runner::is_loaded() const { - bool loaded = true; - for (const std::shared_ptr& module : modules_) { - loaded &= module->is_loaded(); - } - return loaded && tokenizer_ && sampler_; + return module_->is_loaded() && tokenizer_ && decoder_runner_ && + prompt_processor_ && token_generator_ && kv_manager_ && buffer_manager_; } Error Runner::load() { @@ -83,123 +87,37 @@ Error Runner::load() { return Error::Ok; } + std::string token_generator_method_name, prompt_processor_method_name; + std::vector method_names; switch (eval_mode_) { case EvalMode::kKVCached: - kv_forward_name_ = "forward"; - method_names_.emplace_back(kv_forward_name_); + prompt_processor_method_name = "forward"; + token_generator_method_name = "forward"; + method_names.emplace_back(token_generator_method_name); break; case EvalMode::kHybrid: - prefill_forward_name_ = "prefill_forward"; - kv_forward_name_ = "kv_forward"; - method_names_.emplace_back(prefill_forward_name_); - method_names_.emplace_back(kv_forward_name_); + prompt_processor_method_name = "prefill_forward"; + token_generator_method_name = "kv_forward"; + method_names.emplace_back(prompt_processor_method_name); + method_names.emplace_back(token_generator_method_name); break; case EvalMode::kUnsupported: - ET_CHECK_MSG(false, "Unsupported llama version"); + ET_CHECK_MSG(false, "Unsupported llama evaluation mode"); break; } - for (std::shared_ptr& module : modules_) { - if (!prefill_forward_name_.empty()) { - ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(prefill_forward_name_)); - } - if (!kv_forward_name_.empty()) { - 
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kv_forward_name_)); - } - } - - if (!prefill_forward_name_.empty()) { - // Use attention mask length to retrieve prefill_ar_len and context length - // Prefill cache length equals to context_len - prefill_ar_len - auto atten_mask_meta = - get_methods_meta(prefill_forward_name_)[0]->input_tensor_meta(1); - prefill_ar_len_ = atten_mask_meta->sizes()[1]; - context_len_ = atten_mask_meta->sizes()[2]; - prefill_cache_len_ = context_len_ - prefill_ar_len_; - } - if (!kv_forward_name_.empty()) { - // Use attention mask length to retrieve kv ar len and context length - // Cache len equals to kv model context_len - kv_ar_len - auto atten_mask_meta = - get_methods_meta(kv_forward_name_)[0]->input_tensor_meta(1); - kv_ar_len_ = atten_mask_meta->sizes()[1]; - context_len_ = atten_mask_meta->sizes()[2]; - kv_cache_len_ = context_len_ - kv_ar_len_; - } - - // retrieve any method meta, can be either prefill or kv - // Try avoid getMetadataHelper as it is time consuming. - auto method_meta = get_methods_meta(method_names_[0])[0].get(); - int64_t num_layers = getMetadataHelper("get_n_layers", -1); - int64_t head_dim = method_meta.output_tensor_meta(1)->sizes()[1]; // k_cache - int64_t num_heads = (method_meta.num_outputs() - 1) / (num_layers * 2); - vocab_size_ = method_meta.output_tensor_meta(0)->sizes()[2]; // logit_tensor - use_int64_token_ = method_meta.input_tensor_meta(0)->scalar_type() == - executorch::aten::ScalarType::Long; - ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); - - if (kv_updater_ == "SmartMask") { - io_mgr_ = std::make_unique( - modules_, - context_len_, - prefill_ar_len_, - prefill_cache_len_, - kv_ar_len_, - kv_cache_len_, - vocab_size_, - num_layers, - head_dim, - num_heads, - eval_mode_, - prefill_forward_name_, - kv_forward_name_, - use_int64_token_); - } else if (kv_updater_ == "ShiftPointer") { - io_mgr_ = std::make_unique( - modules_, - context_len_, - prefill_ar_len_, - prefill_cache_len_, - kv_ar_len_, - kv_cache_len_, - vocab_size_, - num_layers, - head_dim, - num_heads, - eval_mode_, - prefill_forward_name_, - kv_forward_name_, - use_int64_token_); - } else { - ET_LOG(Error, "Using an unknown updater %s", kv_updater_.c_str()); - } - ET_LOG(Info, "creating io_memory"); - - // prepare io - io_mgr_->init_io(); - switch (eval_mode_) { - case EvalMode::kKVCached: - io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_)); - break; - case EvalMode::kHybrid: - io_mgr_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); - io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_)); - break; - case EvalMode::kUnsupported: - ET_CHECK_MSG(false, "unsupported mode"); - break; - } - - // llama3 tokenizer - tokenizer_ = example::get_tiktoken_for_llama(); + // load tokenizer. Assuming tiktoken is the default tokenizer + tokenizer_ = get_tiktoken_for_llama(); auto err = tokenizer_->load(tokenizer_path_); + auto eos_ids = std::make_unique>(); + // Rely on tiktoken to throw error if the artifact is incompatible. Then we + // fallback to BPE tokenizer. 
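  // Illustration: a Llama 3 artifact loads as Tiktoken here and "<|eot_id|>" is
  // added to eos_ids below, while a Llama 2 artifact fails this load and is
  // retried with the BPE tokenizer in the branch that follows.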
if (err != tokenizers::Error::Ok) { ET_LOG( Info, "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer", tokenizer_path_.c_str()); tokenizer_.reset(); - // llama2 tokenizer tokenizer_ = std::make_unique(); err = tokenizer_->load(tokenizer_path_); llama_version_ = LlamaVersion::kLlama2; @@ -208,354 +126,193 @@ Error Runner::load() { "failed to load tokenizer %s", tokenizer_path_.c_str()); } else { - eos_id_.insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); + eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); llama_version_ = LlamaVersion::kLlama3; } - bos_id_ = tokenizer_->bos_tok(); - eos_id_.insert(tokenizer_->eos_tok()); + eos_ids->insert(tokenizer_->eos_tok()); + int32_t vocab_size = tokenizer_->vocab_size(); + decoder_runner_ = + std::make_unique(module_.get(), vocab_size, temperature_); - // create sampler - sampler_ = std::make_unique( - vocab_size_, - temperature_, - kTopp, - static_cast(std::time(nullptr))); + ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load(method_names)); - return Error::Ok; -} + ET_LOG(Info, "Reading metadata from model"); + // Try avoid getMetadataHelper as it is time consuming. + Result method_meta = + module_->method_meta(token_generator_method_name); + // retrieve any method meta, can be either prefill or kv + int64_t num_layers = + ET_UNWRAP(module_->get("get_n_layers")).toScalar().to(); + ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); + // k_cache: [1, head_dim, seq_len] + int64_t head_dim = method_meta->output_tensor_meta(1)->sizes()[1]; + int64_t num_heads = (method_meta->num_outputs() - 1) / (num_layers * 2); + bool use_int64_token = method_meta->input_tensor_meta(0)->scalar_type() == + executorch::aten::ScalarType::Long; -template -T Runner::getMetadataHelper(std::string method_name, T default_val) { - T res = default_val; - if (modules_[0]->method_names()->count(method_name)) { - Result> outputs = modules_[0]->execute(method_name); - if (outputs.ok()) { - std::vector outs = outputs.get(); - if (outs.size() > 0) { - res = outs[0].to(); - } - } - } else { - ET_LOG( - Info, - "The model does not contain %s method, using default value %lld", - method_name.c_str(), - (long long)default_val); + // Use attention mask length to retrieve AR length and context length + // Cache len equals to context_len - ar_len + int32_t prompt_processor_ar_len, token_generator_ar_len, max_cache_len, + max_ar_len; + // atten mask: [1, AR-N, CL] + auto atten_mask_meta_token = method_meta->input_tensor_meta(1); + token_generator_ar_len = atten_mask_meta_token->sizes()[1]; + context_len_ = atten_mask_meta_token->sizes()[2]; + if (eval_mode_ == EvalMode::kKVCached) { + prompt_processor_ar_len = token_generator_ar_len; + } else if (eval_mode_ == EvalMode::kHybrid) { + auto atten_mask_meta_prompt = + module_->method_meta(prompt_processor_method_name) + ->input_tensor_meta(1); + prompt_processor_ar_len = atten_mask_meta_prompt->sizes()[1]; } - return res; -} - -int32_t Runner::logitsToToken(const Tensor& logits_tensor, int64_t pos) { - static std::vector logits_f(vocab_size_); - const uint16_t* logits = logits_tensor.data_ptr(); - // Since the logits are for all tokens, get the last token probabilities - auto* logits_last = logits; - - // offset to the meaningful logit we want. 
- if (logits_tensor.sizes().data()[1] > 1) { - logits_last += pos * vocab_size_; + if (prompt_processor_ar_len == context_len_) + max_cache_len = context_len_; + else + max_cache_len = context_len_ - + std::min(token_generator_ar_len, prompt_processor_ar_len); + max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); + + kv_manager_ = std::make_unique( + kv_updater_, + KVManager::Metadata{ + context_len_, + head_dim, + max_ar_len, + max_cache_len, + num_heads, + num_layers}); + + prompt_processor_ = std::make_unique( + decoder_runner_.get(), + kv_manager_.get(), + prompt_processor_method_name, + PromptProcessor::Metadata{ + context_len_, + num_heads, + num_layers, + prompt_processor_ar_len, + vocab_size, + use_int64_token}); + token_generator_ = std::make_unique( + tokenizer_.get(), + decoder_runner_.get(), + kv_manager_.get(), + token_generator_method_name, + std::move(eos_ids), + TokenGenerator::Metadata{ + context_len_, + num_heads, + num_layers, + token_generator_ar_len, + vocab_size, + use_int64_token, + }, + &stats_); + + buffer_manager_ = std::make_unique(); + if (kv_updater_ == KVManagerMode::SMART_MASK) { + buffer_manager_ = std::make_unique( + kv_manager_->total_cache_size_in_bytes(), + prompt_processor_->total_prompt_processor_io_size_in_bytes(), + token_generator_->total_token_generator_io_size_in_bytes()); } - // dequantize - for (int i = 0; i < vocab_size_; i++) { - logits_f[i] = (logits_last[i] - logits_offset_) * logits_scale_; - } - return sampler_->sample(logits_f.data()); -} + ET_LOG(Info, "creating io_memory"); + // prepare io + kv_manager_->init_cache(buffer_manager_.get(), prompt_processor_ar_len); + prompt_processor_->init_io( + buffer_manager_.get(), + module_->method_meta(prompt_processor_method_name)); + token_generator_->init_io( + buffer_manager_.get(), module_->method_meta(token_generator_method_name)); -void Runner::run_model_step( - const std::string& method_name, - std::vector>& inputs) { - for (size_t i = 0, num_modules = modules_.size(); i < num_modules; ++i) { - Result> outputs_res = - modules_[i]->execute(method_name, inputs[i]); - ET_CHECK_MSG( - outputs_res.error() == Error::Ok, "shard %zu inference failed", i); - } + return Error::Ok; } Error Runner::generate( - int32_t seq_len, const std::string& prompt, - const std::string& system_prompt, + int32_t seq_len, std::function token_callback, - std::function stats_callback) { - std::unordered_map>> - input_tensors, output_tensors; - std::unordered_map>> inputs; - if (!is_loaded() || (num_iters_ > 1)) { + std::function stats_callback, + bool echo, + bool warming) { + ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - for (auto method_name : method_names_) { - for (int i = 0; i < modules_.size(); ++i) { - input_tensors[method_name].emplace_back( - io_mgr_->get_input_tensors(i, method_name)); - output_tensors[method_name].emplace_back( - io_mgr_->get_output_tensors(i, method_name)); - for (size_t j = 0; j < output_tensors[method_name][i].size(); ++j) { - ET_CHECK_MSG( - modules_[i]->set_output( - method_name, output_tensors[method_name][i][j], j) == - Error::Ok, - "failed to set output tensor for module %d's %zu'th output", - i, - j); - } - inputs[method_name].emplace_back(std::vector( - begin(input_tensors[method_name][i]), - end(input_tensors[method_name][i]))); - } - } + stats_.model_load_end_ms = time_in_ms(); } - stats_.model_load_end_ms = time_in_ms(); stats_.inference_start_ms = 
time_in_ms(); - ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); - - switch (llama_version_) { - case LlamaVersion::kLlama2: - prompt_.append(prompt); - break; - case LlamaVersion::kLlama3: - if (!system_prompt.empty()) { - prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); - prompt_.append(system_prompt); - prompt_.append("<|eot_id|>"); - } - prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); - prompt_.append(prompt); - prompt_.append( - "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); - if (token_callback) { - token_callback("<|begin_of_text|>"); - } - break; - default: - ET_CHECK_MSG(false, "unsupported llama version"); - break; - } - seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_; + int32_t n_bos = (cur_pos_ == 0) ? 1 : 0; tokenizers::Result> encode_res = - tokenizer_->encode(prompt_, n_bos_, 0); + tokenizer_->encode(prompt, n_bos, 0); ET_CHECK_TK_OK_OR_RETURN_ERROR( - encode_res.error(), "failed to encode prompt %s", prompt_.c_str()); + encode_res.error(), "failed to encode prompt %s", prompt.c_str()); + // encode the (string) prompt into tokens sequence std::vector prompt_tokens = encode_res.get(); int num_prompt_tokens = prompt_tokens.size(); + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); ET_CHECK_MSG( - num_prompt_tokens < seq_len, + cur_pos_ + num_prompt_tokens < seq_len, "sequence length exceeded - please increase the seq_len value"); - int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; + // Prompt Processor first if (token_callback) { - token_callback(prompt_); + token_callback(prompt); } - auto prefill_execute = [&](const std::string& method_name) { - int num_iters = 1 + ((num_prompt_tokens - 1) / prefill_ar_len_); - ET_LOG( - Info, - "Prompt Processor: total %d tokens (AR-%d * %d iters)", - num_prompt_tokens, - prefill_ar_len_, - num_iters); - - for (int i = 0; i < num_iters; i++) { - io_mgr_->fill_prefill_toks(pos, prompt_tokens); - run_model_step(method_name, inputs[method_name]); - io_mgr_->update_prefill_io(cur_token, pos, output_tensors[method_name]); - pos += prefill_ar_len_; - } - Tensor& logits_tensor = output_tensors[method_name].back()[0]; - prev_token = prompt_tokens[num_prompt_tokens - 1]; - long sample_start_time_ms = time_in_ms(); - cur_token = logitsToToken( - logits_tensor, - (num_prompt_tokens + prefill_ar_len_ - 1) % prefill_ar_len_); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; - - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - if (token_callback) { - token_callback(piece_res.get().c_str()); - } - - pos = num_prompt_tokens; - stats_.first_token_ms = time_in_ms(); - stats_.prompt_eval_end_ms = time_in_ms(); - }; - - auto kv_execute = [&](const std::string& method_name) { - io_mgr_->fill_kv_tok_mask(pos, cur_token); - while (pos < seq_len - 1) { - // inference - run_model_step(method_name, inputs[method_name]); - Tensor& logits_tensor = output_tensors[method_name].back()[0]; - // hybrid mode will check these stats_ at prefill(prefill) - if (eval_mode_ == EvalMode::kKVCached) { - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - } else if (pos == num_prompt_tokens - 1) { - stats_.prompt_eval_end_ms = time_in_ms(); - } - } - prev_token = cur_token; - long sample_start_time_ms = time_in_ms(); - cur_token = logitsToToken(logits_tensor, pos); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; + auto prefill_res = 
prompt_processor_->prefill(prompt_tokens, cur_pos_); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + uint64_t cur_token = prefill_res.get(); + cur_pos_ += num_prompt_tokens; + stats_.first_token_ms = time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); - if (pos < num_prompt_tokens - 1) { - cur_token = prompt_tokens[pos + 1]; - } - io_mgr_->update_kv_io(cur_token, ++pos, output_tensors[method_name]); - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - - if (token_callback && pos >= num_prompt_tokens) { - token_callback(piece_res.get().c_str()); - } - - if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) { - ET_LOG(Info, "\nReached to the end of generation"); - break; - } - } - }; - - switch (eval_mode_) { - case EvalMode::kKVCached: - kv_execute(kv_forward_name_); - break; - case EvalMode::kHybrid: - prefill_execute(prefill_forward_name_); - io_mgr_->update_prefill_to_kv_io( - cur_token, pos, output_tensors[kv_forward_name_]); - kv_execute(kv_forward_name_); - break; - default: - ET_CHECK_MSG(false, "Unsupported eval mode"); - break; + // print the first token from prefill. No prev_token so use cur_token for it. + if (token_callback) { + token_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); } + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // start the main loop + prompt_tokens.push_back(cur_token); + int64_t num_generated_tokens = ET_UNWRAP(token_generator_->generate( + prompt_tokens, cur_pos_, seq_len, token_callback)); stats_.inference_end_ms = time_in_ms(); - if (pos == seq_len) { - ET_LOG(Info, "\nSequence length (%i tokens) reached!", seq_len); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + cur_pos_ += num_generated_tokens; + if (cur_pos_ == seq_len) { + ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); } stats_.num_prompt_tokens = num_prompt_tokens; - stats_.num_generated_tokens = pos - num_prompt_tokens; - printReport(stats_, performance_output_path_); + stats_.num_generated_tokens = num_generated_tokens; + print_report(stats_); + print_performance_report(stats_, performance_output_path_); if (stats_callback) { stats_callback(stats_); } - io_mgr_->reset_io( - get_methods_meta(prefill_forward_name_), - get_methods_meta(kv_forward_name_)); - prompt_.clear(); return Error::Ok; } -namespace { -void printReport( - const Runner::Stats& stats, - const std::string& performance_output_path) { - printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str()); - - ET_LOG( - Info, - "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64, - stats.num_prompt_tokens, - stats.num_generated_tokens); - - ET_LOG( - Info, - "\tModel Load Time:\t\t%f (seconds)", - ((double)(stats.model_load_end_ms - stats.model_load_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - double inference_time_ms = - (double)(stats.inference_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, - - (stats.num_generated_tokens) / - (double)(stats.inference_end_ms - stats.inference_start_ms) * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - double prompt_eval_time = - (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - prompt_eval_time / 
stats.SCALING_FACTOR_UNITS_PER_SECOND, - (stats.num_prompt_tokens) / prompt_eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - double eval_time = - (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); - ET_LOG( - Info, - "\t\tGenerated %" PRIu64 - " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - stats.num_generated_tokens, - eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - stats.num_generated_tokens / eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - // Time to first token is measured from the start of inference, excluding - // model load time. - ET_LOG( - Info, - "\tTime to first generated token:\t%f (seconds)", - ((double)(stats.first_token_ms - stats.inference_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - - ET_LOG( - Info, - "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", - stats.num_generated_tokens, - (double)stats.aggregate_sampling_time_ms / - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - // For now, we just print the total inference time for CI, can save more info - // in future if needed. - - std::ofstream outfile(performance_output_path.c_str()); - if (outfile.is_open()) { - double num_tok = (stats.num_generated_tokens) / - (double)(stats.inference_end_ms - stats.inference_start_ms) * - stats.SCALING_FACTOR_UNITS_PER_SECOND; - outfile << num_tok; - outfile.close(); - } else { - ET_CHECK_MSG(false, "Error saving the inference speed file"); +Result Runner::get_llama_version() { + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); } + return llama_version_; } -std::string statsToJsonString(const Runner::Stats& stats) { - std::stringstream ss; - ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << "," - << "\"generated_tokens\":" << stats.num_generated_tokens << "," - << "\"model_load_start_ms\":" << stats.model_load_start_ms << "," - << "\"model_load_end_ms\":" << stats.model_load_end_ms << "," - << "\"inference_start_ms\":" << stats.inference_start_ms << "," - << "\"inference_end_ms\":" << stats.inference_end_ms << "," - << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << "," - << "\"first_token_ms\":" << stats.first_token_ms << "," - << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms - << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" - << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}"; - return ss.str(); -} -} // namespace - -std::vector> Runner::get_methods_meta( - std::string& method_name) { - std::vector> methods_meta; - methods_meta.reserve(modules_.size()); - for (std::shared_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta(method_name)); - } - return methods_meta; -} } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h index e693bcd7077..708f91157a3 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -15,110 +15,70 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include #include - namespace example { +enum LlamaVersion { + kLlama2 = 0, + kLlama3, +}; class Runner { public: explicit Runner( - const std::vector& models_path, + const std::string& model_path, const std::string& tokenizer_path, - const std::string& performance_output_path_, - const float logits_scale, - const int32_t logits_offset, - const float temperature, - const int 
eval_mode, - const std::string& kv_updater, - const int num_iters); - - struct Stats { - // Scaling factor for timestamps - in this case, we use ms. - const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; - // Time stamps for the different stages of the execution - // model_load_start_ms: Start of model loading. - long model_load_start_ms; - // model_load_end_ms: End of model loading. - long model_load_end_ms; - // inference_start_ms: Immediately after the model is loaded (or we check - // for model load), measure the inference time. - long inference_start_ms; - // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right - // before the inference loop starts - long prompt_eval_end_ms; - // first_token: Timestamp when the first generated token is emitted - long first_token_ms; - // inference_end_ms: End of inference/generation. - long inference_end_ms; - // Keep a running total of the time spent in sampling. - long aggregate_sampling_time_ms; - // Token count from prompt - int64_t num_prompt_tokens; - // Token count from generated (total - prompt) - int64_t num_generated_tokens; - }; + const std::string& performance_output_path, + const float temperature = 0.8f, + const int eval_mode = EvalMode::kKVCached, + const std::string& kv_updater = "SmartMask"); bool is_loaded() const; executorch::runtime::Error load(); + // TODO: Support echo and warming executorch::runtime::Error generate( - int32_t seq_len, const std::string& prompt, - const std::string& system_prompt, + int32_t seq_len, std::function token_callback = {}, - std::function stats_callback = {}); - void stop(); - std::vector> - get_methods_meta(std::string& method_name); + std::function stats_callback = {}, + bool echo = true, + bool warming = false); + void stop() {}; + executorch::runtime::Result get_llama_version(); private: - enum LlamaVersion { - kLlama2 = 0, - kLlama3, + enum EvalMode { + kKVCached = 0, + kHybrid, + kUnsupported, }; - template - T getMetadataHelper(std::string method_name, T default_val); - int32_t logitsToToken( - const executorch::aten::Tensor& logits_tensor, - int64_t pos); - void run_model_step( - const std::string& method_name, - std::vector>& inputs); - std::string prompt_; - // metadata + std::unique_ptr module_; int32_t context_len_{0}; - int32_t prefill_ar_len_{0}; - int32_t prefill_cache_len_{0}; - int32_t kv_ar_len_{0}; - int32_t kv_cache_len_{0}; - int32_t vocab_size_; - int32_t bos_id_; - std::unordered_set eos_id_; - const int32_t n_bos_; - const int32_t n_eos_; - std::vector> modules_; + + int64_t cur_pos_{0}; + std::string tokenizer_path_; std::string performance_output_path_; - float logits_scale_; - int32_t logits_offset_; float temperature_; - std::unique_ptr tokenizer_; - std::unique_ptr sampler_; - Stats stats_; - std::unique_ptr io_mgr_; EvalMode eval_mode_; - bool use_int64_token_{false}; - std::string prefill_forward_name_; - std::string kv_forward_name_; - std::vector method_names_; LlamaVersion llama_version_; - std::string kv_updater_; - int num_iters_; -}; + KVManagerMode kv_updater_; + std::unique_ptr buffer_manager_; + std::unique_ptr kv_manager_; + std::unique_ptr tokenizer_; + std::unique_ptr decoder_runner_; + std::unique_ptr prompt_processor_; + std::unique_ptr token_generator_; + // stats + executorch::llm::Stats stats_; +}; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp new file mode 100644 index 00000000000..8d890637b13 --- /dev/null +++ 
b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { +TokenGenerator::TokenGenerator( + tokenizers::Tokenizer* tokenizer, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats) + : tokenizer_(tokenizer), + decoder_runner_(decoder_runner), + kv_manager_(kv_manager), + method_name_(method_name), + eos_ids_(std::move(eos_ids)), + metadata_(metadata), + stats_(stats) { + k_cache_in_.resize(metadata_.num_layers); + v_cache_in_.resize(metadata_.num_layers); + k_cache_out_.resize(metadata_.num_layers); + v_cache_out_.resize(metadata_.num_layers); + + // Calculate I/O size + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + input_pos_.size = metadata_.ar_len * sizeof(int32_t); + attention_mask_.size = + metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); + logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); +} +void TokenGenerator::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + // [I]: input_tokens + Result input_toks = method_meta->input_tensor_meta(0); + input_toks_.data = + reinterpret_cast(buffer_manager->allocate(input_toks_.size)); + input_toks_.tensor = std::make_unique( + input_toks->scalar_type(), + input_toks->sizes().size(), + const_cast(input_toks->sizes().data()), + input_toks_.data, + const_cast(input_toks->dim_order().data())); + input_tensors_.emplace_back(input_toks_.tensor.get()); + buffer_manager->add_memory_info( + input_toks_.data, input_toks_.size, input_toks.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(1); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(2); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, input_pos.get()); + + // [I] kv_cache + int index = 3; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? 
kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + uint8_t* cache_ptr = cache_ptrs[layer][head].buffer; + + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + for (int head = 0; head < metadata_.num_heads; ++head, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + uint8_t* cache_ptr = cache_ptrs[layer][head].output_buffer; + cache[layer].emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + output_tensors_.emplace_back(cache[layer][head].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer][head]->nbytes(), kv_cache.get()); + } + } + } + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +// This function only considers the case where token_generator_ar_len equals 1. +void TokenGenerator::prepare_io(uint64_t cur_token, int64_t start_pos) { + // update input_tok + *input_toks_.data = + metadata_.use_int64_token ? 
cur_token : static_cast(cur_token);
+  // update position_ids
+  *input_pos_.data = static_cast(start_pos);
+}
+
+Result TokenGenerator::generate(
+    std::vector tokens,
+    int64_t start_pos,
+    int32_t seq_len,
+    std::function token_callback) {
+  ET_CHECK_MSG(
+      !tokens.empty(), "Token generation loop shouldn't take empty tokens");
+  int64_t pos = start_pos; // position in the sequence
+
+  // Token after prefill
+  uint64_t cur_token = tokens.back();
+  uint64_t prev_token;
+  // Rearrange KV cache first
+  kv_manager_->rearrange_cache(metadata_.ar_len);
+  std::vector attention_map(metadata_.ar_len);
+  std::iota(attention_map.begin(), attention_map.end(), -1);
+  // Initialize attention mask with current position
+  kv_manager_->init_attention_mask(
+      attention_mask_.data, attention_map, metadata_.ar_len, pos);
+  // Initialize the output of the module
+  ET_CHECK_MSG(
+      decoder_runner_->set_outputs(method_name_, output_tensors_) ==
+          executorch::runtime::Error::Ok,
+      "Failed to set output tensor for module %s",
+      method_name_.c_str());
+  // Generate our tokens
+  while (pos < seq_len - 1) {
+    // Fill in the token and position data
+    prepare_io(cur_token, pos);
+    // Only update the data pointer of the cache to the tensor for
+    // SHIFT_POINTER mode
+    bool updated = kv_manager_->update_cache_tensor(
+        k_cache_in_,
+        k_cache_out_,
+        v_cache_in_,
+        v_cache_out_,
+        metadata_.ar_len,
+        pos);
+    // Only update the output of the module for SHIFT_POINTER mode
+    if (updated) {
+      // Update the output of the module
+      ET_CHECK_MSG(
+          decoder_runner_->set_outputs(method_name_, output_tensors_) ==
+              executorch::runtime::Error::Ok,
+          "Failed to set output tensor for module %s",
+          method_name_.c_str());
+    }
+    // Run inference
+    auto logits_res = decoder_runner_->step(method_name_, inputs_);
+    ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error());
+    executorch::aten::Tensor& logits_tensor = logits_res.get();
+
+    prev_token = cur_token;
+
+    stats_->on_sampling_begin();
+    cur_token =
+        decoder_runner_->logits_to_token(logits_tensor, metadata_.ar_len);
+    stats_->on_sampling_end();
+
+    // Update KV Cache with the output results
+    kv_manager_->update_cache(metadata_.ar_len, pos, metadata_.ar_len);
+    // Update attention mask with current position
+    kv_manager_->update_attention_mask(
+        attention_mask_.data, metadata_.ar_len, pos, metadata_.ar_len);
+    pos++;
+
+    // print the token as a string; decode it with the tokenizer
+    token_callback(
+        ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
+
+    // data-dependent terminating condition: stop as soon as an EOS token is
+    // generated
+    if (eos_ids_->count(cur_token) > 0) {
+      printf("\n");
+      ET_LOG(Info, "\nReached the end of generation");
+      break;
+    }
+  }
+  return pos - start_pos;
+}
+} // namespace example
diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
new file mode 100644
index 00000000000..a5d69657955
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace example {
+/**
+ * @class TokenGenerator
+ * @brief Class for generating tokens using the decoder and key-value manager.
+ */
+class TokenGenerator {
+ public:
+  struct Metadata {
+    int32_t context_len;
+    int64_t num_heads;
+    int64_t num_layers;
+    int32_t ar_len;
+    int32_t vocab_size;
+    bool use_int64_token;
+  };
+  TokenGenerator(
+      tokenizers::Tokenizer* tokenizer,
+      DecoderRunner* decoder_runner,
+      KVManager* kv_manager,
+      const std::string& method_name,
+      std::unique_ptr>&& eos_ids,
+      Metadata metadata,
+      executorch::llm::Stats* stats);
+  /**
+   * @brief Initialize I/O tensors and allocate I/O data buffers.
+   * @param buffer_manager Pointer to the IMemAlloc instance, selected based on
+   * kv_updater.
+   * @param method_meta Method metadata.
+   */
+  void init_io(
+      IMemAlloc* buffer_manager,
+      executorch::runtime::Result method_meta);
+
+  /**
+   * @brief Generate tokens.
+   * @param tokens Vector of input tokens.
+   * @param start_pos Starting position for generation.
+   * @param seq_len Length of the sequence to generate.
+   * @param token_callback Callback function for generated tokens.
+   * @return The number of tokens generated.
+   */
+  executorch::runtime::Result generate(
+      std::vector tokens,
+      int64_t start_pos,
+      int32_t seq_len,
+      std::function token_callback);
+  inline const size_t total_token_generator_io_size_in_bytes() const {
+    return input_toks_.size + input_pos_.size + attention_mask_.size +
+        logits_.size;
+  }
+
+ private:
+  /**
+   * @brief Fill in I/O buffers with the current token and position.
+   * @param cur_token Current token.
+   * @param start_pos Starting position.
+   */
+  void prepare_io(uint64_t cur_token, int64_t start_pos);
+
+  tokenizers::Tokenizer* tokenizer_;
+  DecoderRunner* decoder_runner_;
+  KVManager* kv_manager_;
+  std::string method_name_;
+  std::unique_ptr> eos_ids_;
+
+  // metadata
+  Metadata metadata_;
+
+  // inputs and outputs
+  TensorStruct input_toks_;
+  TensorStruct input_pos_;
+  TensorStruct attention_mask_;
+  TensorStruct logits_;
+
+  // layer -> head -> TensorImpl
+  std::vector>>
+      k_cache_in_;
+  std::vector>>
+      v_cache_in_;
+  std::vector>>
+      k_cache_out_;
+  std::vector>>
+      v_cache_out_;
+
+  std::vector inputs_;
+  std::vector input_tensors_;
+  std::vector output_tensors_;
+
+  // stats
+  executorch::llm::Stats* stats_;
+};
+} // namespace example
diff --git a/examples/qualcomm/oss_scripts/llama/runner/utils.h b/examples/qualcomm/oss_scripts/llama/runner/utils.h
new file mode 100644
index 00000000000..5b20ba5d3d1
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/llama/runner/utils.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include 
+#include 
+#include 
+
+// Template struct that couples a tensor implementation with its backing data
+// buffer and the buffer's size
+template 
+struct TensorStruct {
+  std::unique_ptr tensor;
+  T* data;
+  // data size in bytes
+  size_t size;
+};
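
For context, the following is a minimal, illustrative sketch (not part of the patch) of how the refactored Runner API declared in runner.h above might be driven. The include path, file paths, and the token-callback signature are assumptions inferred from the declarations in this diff; in practice the runner is invoked through qnn_llama_runner.cpp.

// usage_sketch.cpp -- illustrative only; the include and all paths are placeholders.
#include "runner.h"  // examples/qualcomm/oss_scripts/llama/runner/runner.h

#include <iostream>
#include <string>

int main() {
  // Construct the runner with a single compiled .pte and a tokenizer artifact.
  example::Runner runner(
      "/data/local/tmp/llama/kv_llama_qnn.pte",  // model_path (placeholder)
      "/data/local/tmp/llama/tokenizer.model",   // tokenizer_path (placeholder)
      "outputs/inference_speed.txt",             // performance_output_path
      /*temperature=*/0.8f,
      /*eval_mode=*/0,                           // 0 == kKVCached per the enum above
      /*kv_updater=*/"SmartMask");

  // Stream each decoded piece to stdout as it is generated.
  auto on_token = [](const std::string& piece) {
    std::cout << piece << std::flush;
  };

  // generate() lazily calls load() on first use and writes the performance
  // report to performance_output_path when decoding finishes.
  auto err =
      runner.generate("What is the capital of France?", /*seq_len=*/128, on_token);
  return err == executorch::runtime::Error::Ok ? 0 : 1;
}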
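
A small self-contained sketch of the arithmetic behind the single value that print_performance_report() writes to performance_output_path may also help: the stats timestamps are in milliseconds, so SCALING_FACTOR_UNITS_PER_SECOND converts the ratio into tokens per second. The sample numbers below are made up for illustration.

// throughput_sketch.cpp -- mirrors the tokens/second computation in runner.cpp.
#include <cstdint>
#include <cstdio>

int main() {
  const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;  // stats timestamps are in ms
  const int64_t num_generated_tokens = 96;            // example value
  const long inference_start_ms = 10000;              // example value
  const long inference_end_ms = 14000;                // 4 s of prefill + decode
  const double tokens_per_second = num_generated_tokens /
      (double)(inference_end_ms - inference_start_ms) *
      SCALING_FACTOR_UNITS_PER_SECOND;  // 96 / 4000 ms * 1000 = 24 tokens/s
  printf("%f\n", tokens_per_second);    // this is the value written to the file
  return 0;
}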