diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index 3c72722610..f1e72b265b 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -275,6 +275,7 @@ def parse_args(): ArgumentHelper.num_tokens_per_iter(tb_group) ArgumentHelper.max_prefill_iters(tb_group) ArgumentHelper.communicator(tb_group) + ArgumentHelper.async_(tb_group) args = parser.parse_args() return args @@ -285,19 +286,19 @@ def main(): random.seed(args.seed) os.environ['TM_LOG_LEVEL'] = args.log_level if args.backend == 'turbomind': - engine_config = TurbomindEngineConfig( - max_batch_size=args.concurrency, - tp=args.tp, - cache_max_entry_count=args.cache_max_entry_count, - session_len=args.session_len, - cache_block_seq_len=args.cache_block_seq_len, - model_format=args.model_format, - quant_policy=args.quant_policy, - num_tokens_per_iter=args.num_tokens_per_iter, - max_prefill_iters=args.max_prefill_iters, - enable_prefix_caching=args.enable_prefix_caching, - communicator=args.communicator, - ) + engine_config = TurbomindEngineConfig(max_batch_size=args.concurrency, + tp=args.tp, + cache_max_entry_count=args.cache_max_entry_count, + session_len=args.session_len, + cache_block_seq_len=args.cache_block_seq_len, + model_format=args.model_format, + quant_policy=args.quant_policy, + num_tokens_per_iter=args.num_tokens_per_iter, + max_prefill_iters=args.max_prefill_iters, + enable_prefix_caching=args.enable_prefix_caching, + communicator=args.communicator, + enable_metrics=False, + async_=args.async_) elif args.backend == 'pytorch': engine_config = PytorchEngineConfig( cache_max_entry_count=args.cache_max_entry_count, diff --git a/src/turbomind/engine/engine.cc b/src/turbomind/engine/engine.cc index 867717926c..db9a3ff3a3 100644 --- a/src/turbomind/engine/engine.cc +++ b/src/turbomind/engine/engine.cc @@ -374,9 +374,6 @@ void Engine::Impl::Accept(const Requests& rs, vector& signals) { auto& s = states_.at(0); - const int offset = s.rc.size(); - int index = offset; - vector> incoming; incoming.reserve(rs.size()); @@ -522,7 +519,7 @@ void Engine::Impl::Schedule() // dbg("Schedule"); - auto outcome = seq_mgr_->Materialize( + seq_mgr_->Materialize( sequences, context_length, alpha, priorities, param_.max_forward_token_num, param_.max_context_token_num); vector idxs(sequences.size()); @@ -703,12 +700,12 @@ void Engine::Impl::Update(BatchData& b, std::vector& signals) s.tokens.insert(s.tokens.end(), c.token_ids + c.seq_len - new_tokens, c.token_ids + c.seq_len); } if (TM_UNLIKELY(finished[i])) { - signals.push_back([this, r = c.req, l = c.seq_len] { // + signals.push_back([r = c.req, l = c.seq_len] { // UpdateState(*r, Request::kFinish, l); }); } else if (c.req->stream_output) { - signals.push_back([this, r = c.req, l = c.seq_len] { // + signals.push_back([r = c.req, l = c.seq_len] { // UpdateState(*r, Request::kOk, l); }); } diff --git a/src/turbomind/models/llama/BlockManager.cc b/src/turbomind/models/llama/BlockManager.cc index d04634a287..7be87d73c7 100644 --- a/src/turbomind/models/llama/BlockManager.cc +++ b/src/turbomind/models/llama/BlockManager.cc @@ -1,32 +1,15 @@ // Copyright (c) OpenMMLab. All rights reserved. +#include + #include "src/turbomind/models/llama/BlockManager.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/string_utils.h" -#include -#include -#include namespace turbomind { -size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic& value) -{ - size_t free{}; - size_t total{}; - check_cuda_error(cudaMemGetInfo(&free, &total)); - - // atomicMin - auto old = value.load(); - while (old > free && !value.compare_exchange_weak(old, free)) {} - - // wait for all ranks - barrier.wait(); - - return value.load(); -} - BlockManager::BlockManager( size_t block_size, double block_count, int chunk_size, core::Allocator allocator, GetFreeMemSize get_free_size): block_size_(block_size), allocator_(allocator) @@ -106,18 +89,18 @@ size_t BlockManager::GetBlockCount(size_t block_size, double ratio, GetFreeMemSi void BlockManager::Move(std::vector& src, const std::vector& delta, std::vector& dst) { - FT_CHECK(src.size() >= delta.size()); + TM_CHECK_GE(src.size(), delta.size()); std::vector src1(src.size() - delta.size()); { auto end = std::set_difference(src.begin(), src.end(), delta.begin(), delta.end(), src1.begin()); - FT_CHECK(end == src1.end()); + TM_CHECK(end == src1.end()); } src.swap(src1); std::vector dst1(dst.size() + delta.size()); { auto end = std::set_union(dst.begin(), dst.end(), delta.begin(), delta.end(), dst1.begin()); - FT_CHECK(end == dst1.end()); + TM_CHECK(end == dst1.end()); } dst.swap(dst1); } @@ -136,10 +119,11 @@ auto BlockManager::Allocate(int count) -> std::pair for (int i = 0; i < count; ++i) { int idx = free_ids_[i]; auto& b = blocks_[idx]; - FT_CHECK(is_free(b)); // pre-condition: uc == 0 && ts == 0 + TM_CHECK(is_free(b)); // pre-condition: uc == 0 && ts == 0 b.use_count = 1; b.unique_id = unique_id_++; - FT_CHECK(is_active(b)); // post-condition + b.timestamp = timestamp_++; + TM_CHECK(is_active(b)); // post-condition block_ids[i] = idx; unique_ids[i] = b.unique_id; } @@ -153,7 +137,7 @@ auto BlockManager::Allocate(int count) -> std::pair void BlockManager::Evict(int count) { - FT_CHECK(count <= cached_ids_.size()); + TM_CHECK_LE(count, cached_ids_.size()); std::vector idxs(cached_ids_); // get first `count` cached ids according to timestamp std::nth_element(idxs.begin(), idxs.begin() + count, idxs.end(), [&](int i, int j) { @@ -167,10 +151,10 @@ void BlockManager::Evict(int count) // set as free for (const auto& idx : idxs) { auto& b = blocks_[idx]; - FT_CHECK(is_cached(b)); + TM_CHECK(is_cached(b)); // pre-condition b.unique_id = 0; b.timestamp = 0; - FT_CHECK(is_free(b)); + TM_CHECK(is_free(b)); // post-condition } Move(cached_ids_, idxs, free_ids_); @@ -184,10 +168,10 @@ void BlockManager::Free(BlockIds ids) for (const auto& i : ids) { auto& b = blocks_[i]; - FT_CHECK(is_cached(b)); // uc == 0 && ts != 0 + TM_CHECK(is_cached(b)); // pre-condition b.unique_id = 0; b.timestamp = 0; - FT_CHECK(is_free(b)); + TM_CHECK(is_free(b)); // post-condition } Move(cached_ids_, ids, free_ids_); @@ -200,10 +184,10 @@ int BlockManager::Unlock(const BlockIds& ids) for (const auto& i : ids) { auto& b = blocks_[i]; - FT_CHECK(is_active(b)); // pre-condition: uc > 0 + TM_CHECK(is_active(b)); // pre-condition if (--b.use_count == 0) { unlock.push_back(b.id); - FT_CHECK(is_cached(b)); // post-condition + TM_CHECK(is_cached(b)); // post-condition } } @@ -224,7 +208,7 @@ int BlockManager::Lock(const BlockIds& ids) auto& b = blocks_[i]; if (++b.use_count == 1) { lock.push_back(i); - FT_CHECK(is_active(b)); + TM_CHECK(is_active(b)); // post-condition } } @@ -240,14 +224,14 @@ int BlockManager::Lock(const BlockIds& ids) void BlockManager::Touch(const BlockIds& ids) { std::for_each(ids.crbegin(), ids.crend(), [this](int i) { - FT_CHECK(is_active(blocks_[i])); + TM_CHECK(is_active(blocks_[i])); blocks_[i].timestamp = timestamp_++; }); } int BlockManager::Verify(const std::vector& block_ids, const std::vector& unique_ids) { - FT_CHECK(block_ids.size() == unique_ids.size()); + TM_CHECK_EQ(block_ids.size(), unique_ids.size()); int valid = block_ids.size(); for (int i = 0; i < block_ids.size(); ++i) { if (unique_id(block_ids[i]) != unique_ids[i]) { @@ -260,8 +244,8 @@ int BlockManager::Verify(const std::vector& block_ids, const std::vector; -size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic& value); - class BlockManager { public: explicit BlockManager( diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index 391a923143..d63bbcf1fd 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -22,37 +22,35 @@ BlockTrie::BlockTrie(size_t block_len, std::shared_ptr block_manag std::tuple BlockTrie::Match(const Sequence& seq) { - BlockIds matched_blocks; - UniqueIds matched_unique_ids; + BlockIds block_ids; + UniqueIds unique_ids; - std::shared_ptr curr_node = root_; - int num_matched = 0; + auto node = root_; + auto first = seq.prompt.begin(); // Warning: Do not use "<=" operator even when seq.prompt length is evenly - // divisible by block_seq_len_. This may produce an input_length of zero for - // the sequence, violating the precondition checked in LlamaBatch::Forward. - while (num_matched + block_seq_len_ < seq.prompt.size()) { - std::vector curr_tokens(seq.prompt.begin() + num_matched, - seq.prompt.begin() + num_matched + block_seq_len_); - size_t hash_key = hash(curr_tokens); - - auto it = curr_node->children.find(hash_key); - - if (it == curr_node->children.end()) { - break; + // divisible by block_seq_len_. The model needs at least one input token to generate output. + while (first + block_seq_len_ < seq.prompt.end()) { + const std::vector segment{first, first + block_seq_len_}; + const size_t hash_key = hash(segment); + if (const auto it = node->children.find(hash_key); it != node->children.end()) { + if (segment == it->second->tokens) { + block_ids.push_back(it->second->block_id); + unique_ids.push_back(it->second->block_unique_id); + node = it->second; + first += block_seq_len_; + } + else { + TM_LOG_WARNING("hash collision detected"); + break; + } } - - if (curr_tokens != it->second->tokens) { - TM_LOG_WARNING("hash key cache hit, but tokens are not the same"); + else { break; } - - matched_blocks.emplace_back(it->second->block_id); - matched_unique_ids.emplace_back(it->second->block_unique_id); - curr_node = it->second; - num_matched += block_seq_len_; } - return std::make_tuple(matched_blocks, matched_unique_ids); + + return std::make_tuple(block_ids, unique_ids); } std::tuple BlockTrie::Cache(const Sequence& seq, const std::vector& tokens) @@ -62,7 +60,6 @@ std::tuple BlockTrie::Cache(const Sequence& seq, const std: TM_CHECK_LE(seq.cache_len, seq.blocks.size() * block_seq_len_); auto node = root_; - int idx = 0; BlockIds cache_block_ids; UniqueIds cache_block_unique_ids; @@ -75,15 +72,14 @@ std::tuple BlockTrie::Cache(const Sequence& seq, const std: auto start = tokens.begin() + idx * block_seq_len_; auto end = start + block_seq_len_; - std::vector curr_tokens(start, end); - // TODO(lvhan): add salt to ensure the hash security - size_t hash_key = hash(curr_tokens); + const std::vector segment(start, end); + const size_t hash_key = hash(segment); // TODO(lvhan): add salt to ensure the hash security int block_id = seq.blocks[idx]; uint64_t block_unique_id = seq.block_unique_ids[idx]; if (auto it = node->children.find(hash_key); it != node->children.end()) { - if (curr_tokens == it->second->tokens) { // fast-forward + if (segment == it->second->tokens) { // fast-forward node = it->second; node->block_id = block_id; node->block_unique_id = block_unique_id; @@ -97,7 +93,7 @@ std::tuple BlockTrie::Cache(const Sequence& seq, const std: // insert new node node = node->children.emplace_hint(it, hash_key, std::make_shared())->second; node->hash_key = hash_key; - node->tokens = curr_tokens; + node->tokens = segment; node->block_id = block_id; node->block_unique_id = block_unique_id; new_cached += block_seq_len_; diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 3c85bd573e..50e669ae49 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -1,14 +1,17 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/models/llama/SequenceManager.h" -#include "src/turbomind/kernels/attention/block.h" -#include "src/turbomind/models/llama/BlockManager.h" -#include "src/turbomind/utils/debug_utils.h" -#include "src/turbomind/utils/logger.h" #include #include #include #include + +#include "src/turbomind/kernels/attention/block.h" +#include "src/turbomind/models/llama/BlockManager.h" +#include "src/turbomind/models/llama/SequenceManager.h" +#include "src/turbomind/utils/logger.h" + +// #include "dbg.h" + namespace turbomind { template @@ -114,19 +117,19 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) } for (int i = 0; i < active_size; ++i) { - if (auto& seq = *sequences[i]; !seq.prompt.empty() && seq.cache_len >= seq.prompt.size()) { - BlockIds block_ids; - UniqueIds block_unique_ids; - std::tie(block_ids, block_unique_ids) = block_trie_->Cache(seq, seq.prompt); + if (auto& seq = *sequences[i]; !seq.prompt.empty()) { + const auto& [block_ids, unique_ids] = block_trie_->Cache(seq, seq.prompt); if (rank_ == 0) { // clang-format off TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached blocks %d, tokens %d", seq.id, (int)block_ids.size(), (int)seq.prompt.size()); TM_LOG_DEBUG("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, unique_ids %s", seq.id, - vector2string(block_ids).c_str(), vector2string(block_unique_ids).c_str()); + vector2string(block_ids).c_str(), vector2string(unique_ids).c_str()); // clang-format on } - seq.prompt.clear(); + if (seq.cache_len >= seq.prompt.size()) { + seq.prompt.clear(); + } } } } @@ -137,19 +140,15 @@ void SequenceManager::CacheGeneration(const Sequence& seq) return; } - BlockIds block_ids; - UniqueIds block_unique_ids; + const auto& [block_ids, unique_ids] = block_trie_->Cache(seq, seq.tokens); - std::tie(block_ids, block_unique_ids) = block_trie_->Cache(seq, seq.tokens); if (rank_ == 0) { + // clang-format off TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached blocks %d, tokens %d", - seq.id, - block_ids.size(), - seq.tokens.size()); - TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, unique_ids %s", - seq.id, - vector2string(block_ids).c_str(), - vector2string(block_unique_ids).c_str()); + seq.id, (int)block_ids.size(), (int)seq.tokens.size()); + TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, unique_ids %s", seq.id, + vector2string(block_ids).c_str(), vector2string(unique_ids).c_str()); + // clang-format on } } @@ -161,7 +160,7 @@ void SequenceManager::VerifyAndLockCached(const Sequences& sequences) if (seq.status != Sequence::kCached) { continue; } - FT_CHECK(seq.blocks.size() == seq.block_unique_ids.size()); + TM_CHECK_EQ(seq.blocks.size(), seq.block_unique_ids.size()); // Verify cache blocks that may be invalidated const int count = block_manager_->Verify(seq.blocks, seq.block_unique_ids); seq.blocks.resize(count); @@ -189,7 +188,7 @@ void SequenceManager::CommitUnlockAndFree() void SequenceManager::UpdateAndSetUnlock(const Sequence& sequence) { - FT_CHECK(sequence.status != Sequence::kCached); + TM_CHECK_NE(sequence.status, Sequence::kCached); auto& seq = const_cast(sequence); block_manager_->Touch(seq.blocks); unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end()); @@ -217,14 +216,14 @@ struct Schedule { Sequences victims; Schedule(Snapshot snapshot, int size, int max_fwd_tokens, int max_tmp_tokens): - free(snapshot.free), - cached(snapshot.cached), - last(size), - use_count_(std::move(snapshot.use_count)), - unlocked_(size), - it_(size), + free{snapshot.free}, + cached{snapshot.cached}, + last{size}, max_fwd_tokens{max_fwd_tokens}, - max_tmp_tokens{max_tmp_tokens} + max_tmp_tokens{max_tmp_tokens}, + use_count_{std::move(snapshot.use_count)}, + unlocked_{size}, + it_{size} { } @@ -281,16 +280,14 @@ struct Transaction { const Sequences& sequences_; Schedule& schedule_; - std::shared_ptr block_trie_; - explicit Transaction( const Sequences& sequences, int index, int block_count, int input_len, int temp_len, Schedule& sched): - sequences_(sequences), - schedule_(sched), - index_(index), - block_count_(block_count), - input_len_(input_len), - temp_len_{temp_len} + index_{index}, + block_count_{block_count}, + input_len_{input_len}, + temp_len_{temp_len}, + sequences_{sequences}, + schedule_{sched} { } @@ -334,10 +331,10 @@ struct Transaction { { // update available resources schedule_.free -= allocate_; - FT_CHECK(schedule_.free >= 0); + TM_CHECK_GE(schedule_.free, 0); schedule_.cached += preempt_; schedule_.cached -= evict_; - FT_CHECK(schedule_.cached >= 0); + TM_CHECK_GE(schedule_.cached, 0); // update scheduled operations schedule_.allocate += allocate_; @@ -399,13 +396,13 @@ void SequenceManager::AssignAndActivate(const Sequences& sequences, // const BlockIds& blocks, const UniqueIds& unique_ids) { - FT_CHECK(sequences.size() == counts.size()); + TM_CHECK_EQ(sequences.size(), counts.size()); int first = 0; for (int i = 0; i < sequences.size(); ++i) { auto& s = const_cast(*sequences[i]); auto count = counts[i]; int last = first + count; - FT_CHECK(last <= blocks.size()); + TM_CHECK_LE(last, blocks.size()); s.blocks.insert(s.blocks.end(), blocks.begin() + first, blocks.begin() + last); s.block_unique_ids.insert(s.block_unique_ids.end(), unique_ids.begin() + first, unique_ids.begin() + last); s.status = Sequence::kActive; @@ -420,40 +417,44 @@ void SequenceManager::PrefixMatch(Sequences& sequences, const std::vector& } for (int i = 0; i < sequences.size(); i++) { - BlockIds block_ids; - UniqueIds unique_ids; - auto& seq = const_cast(*sequences[i]); - if (seq.cache_len != 0 || alpha[i] != 0) { - // We only apply prefix-cache matching when seq.cache_len is 0, - // which means this seq is a brand-new sequence. - // seq.cache_len is updated after every forward iter. Refer to `LlamaBatch::Forward` + + auto& seq = const_cast(*sequences[i]); + + /// TODO: Is there a way to exploit the alpha[i] != 0 case? + if (alpha[i] != 0 || seq.cache_len >= seq.prompt.size()) { continue; } - std::tie(block_ids, unique_ids) = block_trie_->Match(seq); - block_manager_->Lock(block_ids); - int valid = block_ids.size(); + const auto& [block_ids, unique_ids] = block_trie_->Match(seq); + if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len); - TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", - seq.id, - vector2string(block_ids).c_str(), - vector2string(unique_ids).c_str()); + // clang-format off + TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, (int)block_ids.size(), seq.cache_len); + TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", seq.id, + vector2string(block_ids).c_str(), vector2string(unique_ids).c_str()); + // clang-format on + } + + /// TODO: `Unlock` and `Lock` can't be batched because there may be repeated blocks between sequences + if (const int offset = seq.cache_len / block_seq_len_; offset < block_ids.size()) { + if (BlockIds tail{seq.blocks.begin() + offset, seq.blocks.end()}; !tail.empty()) { + block_manager_->Unlock(tail); + seq.blocks.resize(offset); + seq.block_unique_ids.resize(offset); + } + seq.blocks.insert(seq.blocks.end(), block_ids.begin() + offset, block_ids.end()); + seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin() + offset, unique_ids.end()); + seq.cache_len = seq.blocks.size() * block_seq_len_; + block_manager_->Lock({block_ids.begin() + offset, block_ids.end()}); } - FT_CHECK(seq.blocks.empty()); - seq.cache_len = valid * block_seq_len_; - seq.blocks.insert(seq.blocks.end(), block_ids.begin(), block_ids.end()); - seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.end()); if (rank_ == 0) { + // clang-format off TM_LOG_INFO("[SeqMgr][match] ID %llu, after matching, blocks %d, cache_len %d", - seq.id, - seq.blocks.size(), - seq.cache_len); - TM_LOG_DEBUG("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s", - seq.id, - vector2string(seq.blocks).c_str(), - vector2string(seq.block_unique_ids).c_str()); + seq.id, seq.blocks.size(), seq.cache_len); + TM_LOG_DEBUG("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s", seq.id, + vector2string(seq.blocks).c_str(), vector2string(seq.block_unique_ids).c_str()); + // clang-format on } } } @@ -506,23 +507,23 @@ auto SequenceManager::Materialize(Sequences sequences, // combine allocate and evict since evicted blocks are reused by allocation schedule.allocate += schedule.evict; - if (schedule.allocate) { - dbg(*block_manager_); - } + // if (schedule.allocate) { + // dbg(*block_manager_); + // } Outcome outcome{}; outcome.allocation = schedule.allocate; outcome.swap_in = std::count_if(schedule.active.begin(), schedule.active.end(), [](auto p) { - if (p->status != Sequence::kActive) { - dbg(*p); - } - return p->status != Sequence::kActive; // + // if (p->status != Sequence::kActive) { + // dbg(*p); + // } + return p->status != Sequence::kActive; }); - outcome.swap_out = std::count_if(schedule.inactive.begin(), schedule.inactive.end(), [](auto p) { - if (p->status == Sequence::kActive) { - dbg(*p); - } - return p->status == Sequence::kActive; // + outcome.swap_out = std::count_if(schedule.inactive.begin(), schedule.inactive.end(), [](auto p) { + // if (p->status == Sequence::kActive) { + // dbg(*p); + // } + return p->status == Sequence::kActive; }); // release preempted blocks -> cached diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index fc19eef038..f926a3ebd5 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -115,12 +115,12 @@ class SequenceManager { // seq_len += output // cache += input + output - 1 or cache = seq_len - 1 - [[nodiscard]] Outcome Materialize(Sequences sequences, - std::vector context_length, - std::vector alpha, - std::vector priorities, - int max_fwd_tokens, - int max_tmp_tokens); + [[maybe_unused]] Outcome Materialize(Sequences sequences, + std::vector context_length, + std::vector alpha, + std::vector priorities, + int max_fwd_tokens, + int max_tmp_tokens); /** @brief cache the input prompt tokens of each seq in sequences[0:active_size-1] *