diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py
index 3c72722610..f1e72b265b 100644
--- a/benchmark/profile_pipeline_api.py
+++ b/benchmark/profile_pipeline_api.py
@@ -275,6 +275,7 @@ def parse_args():
     ArgumentHelper.num_tokens_per_iter(tb_group)
     ArgumentHelper.max_prefill_iters(tb_group)
     ArgumentHelper.communicator(tb_group)
+    ArgumentHelper.async_(tb_group)
 
     args = parser.parse_args()
     return args
@@ -285,19 +286,19 @@ def main():
     random.seed(args.seed)
     os.environ['TM_LOG_LEVEL'] = args.log_level
     if args.backend == 'turbomind':
-        engine_config = TurbomindEngineConfig(
-            max_batch_size=args.concurrency,
-            tp=args.tp,
-            cache_max_entry_count=args.cache_max_entry_count,
-            session_len=args.session_len,
-            cache_block_seq_len=args.cache_block_seq_len,
-            model_format=args.model_format,
-            quant_policy=args.quant_policy,
-            num_tokens_per_iter=args.num_tokens_per_iter,
-            max_prefill_iters=args.max_prefill_iters,
-            enable_prefix_caching=args.enable_prefix_caching,
-            communicator=args.communicator,
-        )
+        engine_config = TurbomindEngineConfig(max_batch_size=args.concurrency,
+                                              tp=args.tp,
+                                              cache_max_entry_count=args.cache_max_entry_count,
+                                              session_len=args.session_len,
+                                              cache_block_seq_len=args.cache_block_seq_len,
+                                              model_format=args.model_format,
+                                              quant_policy=args.quant_policy,
+                                              num_tokens_per_iter=args.num_tokens_per_iter,
+                                              max_prefill_iters=args.max_prefill_iters,
+                                              enable_prefix_caching=args.enable_prefix_caching,
+                                              communicator=args.communicator,
+                                              enable_metrics=False,
+                                              async_=args.async_)
     elif args.backend == 'pytorch':
         engine_config = PytorchEngineConfig(
             cache_max_entry_count=args.cache_max_entry_count,
diff --git a/src/turbomind/engine/engine.cc b/src/turbomind/engine/engine.cc
index 867717926c..db9a3ff3a3 100644
--- a/src/turbomind/engine/engine.cc
+++ b/src/turbomind/engine/engine.cc
@@ -374,9 +374,6 @@ void Engine::Impl::Accept(const Requests& rs, vector<Signal>& signals)
 {
     auto& s = states_.at(0);
 
-    const int offset = s.rc.size();
-    int       index  = offset;
-
     vector<unique_ptr<RequestCache>> incoming;
     incoming.reserve(rs.size());
 
@@ -522,7 +519,7 @@ void Engine::Impl::Schedule()
 
     // dbg("Schedule");
 
-    auto outcome = seq_mgr_->Materialize(
+    seq_mgr_->Materialize(
         sequences, context_length, alpha, priorities, param_.max_forward_token_num, param_.max_context_token_num);
 
     vector<int> idxs(sequences.size());
@@ -703,12 +700,12 @@ void Engine::Impl::Update(BatchData& b, std::vector<Signal>& signals)
                     s.tokens.insert(s.tokens.end(), c.token_ids + c.seq_len - new_tokens, c.token_ids + c.seq_len);
                 }
                 if (TM_UNLIKELY(finished[i])) {
-                    signals.push_back([this, r = c.req, l = c.seq_len] {  //
+                    signals.push_back([r = c.req, l = c.seq_len] {  //
                         UpdateState(*r, Request::kFinish, l);
                     });
                 }
                 else if (c.req->stream_output) {
-                    signals.push_back([this, r = c.req, l = c.seq_len] {  //
+                    signals.push_back([r = c.req, l = c.seq_len] {  //
                         UpdateState(*r, Request::kOk, l);
                     });
                 }
diff --git a/src/turbomind/models/llama/BlockManager.cc b/src/turbomind/models/llama/BlockManager.cc
index d04634a287..7be87d73c7 100644
--- a/src/turbomind/models/llama/BlockManager.cc
+++ b/src/turbomind/models/llama/BlockManager.cc
@@ -1,32 +1,15 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 
+#include <algorithm>
+
 #include "src/turbomind/models/llama/BlockManager.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/debug_utils.h"
 #include "src/turbomind/utils/logger.h"
 #include "src/turbomind/utils/string_utils.h"
-#include <algorithm>
-#include <iterator>
-#include <stdexcept>
 
 namespace turbomind {
 
-size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic<size_t>& value)
-{
-    size_t free{};
-    size_t total{};
-    check_cuda_error(cudaMemGetInfo(&free, &total));
-
-    // atomicMin
-    auto old = value.load();
-    while (old > free && !value.compare_exchange_weak(old, free)) {}
-
-    // wait for all ranks
-    barrier.wait();
-
-    return value.load();
-}
-
 BlockManager::BlockManager(
     size_t block_size, double block_count, int chunk_size, core::Allocator allocator, GetFreeMemSize get_free_size):
     block_size_(block_size), allocator_(allocator)
@@ -106,18 +89,18 @@ size_t BlockManager::GetBlockCount(size_t block_size, double ratio, GetFreeMemSi
 
 void BlockManager::Move(std::vector<int>& src, const std::vector<int>& delta, std::vector<int>& dst)
 {
-    FT_CHECK(src.size() >= delta.size());
+    TM_CHECK_GE(src.size(), delta.size());  // guards the unsigned subtraction that sizes src1 below
     std::vector<int> src1(src.size() - delta.size());
     {
         auto end = std::set_difference(src.begin(), src.end(), delta.begin(), delta.end(), src1.begin());
-        FT_CHECK(end == src1.end());
+        TM_CHECK(end == src1.end());  // exact fill <=> every id in delta was found in src (sorted inputs assumed)
     }
     src.swap(src1);
 
     std::vector<int> dst1(dst.size() + delta.size());
     {
         auto end = std::set_union(dst.begin(), dst.end(), delta.begin(), delta.end(), dst1.begin());
-        FT_CHECK(end == dst1.end());
+        TM_CHECK(end == dst1.end());  // exact fill <=> delta and dst share no id (sorted inputs assumed)
     }
     dst.swap(dst1);
 }
@@ -136,10 +119,11 @@ auto BlockManager::Allocate(int count) -> std::pair<BlockIds, UniqueIds>
     for (int i = 0; i < count; ++i) {
         int   idx = free_ids_[i];
         auto& b   = blocks_[idx];
-        FT_CHECK(is_free(b));  // pre-condition: uc == 0 && ts == 0
+        TM_CHECK(is_free(b));  // pre-condition: uc == 0 && ts == 0
         b.use_count = 1;
         b.unique_id = unique_id_++;
-        FT_CHECK(is_active(b));  // post-condition
+        b.timestamp = timestamp_++;
+        TM_CHECK(is_active(b));  // post-condition
         block_ids[i]  = idx;
         unique_ids[i] = b.unique_id;
     }
@@ -153,7 +137,7 @@ auto BlockManager::Allocate(int count) -> std::pair<BlockIds, UniqueIds>
 
 void BlockManager::Evict(int count)
 {
-    FT_CHECK(count <= cached_ids_.size());
+    TM_CHECK_LE(count, cached_ids_.size());
     std::vector<int> idxs(cached_ids_);
     // get first `count` cached ids according to timestamp
     std::nth_element(idxs.begin(), idxs.begin() + count, idxs.end(), [&](int i, int j) {
@@ -167,10 +151,10 @@ void BlockManager::Evict(int count)
     // set as free
     for (const auto& idx : idxs) {
         auto& b = blocks_[idx];
-        FT_CHECK(is_cached(b));
+        TM_CHECK(is_cached(b));  // pre-condition
         b.unique_id = 0;
         b.timestamp = 0;
-        FT_CHECK(is_free(b));
+        TM_CHECK(is_free(b));  // post-condition
     }
 
     Move(cached_ids_, idxs, free_ids_);
@@ -184,10 +168,10 @@ void BlockManager::Free(BlockIds ids)
 
     for (const auto& i : ids) {
         auto& b = blocks_[i];
-        FT_CHECK(is_cached(b));  // uc == 0 && ts != 0
+        TM_CHECK(is_cached(b));  // pre-condition
         b.unique_id = 0;
         b.timestamp = 0;
-        FT_CHECK(is_free(b));
+        TM_CHECK(is_free(b));  // post-condition
     }
 
     Move(cached_ids_, ids, free_ids_);
@@ -200,10 +184,10 @@ int BlockManager::Unlock(const BlockIds& ids)
 
     for (const auto& i : ids) {
         auto& b = blocks_[i];
-        FT_CHECK(is_active(b));  // pre-condition: uc > 0
+        TM_CHECK(is_active(b));  // pre-condition
         if (--b.use_count == 0) {
             unlock.push_back(b.id);
-            FT_CHECK(is_cached(b));  // post-condition
+            TM_CHECK(is_cached(b));  // post-condition
         }
     }
 
@@ -224,7 +208,7 @@ int BlockManager::Lock(const BlockIds& ids)
         auto& b = blocks_[i];
         if (++b.use_count == 1) {
             lock.push_back(i);
-            FT_CHECK(is_active(b));
+            TM_CHECK(is_active(b));  // post-condition
         }
     }
 
@@ -240,14 +224,14 @@ int BlockManager::Lock(const BlockIds& ids)
 void BlockManager::Touch(const BlockIds& ids)
 {
     std::for_each(ids.crbegin(), ids.crend(), [this](int i) {
-        FT_CHECK(is_active(blocks_[i]));
+        TM_CHECK(is_active(blocks_[i]));  // pre-condition
         blocks_[i].timestamp = timestamp_++;
     });
 }
 
 int BlockManager::Verify(const std::vector<int>& block_ids, const std::vector<uint64_t>& unique_ids)
 {
-    FT_CHECK(block_ids.size() == unique_ids.size());
+    TM_CHECK_EQ(block_ids.size(), unique_ids.size());
     int valid = block_ids.size();
     for (int i = 0; i < block_ids.size(); ++i) {
         if (unique_id(block_ids[i]) != unique_ids[i]) {
@@ -260,8 +244,8 @@ int BlockManager::Verify(const std::vector<int>& block_ids, const std::vector<ui
         miss += (unique_id(block_ids[i]) != unique_ids[i]);
     }
     // All later blocks should have been invalidated
-    FT_CHECK_WITH_INFO(miss == (int)block_ids.size() - valid,
-                       fmtstr("count = %d, valid = %d, miss = %d", (int)block_ids.size(), valid, miss));
+    TM_CHECK_EQ(miss, (int)block_ids.size() - valid)
+        << fmtstr("count = %d, valid = %d, miss = %d", (int)block_ids.size(), valid, miss);
     return valid;
 }
 
diff --git a/src/turbomind/models/llama/BlockManager.h b/src/turbomind/models/llama/BlockManager.h
index d8f48e2633..8b50353a25 100644
--- a/src/turbomind/models/llama/BlockManager.h
+++ b/src/turbomind/models/llama/BlockManager.h
@@ -68,8 +68,6 @@ struct Snapshot {
 
 using GetFreeMemSize = std::function<size_t()>;
 
-size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic<size_t>& value);
-
 class BlockManager {
 public:
     explicit BlockManager(
diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc
index 391a923143..d63bbcf1fd 100644
--- a/src/turbomind/models/llama/BlockTrie.cc
+++ b/src/turbomind/models/llama/BlockTrie.cc
@@ -22,37 +22,35 @@ BlockTrie::BlockTrie(size_t block_len, std::shared_ptr<BlockManager> block_manag
 
 std::tuple<BlockIds, UniqueIds> BlockTrie::Match(const Sequence& seq)
 {
-    BlockIds  matched_blocks;
-    UniqueIds matched_unique_ids;
+    BlockIds  block_ids;
+    UniqueIds unique_ids;
 
-    std::shared_ptr<TrieNode> curr_node   = root_;
-    int                       num_matched = 0;
+    auto node  = root_;
+    auto first = seq.prompt.begin();
 
     // Warning: Do not use "<=" operator even when seq.prompt length is evenly
-    // divisible by block_seq_len_. This may produce an input_length of zero for
-    // the sequence, violating the precondition checked in LlamaBatch::Forward.
-    while (num_matched + block_seq_len_ < seq.prompt.size()) {
-        std::vector<int> curr_tokens(seq.prompt.begin() + num_matched,
-                                     seq.prompt.begin() + num_matched + block_seq_len_);
-        size_t           hash_key = hash(curr_tokens);
-
-        auto it = curr_node->children.find(hash_key);
-
-        if (it == curr_node->children.end()) {
-            break;
+    // divisible by block_seq_len_. The model needs at least one input token to generate output.
+    while (static_cast<size_t>(seq.prompt.end() - first) > block_seq_len_) {  // never form an iterator past end()
+        const std::vector<int> segment{first, first + block_seq_len_};
+        const size_t           hash_key = hash(segment);
+        if (const auto it = node->children.find(hash_key); it != node->children.end()) {
+            if (segment == it->second->tokens) {
+                block_ids.push_back(it->second->block_id);
+                unique_ids.push_back(it->second->block_unique_id);
+                node = it->second;
+                first += block_seq_len_;
+            }
+            else {
+                TM_LOG_WARNING("hash collision detected");
+                break;
+            }
         }
-
-        if (curr_tokens != it->second->tokens) {
-            TM_LOG_WARNING("hash key cache hit, but tokens are not the same");
+        else {
             break;
         }
-
-        matched_blocks.emplace_back(it->second->block_id);
-        matched_unique_ids.emplace_back(it->second->block_unique_id);
-        curr_node = it->second;
-        num_matched += block_seq_len_;
     }
-    return std::make_tuple(matched_blocks, matched_unique_ids);
+
+    return std::make_tuple(block_ids, unique_ids);
 }
 
 std::tuple<BlockIds, UniqueIds> BlockTrie::Cache(const Sequence& seq, const std::vector<int>& tokens)
@@ -62,7 +60,6 @@ std::tuple<BlockIds, UniqueIds> BlockTrie::Cache(const Sequence& seq, const std:
     TM_CHECK_LE(seq.cache_len, seq.blocks.size() * block_seq_len_);
 
     auto node = root_;
-    int  idx  = 0;
 
     BlockIds  cache_block_ids;
     UniqueIds cache_block_unique_ids;
@@ -75,15 +72,14 @@ std::tuple<BlockIds, UniqueIds> BlockTrie::Cache(const Sequence& seq, const std:
         auto start = tokens.begin() + idx * block_seq_len_;
         auto end   = start + block_seq_len_;
 
-        std::vector<int> curr_tokens(start, end);
-        // TODO(lvhan): add salt to ensure the hash security
-        size_t hash_key = hash(curr_tokens);
+        const std::vector<int> segment(start, end);
+        const size_t           hash_key = hash(segment);  // TODO(lvhan): add salt to ensure the hash security
 
         int      block_id        = seq.blocks[idx];
         uint64_t block_unique_id = seq.block_unique_ids[idx];
 
         if (auto it = node->children.find(hash_key); it != node->children.end()) {
-            if (curr_tokens == it->second->tokens) {  // fast-forward
+            if (segment == it->second->tokens) {  // fast-forward
                 node                  = it->second;
                 node->block_id        = block_id;
                 node->block_unique_id = block_unique_id;
@@ -97,7 +93,7 @@ std::tuple<BlockIds, UniqueIds> BlockTrie::Cache(const Sequence& seq, const std:
             // insert new node
             node                  = node->children.emplace_hint(it, hash_key, std::make_shared<TrieNode>())->second;
             node->hash_key        = hash_key;
-            node->tokens          = curr_tokens;
+            node->tokens          = segment;
             node->block_id        = block_id;
             node->block_unique_id = block_unique_id;
             new_cached += block_seq_len_;
diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc
index 3c85bd573e..50e669ae49 100644
--- a/src/turbomind/models/llama/SequenceManager.cc
+++ b/src/turbomind/models/llama/SequenceManager.cc
@@ -1,14 +1,17 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 
-#include "src/turbomind/models/llama/SequenceManager.h"
-#include "src/turbomind/kernels/attention/block.h"
-#include "src/turbomind/models/llama/BlockManager.h"
-#include "src/turbomind/utils/debug_utils.h"
-#include "src/turbomind/utils/logger.h"
 #include <cstddef>
 #include <cstdlib>
 #include <ctime>
 #include <numeric>
+
+#include "src/turbomind/kernels/attention/block.h"
+#include "src/turbomind/models/llama/BlockManager.h"
+#include "src/turbomind/models/llama/SequenceManager.h"
+#include "src/turbomind/utils/logger.h"
+
+// #include "dbg.h"
+
 namespace turbomind {
 
 template<typename T>
@@ -114,19 +117,19 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size)
     }
 
     for (int i = 0; i < active_size; ++i) {
-        if (auto& seq = *sequences[i]; !seq.prompt.empty() && seq.cache_len >= seq.prompt.size()) {
-            BlockIds  block_ids;
-            UniqueIds block_unique_ids;
-            std::tie(block_ids, block_unique_ids) = block_trie_->Cache(seq, seq.prompt);
+        if (auto& seq = *sequences[i]; !seq.prompt.empty()) {
+            const auto& [block_ids, unique_ids] = block_trie_->Cache(seq, seq.prompt);
             if (rank_ == 0) {
                 // clang-format off
                 TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached blocks %d, tokens %d", seq.id,
                             (int)block_ids.size(), (int)seq.prompt.size());
                 TM_LOG_DEBUG("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, unique_ids %s", seq.id,
-                             vector2string(block_ids).c_str(), vector2string(block_unique_ids).c_str());
+                             vector2string(block_ids).c_str(), vector2string(unique_ids).c_str());
                 // clang-format on
             }
-            seq.prompt.clear();
+            if (seq.cache_len >= seq.prompt.size()) {
+                seq.prompt.clear();
+            }
         }
     }
 }
@@ -137,19 +140,15 @@ void SequenceManager::CacheGeneration(const Sequence& seq)
         return;
     }
 
-    BlockIds  block_ids;
-    UniqueIds block_unique_ids;
+    const auto& [block_ids, unique_ids] = block_trie_->Cache(seq, seq.tokens);
 
-    std::tie(block_ids, block_unique_ids) = block_trie_->Cache(seq, seq.tokens);
     if (rank_ == 0) {
+        // clang-format off
         TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached blocks %d, tokens %d",
-                    seq.id,
-                    block_ids.size(),
-                    seq.tokens.size());
-        TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, unique_ids %s",
-                     seq.id,
-                     vector2string(block_ids).c_str(),
-                     vector2string(block_unique_ids).c_str());
+                    seq.id, (int)block_ids.size(), (int)seq.tokens.size());
+        TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, unique_ids %s", seq.id,
+                     vector2string(block_ids).c_str(), vector2string(unique_ids).c_str());
+        // clang-format on
     }
 }
 
@@ -161,7 +160,7 @@ void SequenceManager::VerifyAndLockCached(const Sequences& sequences)
         if (seq.status != Sequence::kCached) {
             continue;
         }
-        FT_CHECK(seq.blocks.size() == seq.block_unique_ids.size());
+        TM_CHECK_EQ(seq.blocks.size(), seq.block_unique_ids.size());
         // Verify cache blocks that may be invalidated
         const int count = block_manager_->Verify(seq.blocks, seq.block_unique_ids);
         seq.blocks.resize(count);
@@ -189,7 +188,7 @@ void SequenceManager::CommitUnlockAndFree()
 
 void SequenceManager::UpdateAndSetUnlock(const Sequence& sequence)
 {
-    FT_CHECK(sequence.status != Sequence::kCached);
+    TM_CHECK_NE(sequence.status, Sequence::kCached);
     auto& seq = const_cast<Sequence&>(sequence);
     block_manager_->Touch(seq.blocks);
     unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end());
@@ -217,14 +216,14 @@ struct Schedule {
     Sequences        victims;
 
     Schedule(Snapshot snapshot, int size, int max_fwd_tokens, int max_tmp_tokens):
-        free(snapshot.free),
-        cached(snapshot.cached),
-        last(size),
-        use_count_(std::move(snapshot.use_count)),
-        unlocked_(size),
-        it_(size),
+        free{snapshot.free},
+        cached{snapshot.cached},
+        last{size},
         max_fwd_tokens{max_fwd_tokens},
-        max_tmp_tokens{max_tmp_tokens}
+        max_tmp_tokens{max_tmp_tokens},
+        use_count_{std::move(snapshot.use_count)},
+        unlocked_(size),  // parentheses on purpose: braces would select a container's initializer_list ctor
+        it_{size}
     {
     }
 
@@ -281,16 +280,14 @@ struct Transaction {
     const Sequences& sequences_;
     Schedule&        schedule_;
 
-    std::shared_ptr<BlockTrie> block_trie_;
-
     explicit Transaction(
         const Sequences& sequences, int index, int block_count, int input_len, int temp_len, Schedule& sched):
-        sequences_(sequences),
-        schedule_(sched),
-        index_(index),
-        block_count_(block_count),
-        input_len_(input_len),
-        temp_len_{temp_len}
+        index_{index},
+        block_count_{block_count},
+        input_len_{input_len},
+        temp_len_{temp_len},
+        sequences_{sequences},  // sequences_ / schedule_ are reference members: bound here, not copied
+        schedule_{sched}
     {
     }
 
@@ -334,10 +331,10 @@ struct Transaction {
     {
         // update available resources
         schedule_.free -= allocate_;
-        FT_CHECK(schedule_.free >= 0);
+        TM_CHECK_GE(schedule_.free, 0);
         schedule_.cached += preempt_;
         schedule_.cached -= evict_;
-        FT_CHECK(schedule_.cached >= 0);
+        TM_CHECK_GE(schedule_.cached, 0);
 
         // update scheduled operations
         schedule_.allocate += allocate_;
@@ -399,13 +396,13 @@ void SequenceManager::AssignAndActivate(const Sequences&        sequences,  //
                                         const BlockIds&         blocks,
                                         const UniqueIds&        unique_ids)
 {
-    FT_CHECK(sequences.size() == counts.size());
+    TM_CHECK_EQ(sequences.size(), counts.size());
     int first = 0;
     for (int i = 0; i < sequences.size(); ++i) {
         auto& s     = const_cast<Sequence&>(*sequences[i]);
         auto  count = counts[i];
         int   last  = first + count;
-        FT_CHECK(last <= blocks.size());
+        TM_CHECK_LE(last, blocks.size());
         s.blocks.insert(s.blocks.end(), blocks.begin() + first, blocks.begin() + last);
         s.block_unique_ids.insert(s.block_unique_ids.end(), unique_ids.begin() + first, unique_ids.begin() + last);
         s.status = Sequence::kActive;
@@ -420,40 +417,44 @@ void SequenceManager::PrefixMatch(Sequences& sequences, const std::vector<int>&
     }
 
     for (int i = 0; i < sequences.size(); i++) {
-        BlockIds  block_ids;
-        UniqueIds unique_ids;
-        auto&     seq = const_cast<Sequence&>(*sequences[i]);
-        if (seq.cache_len != 0 || alpha[i] != 0) {
-            // We only apply prefix-cache matching when seq.cache_len is 0,
-            // which means this seq is a brand-new sequence.
-            // seq.cache_len is updated after every forward iter. Refer to `LlamaBatch::Forward`
+
+        auto& seq = const_cast<Sequence&>(*sequences[i]);
+
+        /// TODO: Is there a way to exploit the alpha[i] != 0 case?
+        if (alpha[i] != 0 || seq.cache_len >= seq.prompt.size()) {
             continue;
         }
-        std::tie(block_ids, unique_ids) = block_trie_->Match(seq);
 
-        block_manager_->Lock(block_ids);
-        int valid = block_ids.size();
+        const auto& [block_ids, unique_ids] = block_trie_->Match(seq);
+
         if (rank_ == 0) {
-            TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len);
-            TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s",
-                         seq.id,
-                         vector2string(block_ids).c_str(),
-                         vector2string(unique_ids).c_str());
+            // clang-format off
+            TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, (int)block_ids.size(), seq.cache_len);
+            TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", seq.id,
+                         vector2string(block_ids).c_str(), vector2string(unique_ids).c_str());
+            // clang-format on
+        }
+
+        /// TODO: `Unlock` and `Lock` can't be batched because there may be repeated blocks between sequences
+        if (const int offset = seq.cache_len / block_seq_len_; offset < block_ids.size()) {
+            if (BlockIds tail{seq.blocks.begin() + offset, seq.blocks.end()}; !tail.empty()) {
+                block_manager_->Unlock(tail);
+                seq.blocks.resize(offset);
+                seq.block_unique_ids.resize(offset);
+            }
+            seq.blocks.insert(seq.blocks.end(), block_ids.begin() + offset, block_ids.end());
+            seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin() + offset, unique_ids.end());
+            seq.cache_len = seq.blocks.size() * block_seq_len_;
+            block_manager_->Lock({block_ids.begin() + offset, block_ids.end()});
         }
 
-        FT_CHECK(seq.blocks.empty());
-        seq.cache_len = valid * block_seq_len_;
-        seq.blocks.insert(seq.blocks.end(), block_ids.begin(), block_ids.end());
-        seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.end());
         if (rank_ == 0) {
+            // clang-format off
             TM_LOG_INFO("[SeqMgr][match] ID %llu, after matching, blocks %d, cache_len %d",
-                        seq.id,
-                        seq.blocks.size(),
-                        seq.cache_len);
-            TM_LOG_DEBUG("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s",
-                         seq.id,
-                         vector2string(seq.blocks).c_str(),
-                         vector2string(seq.block_unique_ids).c_str());
+                        seq.id, (int)seq.blocks.size(), seq.cache_len);  // %d expects int, not size_t
+            TM_LOG_DEBUG("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s", seq.id,
+                         vector2string(seq.blocks).c_str(), vector2string(seq.block_unique_ids).c_str());
+            // clang-format on
         }
     }
 }
@@ -506,23 +507,23 @@ auto SequenceManager::Materialize(Sequences             sequences,
     // combine allocate and evict since evicted blocks are reused by allocation
     schedule.allocate += schedule.evict;
 
-    if (schedule.allocate) {
-        dbg(*block_manager_);
-    }
+    // if (schedule.allocate) {
+    //     dbg(*block_manager_);
+    // }
 
     Outcome outcome{};
     outcome.allocation = schedule.allocate;
     outcome.swap_in    = std::count_if(schedule.active.begin(), schedule.active.end(), [](auto p) {
-        if (p->status != Sequence::kActive) {
-            dbg(*p);
-        }
-        return p->status != Sequence::kActive;  //
+        // if (p->status != Sequence::kActive) {
+        //     dbg(*p);
+        // }
+        return p->status != Sequence::kActive;
     });
-    outcome.swap_out   = std::count_if(schedule.inactive.begin(), schedule.inactive.end(), [](auto p) {
-        if (p->status == Sequence::kActive) {
-            dbg(*p);
-        }
-        return p->status == Sequence::kActive;  //
+    outcome.swap_out = std::count_if(schedule.inactive.begin(), schedule.inactive.end(), [](auto p) {
+        // if (p->status == Sequence::kActive) {
+        //     dbg(*p);
+        // }
+        return p->status == Sequence::kActive;
     });
 
     // release preempted blocks -> cached
diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h
index fc19eef038..f926a3ebd5 100644
--- a/src/turbomind/models/llama/SequenceManager.h
+++ b/src/turbomind/models/llama/SequenceManager.h
@@ -115,12 +115,12 @@ class SequenceManager {
     //   seq_len += output
     //     cache += input + output - 1  or  cache = seq_len - 1
 
-    [[nodiscard]] Outcome Materialize(Sequences             sequences,
-                                      std::vector<int>      context_length,
-                                      std::vector<int>      alpha,
-                                      std::vector<uint64_t> priorities,
-                                      int                   max_fwd_tokens,
-                                      int                   max_tmp_tokens);
+    Outcome Materialize(Sequences             sequences,
+                        std::vector<int>      context_length,
+                        std::vector<int>      alpha,
+                        std::vector<uint64_t> priorities,
+                        int                   max_fwd_tokens,
+                        int                   max_tmp_tokens);  // callers may discard the returned Outcome
 
     /** @brief cache the input prompt tokens of each seq in sequences[0:active_size-1]
      *