@@ -1068,6 +1068,12 @@ class GPUKVCacheMangerImpl
10681068 host_kv_mgr->append_kvdata_v2 (uid, offload_startpos[seq_idx], offload_lengths[seq_idx], input_ptr, gather_layer_stride);
10691069 this ->_uid_to_offloaded_length [uid] = offload_startpos[seq_idx] + offload_lengths[seq_idx];
10701070 page_offset += offload_lengths[seq_idx] / this ->num_tokens_per_page ;
1071+ {
1072+ std::unique_lock<std::mutex> lock (queued_offload_lastpos_mutex_);
1073+ if (offload_startpos[seq_idx] + offload_lengths[seq_idx] == queued_offload_lastpos[uid]) {
1074+ queued_offload_lastpos.erase (uid);
1075+ }
1076+ }
10711077 }
10721078 }
10731079
@@ -1124,6 +1130,8 @@ class GPUKVCacheMangerImpl
11241130 std::queue<std::tuple<std::vector<int >, at::Tensor, at::Tensor, cudaEvent_t>> offload_task_queue;
11251131 std::mutex offload_task_mutex_;
11261132 std::condition_variable offload_task_cv_;
1133+ std::unordered_map<int64_t , int > queued_offload_lastpos;
1134+ std::mutex queued_offload_lastpos_mutex_;
11271135
11281136 int num_offload_memcpy_worker;
11291137 std::vector<std::thread> offload_memcpy_worker;
@@ -1224,10 +1232,20 @@ void prepare_kvcache(
12241232 new_history_offsets[seq_idx + 1 ] = new_history_offsets[seq_idx] + total_history_length - old_history_lengths[seq_idx];
12251233
12261234 auto offloaded_length = 0 ;
1235+ auto chunked_length = total_history_length - total_history_length % gpu_mgr.num_tokens_per_chunk ;
12271236 if (gpu_mgr._uid_to_offloaded_length .find (uid) != gpu_mgr._uid_to_offloaded_length .end ())
12281237 offloaded_length = gpu_mgr._uid_to_offloaded_length [uid];
1238+ {
1239+ std::unique_lock<std::mutex> lock (gpu_mgr.queued_offload_lastpos_mutex_ );
1240+ if (gpu_mgr.queued_offload_lastpos .find (uid) != gpu_mgr.queued_offload_lastpos .end ()) {
1241+ offloaded_length = gpu_mgr.queued_offload_lastpos [uid];
1242+ }
1243+ if (total_history_length - offloaded_length >= gpu_mgr.num_tokens_per_chunk ) {
1244+ gpu_mgr.queued_offload_lastpos [uid] = chunked_length;
1245+ }
1246+ }
12291247 if (total_history_length - offloaded_length >= gpu_mgr.num_tokens_per_chunk ) {
1230- auto chunked_length = total_history_length - total_history_length % gpu_mgr.num_tokens_per_chunk ;
1248+ // auto chunked_length = total_history_length - total_history_length % gpu_mgr.num_tokens_per_chunk;
12311249 auto new_offload_page_start = (offloaded_length - gpu_cache_startpos) / gpu_mgr.num_tokens_per_page ;
12321250
12331251 offload_user_ids[num_offload_uids] = uid;
0 commit comments