Skip to content

Commit e6750a7

Browse files
committed
Fix duplicated offloading under OOM
1 parent b1ca740 commit e6750a7

File tree

1 file changed

+19
-1
lines changed

1 file changed

+19
-1
lines changed

examples/hstu/ops/cuda_ops/csrc/paged_kvcache_ops_cuda.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,12 @@ class GPUKVCacheMangerImpl
10681068
host_kv_mgr->append_kvdata_v2(uid, offload_startpos[seq_idx], offload_lengths[seq_idx], input_ptr, gather_layer_stride);
10691069
this->_uid_to_offloaded_length[uid] = offload_startpos[seq_idx] + offload_lengths[seq_idx];
10701070
page_offset += offload_lengths[seq_idx] / this->num_tokens_per_page;
1071+
{
1072+
std::unique_lock<std::mutex> lock(queued_offload_lastpos_mutex_);
1073+
if (offload_startpos[seq_idx] + offload_lengths[seq_idx] == queued_offload_lastpos[uid]) {
1074+
queued_offload_lastpos.erase(uid);
1075+
}
1076+
}
10711077
}
10721078
}
10731079

@@ -1124,6 +1130,8 @@ class GPUKVCacheMangerImpl
11241130
std::queue<std::tuple<std::vector<int>, at::Tensor, at::Tensor, cudaEvent_t>> offload_task_queue;
11251131
std::mutex offload_task_mutex_;
11261132
std::condition_variable offload_task_cv_;
1133+
std::unordered_map<int64_t, int> queued_offload_lastpos;
1134+
std::mutex queued_offload_lastpos_mutex_;
11271135

11281136
int num_offload_memcpy_worker;
11291137
std::vector<std::thread> offload_memcpy_worker;
@@ -1224,10 +1232,20 @@ void prepare_kvcache(
12241232
new_history_offsets[seq_idx + 1] = new_history_offsets[seq_idx] + total_history_length - old_history_lengths[seq_idx];
12251233

12261234
auto offloaded_length = 0;
1235+
auto chunked_length = total_history_length - total_history_length % gpu_mgr.num_tokens_per_chunk;
12271236
if (gpu_mgr._uid_to_offloaded_length.find(uid) != gpu_mgr._uid_to_offloaded_length.end())
12281237
offloaded_length = gpu_mgr._uid_to_offloaded_length[uid];
1238+
{
1239+
std::unique_lock<std::mutex> lock(gpu_mgr.queued_offload_lastpos_mutex_);
1240+
if (gpu_mgr.queued_offload_lastpos.find(uid) != gpu_mgr.queued_offload_lastpos.end()) {
1241+
offloaded_length = gpu_mgr.queued_offload_lastpos[uid];
1242+
}
1243+
if (total_history_length - offloaded_length >= gpu_mgr.num_tokens_per_chunk) {
1244+
gpu_mgr.queued_offload_lastpos[uid] = chunked_length;
1245+
}
1246+
}
12291247
if (total_history_length - offloaded_length >= gpu_mgr.num_tokens_per_chunk) {
1230-
auto chunked_length = total_history_length - total_history_length % gpu_mgr.num_tokens_per_chunk;
1248+
// auto chunked_length = total_history_length - total_history_length % gpu_mgr.num_tokens_per_chunk;
12311249
auto new_offload_page_start = (offloaded_length - gpu_cache_startpos) / gpu_mgr.num_tokens_per_page;
12321250

12331251
offload_user_ids[num_offload_uids] = uid;

0 commit comments

Comments
 (0)