Commit d23d6f5

[C++] Invoke storage allocation for CUDA Graph explicitly (#3042)
This PR adds a function that invokes the storage allocation function generated by the CUDA Graph rewrite. With this function, we now trigger the storage allocation manually at initialization time. We need this because the storage allocation may contain CUDA IPC memory allocations that have to run through a Disco session: if a function that needs the CUDA graph storage allocation first runs outside a Disco session, an error may occur when the allocation has not been initialized in advance.
1 parent e349684 commit d23d6f5
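
For illustration only (not part of the commit), here is a hedged sketch of what the eager initialization amounts to from the Python side, assuming a library compiled with the pipeline below; the library path, device, and direct use of the Relax VM are assumptions, since MLC's C++ FunctionTable performs the equivalent lookup and call shown in the cpp/serve/function_table.cc diff:

    import tvm
    from tvm import relax

    # Hypothetical library path; any artifact produced by the MLC compile
    # pipeline with CUDA graph rewriting enabled would do.
    lib = tvm.runtime.load_module("dist/model_lib.so")
    vm = relax.VirtualMachine(lib, tvm.cuda(0))

    # "cuda_graph_alloc_init" is attached only when the CUDA graph rewrite
    # produced a "cuda_graph_alloc" function; calling it once up front
    # materializes the cached storage allocation instead of leaving it to the
    # first inference call.
    vm["cuda_graph_alloc_init"]()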

File tree: 4 files changed (+41 / -1 lines)

cpp/serve/function_table.cc

Lines changed: 5 additions & 0 deletions
@@ -152,6 +152,10 @@ void FunctionTable::Init(String reload_lib_path, Device device, picojson::object
   }
   ICHECK_EQ(this->model_metadata_.tensor_parallel_shards, num_shards);
   ICHECK_EQ(this->model_metadata_.pipeline_parallel_stages, num_stages);
+  // Invoke the CUDA graph allocation init function if it is defined.
+  if (cuda_graph_alloc_init_func_.defined()) {
+    this->cuda_graph_alloc_init_func_();
+  }
 }

 ObjectRef FunctionTable::LoadParams(const std::string& model_path, Device device) {
@@ -231,6 +235,7 @@ void FunctionTable::_InitFunctions() {
   this->apply_penalty_func_ = mod->GetFunction("apply_penalty_inplace", true);
   this->apply_bitmask_func_ = mod->GetFunction("apply_bitmask_inplace", true);
   this->alloc_embedding_tensor_func_ = mod_get_func("alloc_embedding_tensor");
+  this->cuda_graph_alloc_init_func_ = mod_get_func("cuda_graph_alloc_init");
   this->create_kv_cache_func_ = mod_get_func("create_flashinfer_paged_kv_cache");
   if (this->model_metadata_.sliding_window_size != -1 || !this->create_kv_cache_func_.defined()) {
     PackedFunc f_create_rnn_state = mod_get_func("create_rnn_state");

cpp/serve/function_table.h

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ struct FunctionTable {
   PackedFunc apply_penalty_func_;
   PackedFunc apply_bitmask_func_;
   PackedFunc alloc_embedding_tensor_func_;
+  PackedFunc cuda_graph_alloc_init_func_;
   PackedFunc create_kv_cache_func_;
   PackedFunc reset_kv_cache_func_;
   bool support_backtracking_kv_;

python/mlc_llm/compiler_pass/attach_cuda_graph_alloc_init_func.py (new file)

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
1+
"""The pass that attaches an empty function for initialization."""
2+
3+
import tvm
4+
from tvm import IRModule, relax
5+
6+
7+
@tvm.transform.module_pass(opt_level=0, name="AttachCUDAGraphAllocInitFunc")
8+
class AttachCUDAGraphAllocInitFunc: # pylint: disable=too-few-public-methods
9+
"""Attach an empty function for initialization."""
10+
11+
def __init__(self):
12+
pass
13+
14+
def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
15+
"""Entrypoint"""
16+
bb = relax.BlockBuilder(mod)
17+
alloc_func_gv = None
18+
for gv, _ in mod.functions_items():
19+
if gv.name_hint.startswith("cuda_graph_alloc"):
20+
assert alloc_func_gv is None
21+
alloc_func_gv = gv
22+
if alloc_func_gv is None:
23+
return mod
24+
25+
with bb.function("cuda_graph_alloc_init", []):
26+
bb.emit_func_output(
27+
relax.op.call_builtin_with_ctx(
28+
"vm.builtin.cuda_graph.get_cached_alloc",
29+
args=[alloc_func_gv, relax.PrimValue(0)],
30+
sinfo_args=relax.ObjectStructInfo(),
31+
)
32+
)
33+
return bb.finalize()
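
As a usage note (not from the commit), the pass can also be applied standalone; a minimal sketch, using an empty placeholder IRModule to show the no-op path and assuming the mlc_llm package layout implied by the import in pipeline.py below:

    import tvm
    from mlc_llm.compiler_pass.attach_cuda_graph_alloc_init_func import AttachCUDAGraphAllocInitFunc

    # Placeholder module for illustration; in the real pipeline this is the
    # model IRModule right after tvm.relax.transform.RewriteCUDAGraph() has run.
    mod = tvm.IRModule()

    # With no "cuda_graph_alloc" function present, the pass returns the module
    # unchanged; when the CUDA graph rewrite did produce one, the pass attaches
    # a zero-argument "cuda_graph_alloc_init" function for the runtime to call.
    mod = AttachCUDAGraphAllocInitFunc()(mod)
    print([gv.name_hint for gv in mod.get_global_vars()])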

python/mlc_llm/compiler_pass/pipeline.py

Lines changed: 2 additions & 1 deletion
@@ -12,6 +12,7 @@
 from mlc_llm.interface.compiler_flags import IPCAllReduceStrategyType
 from mlc_llm.support import logging

+from .attach_cuda_graph_alloc_init_func import AttachCUDAGraphAllocInitFunc
 from .attach_embedding_allocator import AttachAllocEmbeddingTensorFunc
 from .attach_logit_processor import AttachLogitProcessFunc
 from .attach_sampler import AttachGPUSamplingFunc
@@ -159,7 +160,6 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule:
             ),
             ScatterTupleGetItem(),
             PipelineParallelRewrite(),
-            _DebugDump("after-pipeline-rewrite.py", debug_dump, show_meta=False),
             tvm.relax.transform.RewriteDataflowReshape(),
             tvm.relax.transform.ToNonDataflow(),
             tvm.relax.transform.RemovePurityChecking(),
@@ -172,6 +172,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule:
             tvm.relax.transform.StaticPlanBlockMemory(),
             AttachMetadataWithMemoryUsage(metadata),
             tvm.relax.transform.RewriteCUDAGraph(),
+            AttachCUDAGraphAllocInitFunc(),
             tvm.relax.transform.LowerGPUIPCAllocStorage(),
             tvm.relax.transform.LowerAllocTensor(),
             tvm.relax.transform.KillAfterLastUse(),
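
A hedged sketch of the ordering this placement relies on (a reduced Sequential, not the full MLC pipeline, assuming a TVM build that provides the same transforms the pipeline above uses): the attach pass runs after RewriteCUDAGraph, since that rewrite is what creates the "cuda_graph_alloc" function the pass searches for.

    import tvm
    from mlc_llm.compiler_pass.attach_cuda_graph_alloc_init_func import AttachCUDAGraphAllocInitFunc

    # Reduced ordering sketch; apply `seq` to the model IRModule at this point
    # in the pipeline.
    seq = tvm.transform.Sequential(
        [
            tvm.relax.transform.RewriteCUDAGraph(),       # creates "cuda_graph_alloc"
            AttachCUDAGraphAllocInitFunc(),               # attaches "cuda_graph_alloc_init"
            tvm.relax.transform.LowerGPUIPCAllocStorage(),
        ]
    )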
