
Commit b9e2563

eternalNight authored and amaurya committed
DeepCompile: Fuse allgather and downcast (deepspeedai#7588)
With autocast enabled, most weights are downcast before being used in calculations. Today zero3_compile gathers the FP32 weights before they are downcast. That is sub-optimal: FP32 weights consume more bandwidth to allgather and take more time to downcast. To reduce communication and downcast time, this PR fuses allgather and downcast in the dc ops. The target type is now passed to allgather_param() and prefetch_params_fused(), which downcast the (partial) weights before launching allgathers. This corresponds to issue 1 of deepspeedai#7577.

Tested with https://gist.github.com/eternalNight/3c2cf8c703f1e9e7742d3b7f9e1edae3 (run with `deepspeed --num_gpus=N this_file.py -c -p -m 23` to collect torch and memory profiles, and with DINOV2_DEPTH = SIGLIP_DEPTH = 3, LLAMA2_DEPTH = 4 for faster compilation) on 5090 (which has limited inter-GPU bandwidth): time per step decreases from 438ms to 337ms and peak GPU memory usage from 9.5GB to 8.5GB.

Profiles of a single step before this PR (torch and memory profile screenshots):
https://github.com/user-attachments/assets/d9fe5296-7731-4542-924b-421ff7415054
https://github.com/user-attachments/assets/aa192802-8633-4e36-b2c4-f28b1b432663

After this PR:
https://github.com/user-attachments/assets/18a0e09c-155b-4783-adb5-b4d36c5c3691
https://github.com/user-attachments/assets/16a2ca74-8a89-4db9-9b68-81844295c61b

This PR also reduces peak memory usage because `fast_free_schedule()` today always arranges param allgathers and downcasts at the beginning of the graph. While the original FP32 params can be freed early, all FP16/BF16-casted params are kept in GPU memory at the beginning of the backward graph, leading to a higher peak in memory usage.

P.S. Probably due to organization branch rule settings, I can't find anywhere to allow reviewers to modify the branch, so I'll update the branch per reviewers' comments and rebase if needed.

Signed-off-by: Junjie Mao <[email protected]>
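To see why the fusion is numerically safe, here is a single-process sketch (illustration only, not code from this PR) in which concatenation stands in for the NCCL all-gather: casting each shard before "gathering" gives the same BF16 result as casting the fully gathered tensor, while moving half as many bytes in the FP32-to-BF16 case.

import torch

def allgather_then_downcast(shards, dtype):
    # Baseline: gather the FP32 shards, then downcast the full tensor.
    return torch.cat(shards).to(dtype)

def downcast_then_allgather(shards, dtype):
    # Fused behavior: downcast each shard first, then gather.
    # For FP32 -> BF16 this halves the bytes crossing the interconnect.
    return torch.cat([s.to(dtype) for s in shards])

shards = [torch.randn(1024) for _ in range(4)]
a = allgather_then_downcast(shards, torch.bfloat16)
b = downcast_then_allgather(shards, torch.bfloat16)
assert torch.equal(a, b)  # elementwise casts commute with concatenation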
1 parent 5be0b0c commit b9e2563

File tree

9 files changed: +156 -39 lines changed


csrc/compile/init.cpp

Lines changed: 4 additions & 2 deletions
@@ -10,8 +10,10 @@
 TORCH_LIBRARY(dc, m)
 {
-    m.def("allgather_param(Tensor a, int graph_id, int id) -> Tensor");
-    m.def("prefetch_params_fused(int graph_id, Tensor[] params, int[] ids) -> ()");
+    m.def("allgather_param(Tensor a, int graph_id, int id, ScalarType? dtype = None) -> Tensor");
+    m.def(
+        "prefetch_params_fused(int graph_id, Tensor[] params, int[] ids,"
+        " ScalarType[]? dtypes = None) -> ()");
     m.def("wait_allgather(Tensor(a) a, int graph_id, int id) -> Tensor(a)");
     m.def("release_param(Tensor(a) a, int graph_id, int id, int n_users) -> Tensor(a)");
     m.def("reduce_grad(Tensor a, int graph_id, int id) -> Tensor");

csrc/compile/z3.cpp

Lines changed: 49 additions & 21 deletions
@@ -68,7 +68,12 @@ class Z3CustomOpExecutor : public CustomOpExecutor {
                           c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> symm_mem)
     {
         const DSParam& param = param_registry_->getParam(ds_id);
-        const at::Tensor& ds_tensor = param.getDSTensor();
+        at::Tensor ds_tensor = param.getDSTensor();
+
+        if (ds_tensor.scalar_type() != output_buf.scalar_type()) {
+            at::cuda::CUDAStreamGuard guard(ag_stream_);
+            ds_tensor = ds_tensor.to(output_buf.scalar_type(), true, true);
+        }

         if (symm_mem == nullptr) {
             // Fast path: assume uniform shard sizes (ZeRO-3 partitions are padded to uniform size)
@@ -110,6 +115,7 @@ class Z3CustomOpExecutor : public CustomOpExecutor {
     }

     at::Tensor allgatherParam(long ds_id,
+                              std::optional<at::ScalarType> dtype,
                               c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> symm_mem)
     {
         const DSParam& param = param_registry_->getParam(ds_id);
@@ -118,11 +124,16 @@ class Z3CustomOpExecutor : public CustomOpExecutor {
         const int64_t true_numel = static_cast<int64_t>(productDim(param.getShape()));
         const int64_t padded_per_rank = (true_numel + world_size - 1) / world_size;
         const int64_t padded_numel = static_cast<int64_t>(world_size) * padded_per_rank;
+        at::ScalarType target_dtype = dtype ? dtype.value() : ds_tensor.scalar_type();

         if (param_registry_->isValid(ds_id)) {
             // Return a view sliced to the true size with the original shape
+            //
+            // Persistent params are gathered in their original dtype which may
+            // be different from the requested.
             auto base = param_registry_->getGatheredParam(ds_id);
             return base.flatten()
+                .to(target_dtype)
                 .index({torch::indexing::Slice(0, true_numel)})
                 .view(param.getShape());
         }
@@ -134,7 +145,7 @@ class Z3CustomOpExecutor : public CustomOpExecutor {
         }
         if (!output_buf.defined()) {
             at::cuda::CUDAStreamGuard guard(ag_stream_);
-            output_buf = torch::empty({padded_numel}, ds_tensor.options());
+            output_buf = torch::empty({padded_numel}, ds_tensor.options().dtype(target_dtype));
         }

         assert(hasKey(ag_comp_done_events_, ds_id));
@@ -150,16 +161,20 @@ class Z3CustomOpExecutor : public CustomOpExecutor {
             .view(param.getShape());
     }

-    void prefetchParamsFused(std::vector<int64_t> ds_ids,
+    void prefetchParamsFused(const std::vector<long>& ds_ids,
+                             const std::optional<std::vector<at::ScalarType>> dtypes,
                              c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> symm_mem)
     {
-        std::vector<int64_t> invalid_ds_ids;
-        for (const auto& ds_id : ds_ids) {
-            if (!param_registry_->isValid(ds_id)) { invalid_ds_ids.push_back(ds_id); }
+        std::vector<std::tuple<long, std::optional<at::ScalarType>>> invalid_params;
+        for (int i = 0; i < ds_ids.size(); i++) {
+            if (!param_registry_->isValid(ds_ids[i])) {
+                auto dtype = dtypes ? dtypes.value()[i] : std::optional<at::ScalarType>();
+                invalid_params.push_back(std::make_tuple(ds_ids[i], dtype));
+            }
         }

         std::unordered_map<long, at::Tensor> output_bufs;
-        for (long ds_id : invalid_ds_ids) {
+        for (const auto& [ds_id, dtype] : invalid_params) {
             const DSParam& param = param_registry_->getParam(ds_id);
             const at::Tensor& ds_tensor = param.getDSTensor();
             const int world_size = process_group_->getSize();
@@ -173,22 +188,26 @@ class Z3CustomOpExecutor : public CustomOpExecutor {
                     continue;
                 }
             }
-            output_bufs[ds_id] = torch::empty({padded_numel}, ds_tensor.options());
+            auto target_dtype = dtype ? dtype.value() : ds_tensor.scalar_type();
+            output_bufs[ds_id] =
+                torch::empty({padded_numel}, ds_tensor.options().dtype(target_dtype));
         }

-        for (long ds_id : invalid_ds_ids) {
+        for (const auto& [ds_id, _] : invalid_params) {
             ag_comp_done_events_[ds_id]->record();
             ag_comp_done_events_[ds_id]->block(ag_stream_);
         }

         ncclGroupStart();
-        for (long ds_id : invalid_ds_ids) {
+        for (const auto& [ds_id, _] : invalid_params) {
             assert(hasKey(output_bufs, ds_id));
             launchAllGather(output_bufs.at(ds_id), ds_id, symm_mem);
         }
         ncclGroupEnd();

-        for (long ds_id : invalid_ds_ids) { ag_comm_done_events_[ds_id]->record(ag_stream_); }
+        for (const auto& [ds_id, _] : invalid_params) {
+            ag_comm_done_events_[ds_id]->record(ag_stream_);
+        }
     }

     void releaseParam(long ds_id, long n_users)
@@ -458,12 +477,15 @@ void register_z3_param(long ds_id,
     }
 }

-at::Tensor allgather_param(at::Tensor param_tensor, long graph_id, long ds_id)
+at::Tensor allgather_param(at::Tensor param_tensor,
+                           long graph_id,
+                           long ds_id,
+                           std::optional<at::ScalarType> dtype)
 {
     auto executor = getExecutor<Z3CustomOpExecutor>(graph_id, executors);

     if (sync_before_allgather) { c10::cuda::device_synchronize(); }
-    auto ret = executor->allgatherParam(ds_id, symm_mem);
+    auto ret = executor->allgatherParam(ds_id, dtype, symm_mem);
     if (sync_after_allgather) { c10::cuda::device_synchronize(); }
     return ret;
 }
@@ -477,22 +499,25 @@ void set_persistent(long ds_id)
     for (auto& it : executors) {
         if (it.second->hasParam(ds_id)) {
             auto executor = getExecutor<Z3CustomOpExecutor>(it.first, executors);
-            executor->allgatherParam(ds_id, symm_mem);
+            auto dtype = param_registry->getParam(ds_id).getDtype();
+            executor->allgatherParam(ds_id, dtype, symm_mem);
         }
     }
 }

 void prefetch_params_fused(long graph_id,
-                           const std::vector<at::Tensor> params,
-                           const std::vector<long>& ds_ids)
+                           const std::vector<at::Tensor>& params,
+                           const std::vector<long>& ds_ids,
+                           const std::optional<std::vector<at::ScalarType>>& dtypes)
 {
     auto executor = getExecutor<Z3CustomOpExecutor>(graph_id, executors);
-    executor->prefetchParamsFused(ds_ids, symm_mem);
+    executor->prefetchParamsFused(ds_ids, dtypes, symm_mem);
 }

 void prefetch_params_fused_meta(long graph_id,
-                                const std::vector<at::Tensor> params,
-                                const std::vector<long>& ds_ids)
+                                const std::vector<at::Tensor>& params,
+                                const std::vector<long>& ds_ids,
+                                const std::optional<std::vector<at::ScalarType>>& dtypes)
 {
 }
@@ -518,11 +543,14 @@ void clear_all_gathered_params()
     }
 }

-at::Tensor allgather_param_meta(at::Tensor param_tensor, long graph_id, long ds_id)
+at::Tensor allgather_param_meta(at::Tensor param_tensor,
+                                long graph_id,
+                                long ds_id,
+                                std::optional<at::ScalarType> dtype)
 {
     const DSParam& param = param_registry->getParam(ds_id);
     auto options = param.getDSTensor().options().device(c10::kMeta);
-    at::Tensor output_buf = torch::empty(param.getShape(), options);
+    at::Tensor output_buf = torch::empty(param.getShape(), options.dtype(dtype));
     return output_buf;
 }
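A simplified Python rendering (assumptions: single process, helper name invented for illustration) of how allgatherParam now sizes its output: the true numel is padded to a multiple of world_size, and the gather buffer is allocated directly in the requested dtype so the downcast happens on the shard rather than on the gathered FP32 tensor.

import torch

def make_gather_buffer(shape, world_size, shard_dtype, target_dtype=None):
    # Pad the true numel so every rank contributes a shard of equal length,
    # and allocate the buffer in the target dtype (falling back to the shard's).
    true_numel = 1
    for d in shape:
        true_numel *= d
    padded_per_rank = (true_numel + world_size - 1) // world_size
    padded_numel = padded_per_rank * world_size
    dtype = target_dtype if target_dtype is not None else shard_dtype
    return torch.empty(padded_numel, dtype=dtype), true_numel

buf, true_numel = make_gather_buffer((1000, 13), world_size=8,
                                     shard_dtype=torch.float32,
                                     target_dtype=torch.bfloat16)
# After the all-gather, the first true_numel elements are viewed back to the shape.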

csrc/compile/z3.h

Lines changed: 14 additions & 6 deletions
@@ -21,18 +21,26 @@ void register_z3_param(long ds_id,
                        at::Tensor ds_tensor,
                        at::Tensor grad_buffer,
                        bool persistent);
-at::Tensor allgather_param(at::Tensor param_tensor, long graph_id, long ds_id);
+at::Tensor allgather_param(at::Tensor param_tensor,
+                           long graph_id,
+                           long ds_id,
+                           std::optional<at::ScalarType> dtype);
 void set_persistent(long ds_id);
 void prefetch_params_fused(long graph_id,
-                           const std::vector<at::Tensor> params,
-                           const std::vector<long>& ds_ids);
+                           const std::vector<at::Tensor>& params,
+                           const std::vector<long>& ds_ids,
+                           const std::optional<std::vector<at::ScalarType>>& dtypes);
 void prefetch_params_fused_meta(long graph_id,
-                                const std::vector<at::Tensor> params,
-                                const std::vector<long>& ds_ids);
+                                const std::vector<at::Tensor>& params,
+                                const std::vector<long>& ds_ids,
+                                const std::optional<std::vector<at::ScalarType>>& dtypes);
 // for profiling
 void invalidate_gathered_param(long ds_id);
 void clear_all_gathered_params();
-at::Tensor allgather_param_meta(at::Tensor param_tensor, long graph_id, long ds_id);
+at::Tensor allgather_param_meta(at::Tensor param_tensor,
+                                long graph_id,
+                                long ds_id,
+                                std::optional<at::ScalarType> dtype);
 at::Tensor release_param(at::Tensor dummy, long graph_id, long ds_id, long n_users);
 at::Tensor release_param_meta(at::Tensor dummy, long graph_id, long ds_id, long n_users);
 at::Tensor wait_allgather(at::Tensor v, long graph_id, const long ds_id);

csrc/includes/deepcompile.h

Lines changed: 4 additions & 0 deletions
@@ -18,6 +18,7 @@
 #include <c10/cuda/CUDAStream.h>
 #include <torch/csrc/cuda/nccl.h>
 #include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
+#include <torch/csrc/distributed/c10d/ParamCommsUtils.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>

 #if __has_include(<torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>)
@@ -261,6 +262,7 @@ class DSParam {
         : id_(id),
           shape_(std::move(ds_shape)),
           ds_tensor_(ds_tensor),
+          ds_dtype_(ds_tensor.scalar_type()),
           grad_buffer_(grad_buffer),
           partitioned_(partitioned),
           offset_(offset),
@@ -272,6 +274,7 @@ class DSParam {

     long getId() const { return id_; }
     std::vector<int64_t> getShape() const { return shape_; }
+    at::ScalarType getDtype() const { return ds_dtype_; }
     at::Tensor getDSTensor() const
     {
         // If the reload event exists and is complete, return the reloaded tensor (if defined)
@@ -343,6 +346,7 @@ class DSParam {
 private:
     long id_;
     std::vector<int64_t> shape_;
+    at::ScalarType ds_dtype_;
     at::Tensor ds_tensor_;
     at::Tensor ds_reload_tensor_;
     at::Tensor grad_buffer_;

deepspeed/compile/fx.py

Lines changed: 4 additions & 3 deletions
@@ -3,7 +3,7 @@

 # DeepSpeed Team

-from typing import Callable, Any, List
+from typing import Callable, Any, List, Dict
 from collections import defaultdict

 import torch
@@ -60,7 +60,8 @@ def add_args_process(graph: Graph,
 def add_postprocess(graph: Graph,
                     node: Node,
                     fn: Callable[..., Any],
-                    extra_args: List[int] = [],
+                    extra_args: List[Any] = [],
+                    extra_kwargs: Dict[str, Any] = {},
                     name=None,
                     meta={}) -> Node:
     # https://github.com/pytorch/examples/blob/main/fx/wrap_output_dynamically.py
@@ -70,7 +71,7 @@ def add_postprocess(graph: Graph,
         args += (a, )

     node_users = node.users.keys()
-    new_node = graph.create_node('call_function', fn, args, {}, name=name)
+    new_node = graph.create_node('call_function', fn, args, extra_kwargs, name=name)
     users = {}
     for u in node_users:
         if u != new_node:
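The new extra_kwargs parameter simply forwards keyword arguments to graph.create_node; zero3_compile.py below uses it to attach the target dtype to the allgather op. A toy standalone FX example (module and op chosen for illustration only, not DeepSpeed code) of inserting a post-processing call_function node that carries a kwarg:

import torch
from torch import fx

class M(torch.nn.Module):
    def forward(self, x):
        return x.relu()

gm = fx.symbolic_trace(M())
g = gm.graph
relu = next(n for n in g.nodes if n.op == "call_method" and n.target == "relu")

# Create the post-processing node with a keyword argument, then rewire
# relu's other users to consume the new node instead.
with g.inserting_after(relu):
    cast = g.create_node("call_function", torch.ops.aten.to.dtype, (relu, ), {"dtype": torch.bfloat16})
for user in list(relu.users):
    if user is not cast:
        user.replace_input_with(relu, cast)
gm.recompile()

assert gm(torch.randn(4)).dtype == torch.bfloat16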

deepspeed/compile/passes/zero3_compile.py

Lines changed: 28 additions & 6 deletions
@@ -10,7 +10,7 @@
 import torch
 from torch.fx import Graph, Node, GraphModule

-from ..util import get_input_nodes, get_param_nodes, get_index_by_graph_id, get_deepcompile_handle, get_real_uses
+from ..util import get_input_nodes, get_param_nodes, get_index_by_graph_id, get_deepcompile_handle, get_real_uses, is_cast_op
 from ..fx import add_postprocess, _make_node_meta, get_output_node, move_primals_to_head
 from ..profilers.graph_profile import ProfilingInterpreter
 from ..list_schedule import fast_free_schedule
@@ -21,14 +21,15 @@
 NAME = "zero3_compile"


-def add_allgather(graph_id: int, graph: Graph, node: Node, ds_id: int):
+def add_allgather(graph_id: int, graph: Graph, node: Node, ds_id: int, dtype: torch.dtype):
     new_ag_node = add_postprocess(graph,
                                   node,
                                   torch.ops.dc.allgather_param.default,
                                   extra_args=[graph_id, ds_id],
+                                  extra_kwargs={"dtype": dtype},
                                   name=f"allgather_ds_param_{node.target}_{ds_id}",
                                   meta=_make_node_meta(node, ds_id, True))
-    new_ag_node.meta["val"] = node.meta["val"]
+    new_ag_node.meta["val"] = node.meta["val"].to(dtype)

     # Set the previous node back to output
     # We don't want to change the output node to allgather
@@ -42,7 +43,7 @@ def add_allgather(graph_id: int, graph: Graph, node: Node, ds_id: int):
                                     extra_args=[graph_id, ds_id],
                                     name=f"wait_allgather_ds_param__{node.target}_{ds_id}",
                                     meta=_make_node_meta(node, ds_id, False))
-    new_wait_node.meta["val"] = node.meta["val"]
+    new_wait_node.meta["val"] = new_ag_node.meta["val"]

     return new_ag_node

@@ -74,9 +75,30 @@ def add_gather_and_release(graph_id: int, graph: Graph, param_manager, param_nod
         if len(pn.users) == 0:
             continue

-        add_allgather(graph_id, graph, pn, param_manager.ds_ids[pn.name])
+        # If the only use of the parameter is a type-cast to a smaller type, fuse it with all-gather.
+        fuse_typecast = False
+        target_dtype = param_manager.params[pn.name].dtype
+        if len([user for user in pn.users if user.op != "output"]) == 1:
+            typecast_node = next(iter(pn.users))
+
+            is_cast, casted_dtype = is_cast_op(typecast_node)
+            if is_cast and casted_dtype.itemsize < target_dtype.itemsize:
+                fuse_typecast = True
+                target_dtype = casted_dtype
+
+        add_allgather(graph_id, graph, pn, param_manager.ds_ids[pn.name], target_dtype)
+        if fuse_typecast:
+            users = node_to_uses[typecast_node]
+            wait_node = typecast_node.args[0]
+            for user in list(typecast_node.users.keys()):
+                if user.op == "output":
+                    wait_node.meta["original_output_name"] = typecast_node.name
+                user.replace_input_with(typecast_node, wait_node)
+            graph.erase_node(typecast_node)
+        else:
+            users = node_to_uses[pn]
+
         ds_id = param_manager.ds_ids[pn.name]
-        users = node_to_uses[pn]
         for user in users:
             # release_param() only accepts tensors as its first argument. If
             # `user` is a tuple, we should release the param after any of
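For reference, here is a loose standalone rendering of the fusion check (assumptions: the cast node exposes its target dtype as a `dtype` kwarg; in the real pass this is handled by is_cast_op() from deepspeed.compile.util):

import torch
from torch.fx import Node

def find_fusable_downcast(param_node: Node, param_dtype: torch.dtype):
    # Fuse only when the parameter's single non-output user is a cast to a
    # strictly smaller dtype; otherwise gather in the original dtype.
    users = [u for u in param_node.users if u.op != "output"]
    if len(users) != 1:
        return None, param_dtype
    cast = users[0]
    cast_dtype = cast.kwargs.get("dtype")  # simplification of is_cast_op()
    if cast_dtype is not None and cast_dtype.itemsize < param_dtype.itemsize:
        return cast, cast_dtype
    return None, param_dtype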

deepspeed/compile/profilers/graph_profile.py

Lines changed: 10 additions & 0 deletions
@@ -130,9 +130,15 @@ def run_node(self, n: torch.fx.Node) -> Any:
         assert isinstance(args, tuple)
         assert isinstance(kwargs, dict)

+        partitioned_params = {}
+
         def rebuild_param_if_necessary(v):
             if hasattr(v, "ds_id"):
                 v.all_gather(param_list=[v])
+                if hasattr(v, "ds_target_dtype"):
+                    casted = v.to(v.ds_target_dtype)
+                    partitioned_params[id(casted)] = v
+                    return casted
             return v

         args = map_aggregate(args, lambda x: rebuild_param_if_necessary(x))
@@ -191,6 +197,8 @@ def rebuild_param_if_necessary(v):
         tensor_size = _node_size(out)

         def partition_param_if_necessary(v):
+            if id(v) in partitioned_params:
+                v = partitioned_params[id(v)]
             if hasattr(v, "ds_id") and not v.ds_persist:
                 v.partition(param_list=[v], has_been_updated=False)
             return v
@@ -227,6 +235,8 @@ def partition_param_if_necessary(v):
             assert hasattr(out, "ds_id")
             if not out.ds_persist:
                 self.nz3.invalidate_gathered_param(args[2])
+            if "dtype" in n.kwargs:
+                setattr(out, "ds_target_dtype", n.kwargs["dtype"])
             self.allgather_mem[out.ds_id] = n.meta["alloc_mem"]

         return out
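The profiler change is a small bookkeeping pattern: when a gathered ZeRO-3 param carries a ds_target_dtype, the node runs on a casted copy, and an id()-keyed map lets the original param be re-partitioned afterwards. A minimal standalone sketch of that pattern (invented helper names, not the actual ProfilingInterpreter code):

import torch

partitioned_params = {}

def swap_in(param, target_dtype):
    # Hand the node a casted copy, remembering which original it came from.
    casted = param.to(target_dtype)
    partitioned_params[id(casted)] = param
    return casted

def swap_out(value):
    # Map a casted copy back to the original param so it can be re-partitioned.
    return partitioned_params.get(id(value), value)

p = torch.randn(8, dtype=torch.float32)
c = swap_in(p, torch.bfloat16)
assert swap_out(c) is p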

0 commit comments
