@@ -192,68 +192,74 @@ def use_deepep_ll_kernels(self):
     def make(tp_size_: int, dp_size_: int,
              vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
         """
-        Determine MoE parallel configuration. Based on the input tp_size_,
-        dp_size_, ep_size_ and vllm's parallel config, determine what
+        Determine MoE parallel configuration. Based on the input `tp_size_`,
+        `dp_size_` and vllm's parallel config, determine what
         levels of parallelism to use in the fused moe layer.

         Args:
-            tp_size_ (int): tp_size passed into the FusedMoE constructor.
-            dp_size_ (int): dp_size passed into the FusedMoE constructor.
-            ep_size_ (int): ep_size passed into the FusedMoE constructor.
-            vllm_parallel_config (ParallelConfig): vllm's parallel config
-                object.
+            tp_size_ (int): `tp_size` passed into the FusedMoE constructor.
+            dp_size_ (int): `dp_size` passed into the FusedMoE constructor.
+            vllm_parallel_config (ParallelConfig): vLLM's parallel config
+                object which contains the `enable_expert_parallel` flag.

         Examples:
-            When there is no parallelism requested, i.e. tp_size_ = dp_size_ = 1,
-            we simply return the sizes unaltered and the ranks set to 0.
+            When there is no parallelism requested,
+            i.e. `tp_size_` = `dp_size_` = 1, we simply return the sizes
+            unaltered and the ranks set to 0.

-            Expert Parallelism is considered only when either dp_size_ or tp_size_
-            is non trivial.
+            Expert Parallelism is considered only when either `dp_size_` or
+            `tp_size_` is non trivial.
+
+            When TP = 2, DP = 1 and EP = False, the configuration on different
+            devices:

-            When TP = 2, DP = 1 and EP = False, the configuration on different
-            devices,
             - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
-                         legend : {size, rank}
+                legend : {size, rank}
             - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
             - Comment : Tensors are sharded across 2 devices.

-            When TP = 1, DP = 2 and EP = False, the configuration on different
-            devices,
+            When TP = 1, DP = 2 and EP = False, the configuration on different
+            devices:
+
             - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
             - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
             - Comment: There are 2 engine instances and the tensors are sharded
-              across 2 devices.
+                across 2 devices.
+
+            When TP = 2, DP = 2 and EP = False, the configuration on different
+            devices:

-            When TP = 2, DP = 2 and EP = False, the configuration on different
-            devices,
             - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
             - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
             - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
             - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
             - Comment: There are 2 engine instances and the tensors are sharded
-              across 4 devices.
+                across 4 devices.
+
+            When TP = 2, DP = 1 and EP = True, the configuration on different
+            devices:

-            When TP = 2, DP = 1 and EP = True, the configuration on different
-            devices,
             - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
             - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
             - Comment: The experts are split between the 2 devices.

-            When TP = 1, DP = 2 and EP = True, the configuration on different
-            devices,
+            When TP = 1, DP = 2 and EP = True, the configuration on different
+            devices:
+
             - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
             - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
             - Comment: There are 2 engine instances and the experts are split
-              between the 2 devices.
+                between the 2 devices.
+
+            When TP = 2, DP = 2 and EP = True, the configuration on different
+            devices:

-            When TP = 2, DP = 2 and EP = True, the configuration on different
-            devices,
             - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
             - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
             - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
             - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
             - Comment: There are 2 engine instances and the experts are split
-              between the 4 devices.
         """

         def flatten_tp_across_dp(dp_rank: int):
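The docstring's examples pin down the mapping: the TP and DP ranks are flattened into a single rank across all tp_size_ * dp_size_ devices; with EP disabled that flattened group becomes the tensor-parallel group and the experts stay replicated (EP = {1, 0}), while with EP enabled it becomes the expert-parallel group and TP collapses to {1, 0}. The sketch below reproduces the {size, rank} tuples from the examples. It is illustrative only, not vLLM's actual implementation: the function name, the ParallelTuple holder, and the explicit tp_rank_/dp_rank_ arguments are assumptions made for this example.

from dataclasses import dataclass

@dataclass
class ParallelTuple:
    """A {size, rank} pair, matching the legend used in the docstring."""
    size: int
    rank: int

def sketch_moe_parallel_config(tp_size_: int, tp_rank_: int,
                               dp_size_: int, dp_rank_: int,
                               enable_expert_parallel: bool):
    """Illustrative sketch of the docstring examples; not vLLM's code."""
    # Flatten the TP ranks across DP replicas: e.g. TP = 2, DP = 2 yields
    # flattened ranks 0..3 over 4 devices.
    flat_size = tp_size_ * dp_size_
    flat_rank = dp_rank_ * tp_size_ + tp_rank_
    if not enable_expert_parallel:
        # EP disabled: tensors are sharded across all devices (TP grows to
        # the flattened size), experts are replicated, so EP stays {1, 0}.
        return (ParallelTuple(flat_size, flat_rank),   # TP
                ParallelTuple(dp_size_, dp_rank_),     # DP
                ParallelTuple(1, 0))                   # EP
    # EP enabled: experts are split across all devices (EP grows to the
    # flattened size) and tensor parallelism collapses to {1, 0}.
    return (ParallelTuple(1, 0),                       # TP
            ParallelTuple(dp_size_, dp_rank_),         # DP
            ParallelTuple(flat_size, flat_rank))       # EP

# "TP = 2, DP = 2 and EP = True" example, device 2:
# expect TP = {1, 0}, DP = {2, 1}, EP = {4, 2}.
print(sketch_moe_parallel_config(tp_size_=2, tp_rank_=0,
                                 dp_size_=2, dp_rank_=1,
                                 enable_expert_parallel=True))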