@@ -67,7 +67,7 @@ To use Sharded Training, you need to first install FairScale using the command b
 .. code-block:: python

     # train using Sharded DDP
-    trainer = Trainer(plugins='ddp_sharded')
+    trainer = Trainer(plugins="ddp_sharded")

 Sharded Training can work across all DDP variants by adding the additional ``--plugins ddp_sharded`` flag.

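For orientation, the sketch below shows one way the ``ddp_sharded`` plugin can be layered on top of a DDP variant through the Trainer arguments. ``MyModel`` is a placeholder LightningModule, and the exact ``accelerator`` value (here ``ddp_spawn``) is an assumption that depends on the Lightning version in use.

.. code-block:: python

    from pytorch_lightning import Trainer

    model = MyModel()  # hypothetical LightningModule

    # The accelerator picks the DDP flavour; the plugin adds ZeRO-style sharding on top.
    trainer = Trainer(gpus=2, accelerator="ddp_spawn", plugins="ddp_sharded", precision=16)
    trainer.fit(model)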
@@ -123,46 +123,32 @@ Below is an example of using both ``wrap`` and ``auto_wrap`` to create your mode
     from pytorch_lightning import Trainer
     from fairscale.nn import checkpoint_wrapper, auto_wrap, wrap

+
     class MyModel(pl.LightningModule):
         ...
+
         def configure_sharded_model(self):
             # Created within sharded model context, modules are instantly sharded across processes
             # as soon as they are wrapped with ``wrap`` or ``auto_wrap``

-            # Wraps the layer in a Fully Sharded Wrapper automatically
+            # Wraps the layer in a Fully Sharded Wrapper automatically
             linear_layer = wrap(nn.Linear(32, 32))

             # Wraps the module recursively
             # based on a minimum number of parameters (default 100M parameters)
-            block = auto_wrap(
-                nn.Sequential(
-                    nn.Linear(32, 32),
-                    nn.ReLU()
-                )
-            )
+            block = auto_wrap(nn.Sequential(nn.Linear(32, 32), nn.ReLU()))

             # For best memory efficiency,
             # add fairscale activation checkpointing
-            final_block = auto_wrap(
-                checkpoint_wrapper(
-                    nn.Sequential(
-                        nn.Linear(32, 32),
-                        nn.ReLU()
-                    )
-                )
-            )
-            self.model = nn.Sequential(
-                linear_layer,
-                nn.ReLU(),
-                block,
-                final_block
-            )
+            final_block = auto_wrap(checkpoint_wrapper(nn.Sequential(nn.Linear(32, 32), nn.ReLU())))
+            self.model = nn.Sequential(linear_layer, nn.ReLU(), block, final_block)

         def configure_optimizers(self):
             return torch.optim.AdamW(self.model.parameters())

+
     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='fsdp', precision=16)
+    trainer = Trainer(gpus=4, plugins="fsdp", precision=16)
     trainer.fit(model)

     trainer.test()
@@ -246,7 +232,7 @@ As a result, benefits can also be seen on a single GPU. Do note that the default
     from pytorch_lightning import Trainer

     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='deepspeed_stage_2', precision=16)
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_2", precision=16)
     trainer.fit(model)

 .. code-block:: bash
@@ -270,7 +256,7 @@ Below we show an example of running `ZeRO-Offload <https://www.deepspeed.ai/tuto
     from pytorch_lightning.plugins import DeepSpeedPlugin

     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='deepspeed_stage_2_offload', precision=16)
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_2_offload", precision=16)
     trainer.fit(model)


@@ -289,7 +275,9 @@ You can also modify the ZeRO-Offload parameters via the plugin as below.
     from pytorch_lightning.plugins import DeepSpeedPlugin

     model = MyModel()
-    trainer = Trainer(gpus=4, plugins=DeepSpeedPlugin(cpu_offload=True, allgather_bucket_size=5e8, reduce_bucket_size=5e8), precision=16)
+    trainer = Trainer(
+        gpus=4, plugins=DeepSpeedPlugin(cpu_offload=True, allgather_bucket_size=5e8, reduce_bucket_size=5e8), precision=16
+    )
     trainer.fit(model)


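As an aside on choosing these values: ``allgather_bucket_size`` and ``reduce_bucket_size`` are element counts, not bytes, so at 16-bit precision the ``5e8`` used above corresponds to roughly a 1 GB communication buffer per bucket. A quick back-of-envelope check:

.. code-block:: python

    # Rough buffer size implied by the bucket settings above, assuming fp16 (2 bytes per element).
    elements = 5e8
    print(f"{elements * 2 / 2**30:.2f} GiB per bucket")  # ~0.93 GiB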
@@ -310,14 +298,17 @@ For even more speed benefit, DeepSpeed offers an optimized CPU version of ADAM c
     from pytorch_lightning.plugins import DeepSpeedPlugin
     from deepspeed.ops.adam import DeepSpeedCPUAdam

+
     class MyModel(pl.LightningModule):
         ...
+
         def configure_optimizers(self):
             # DeepSpeedCPUAdam provides 5x to 7x speedup over torch.optim.adam(w)
             return DeepSpeedCPUAdam(self.parameters())

+
     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='deepspeed_stage_2_offload' precision=16)
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_2_offload", precision=16)
     trainer.fit(model)


@@ -347,13 +338,16 @@ Also please have a look at our :ref:`deepspeed-zero-stage-3-tips` which contains
     from pytorch_lightning.plugins import DeepSpeedPlugin
     from deepspeed.ops.adam import FusedAdam

+
     class MyModel(pl.LightningModule):
         ...
+
         def configure_optimizers(self):
             return FusedAdam(self.parameters())

+
     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='deepspeed_stage_3', precision=16)
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_3", precision=16)
     trainer.fit(model)

     trainer.test()
@@ -377,8 +371,10 @@ This reduces the time taken to initialize very large models, as well as ensure w
     from pytorch_lightning.plugins import DeepSpeedPlugin
     from deepspeed.ops.adam import FusedAdam

+
     class MyModel(pl.LightningModule):
         ...
+
         def configure_sharded_model(self):
             # Created within sharded model context, modules are instantly sharded across processes
             # as soon as they are made.
@@ -387,8 +383,9 @@ This reduces the time taken to initialize very large models, as well as ensure w
         def configure_optimizers(self):
             return FusedAdam(self.parameters())

+
     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='deepspeed_stage_3', precision=16)
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_3", precision=16)
     trainer.fit(model)

     trainer.test()
@@ -409,7 +406,7 @@ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to re

     # Enable CPU Offloading
     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='deepspeed_stage_3_offload', precision=16)
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_3_offload", precision=16)
     trainer.fit(model)

     # Enable CPU Offloading, and offload parameters to CPU
@@ -421,7 +418,7 @@ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to re
             offload_optimizer=True,
             offload_parameters=True,
         ),
-        precision=16
+        precision=16,
     )
     trainer.fit(model)

@@ -438,7 +435,7 @@ Additionally, DeepSpeed supports offloading to NVMe drives for even larger model

     # Enable CPU Offloading
     model = MyModel()
-    trainer = Trainer(gpus=4, plugins='deepspeed_stage_3_offload', precision=16)
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_3_offload", precision=16)
     trainer.fit(model)

     # Enable CPU Offloading, and offload parameters to CPU
@@ -449,12 +446,12 @@ Additionally, DeepSpeed supports offloading to NVMe drives for even larger model
             stage=3,
             offload_optimizer=True,
             offload_parameters=True,
-            remote_device='nvme',
-            offload_params_device='nvme',
-            offload_optimizer_device='nvme',
-            nvme_path='/local_nvme'
+            remote_device="nvme",
+            offload_params_device="nvme",
+            offload_optimizer_device="nvme",
+            nvme_path="/local_nvme",
         ),
-        precision=16
+        precision=16,
     )
     trainer.fit(model)

@@ -492,21 +489,17 @@ This saves memory when training larger models however requires using a checkpoin
     model = MyModel()


-    trainer = Trainer(
-        gpus=4,
-        plugins='deepspeed_stage_3_offload',
-        precision=16
-    )
+    trainer = Trainer(gpus=4, plugins="deepspeed_stage_3_offload", precision=16)

     # Enable CPU Activation Checkpointing
     trainer = Trainer(
         gpus=4,
         plugins=DeepSpeedPlugin(
             stage=3,
             cpu_offload=True,  # Enable CPU Offloading
-            cpu_checkpointing=True  # (Optional) offload activations to CPU
+            cpu_checkpointing=True,  # (Optional) offload activations to CPU
         ),
-        precision=16
+        precision=16,
     )
     trainer.fit(model)

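The hunk header above notes that this saves memory but requires using a checkpoint function inside the model. For readers who have not seen that pattern, here is a minimal sketch of what a checkpointed forward pass might look like; ``block_1`` and ``block_2`` are hypothetical submodules, and the sketch assumes ``deepspeed.checkpointing.checkpoint``, which recomputes the wrapped block's activations during the backward pass instead of storing them.

.. code-block:: python

    import deepspeed
    import torch.nn as nn
    import pytorch_lightning as pl


    class MyModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.block_1 = nn.Sequential(nn.Linear(32, 32), nn.ReLU())
            self.block_2 = nn.Linear(32, 2)

        def forward(self, x):
            # Activations of block_1 are recomputed during backward
            # instead of being kept in GPU (or CPU, with cpu_checkpointing) memory.
            x = deepspeed.checkpointing.checkpoint(self.block_1, x)
            return self.block_2(x)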
@@ -549,23 +542,23 @@ In some cases you may want to define your own DeepSpeed Config, to access all pa
                 "cuda_aware": True,
             },
         },
-        'scheduler': {
+        "scheduler": {
             "type": "WarmupLR",
             "params": {
                 "last_batch_iteration": -1,
                 "warmup_min_lr": 0,
                 "warmup_max_lr": 3e-5,
                 "warmup_num_steps": 100,
-            }
+            },
         },
         "zero_optimization": {
-            "stage": 2, # Enable Stage 2 ZeRO (Optimizer/Gradient state partitioning)
-            "cpu_offload": True, # Enable Offloading optimizer state/calculation to the host CPU
-            "contiguous_gradients": True, # Reduce gradient fragmentation.
-            "overlap_comm": True, # Overlap reduce/backward operation of gradients for speed.
-            "allgather_bucket_size": 2e8, # Number of elements to all gather at once.
-            "reduce_bucket_size": 2e8, # Number of elements we reduce/allreduce at once.
-        }
+            "stage": 2,  # Enable Stage 2 ZeRO (Optimizer/Gradient state partitioning)
+            "cpu_offload": True,  # Enable Offloading optimizer state/calculation to the host CPU
+            "contiguous_gradients": True,  # Reduce gradient fragmentation.
+            "overlap_comm": True,  # Overlap reduce/backward operation of gradients for speed.
+            "allgather_bucket_size": 2e8,  # Number of elements to all gather at once.
+            "reduce_bucket_size": 2e8,  # Number of elements we reduce/allreduce at once.
+        },
     }

     model = MyModel()
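A custom config dict like the one above is only useful once it reaches the plugin. A minimal sketch of wiring it in, assuming this Lightning version's ``DeepSpeedPlugin`` accepts a ``config`` argument (a dict or a path to a DeepSpeed JSON file):

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import DeepSpeedPlugin

    model = MyModel()  # placeholder LightningModule from the surrounding docs
    # Hand the custom DeepSpeed config to the plugin; precision and gpus stay on the Trainer.
    trainer = Trainer(gpus=4, plugins=DeepSpeedPlugin(config=deepspeed_config), precision=16)
    trainer.fit(model)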
@@ -634,8 +627,8 @@ Enable `FP16 Compress Hook for multi-node throughput improvement <https://pytorc
     from pytorch_lightning import Trainer
     from pytorch_lightning.plugins import DDPPlugin
     from torch.distributed.algorithms.ddp_comm_hooks import (
-        default_hooks as default,
-        powerSGD_hook as powerSGD,
+        default_hooks as default,
+        powerSGD_hook as powerSGD,
     )

     model = MyModel()
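The hunk above only touches the imports. For orientation, here is a sketch of how the FP16 compress hook is typically attached, reusing the imports and ``model`` from the hunk and assuming the ``ddp_comm_hook`` argument of ``DDPPlugin`` is available in this Lightning version:

.. code-block:: python

    # Compress gradients to fp16 before all-reduce to reduce inter-node traffic.
    trainer = Trainer(
        gpus=4,
        plugins=DDPPlugin(ddp_comm_hook=default.fp16_compress_hook),
    )
    trainer.fit(model)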
@@ -664,7 +657,7 @@ Enable `PowerSGD for multi-node throughput improvement <https://pytorch.org/docs
                 start_powerSGD_iter=5000,
             ),
             ddp_comm_hook=powerSGD.powerSGD_hook,
-        )
+        ),
     )
     trainer.fit(model)

@@ -679,8 +672,8 @@ Combine hooks for accumulated benefit:
     from pytorch_lightning import Trainer
     from pytorch_lightning.plugins import DDPPlugin
     from torch.distributed.algorithms.ddp_comm_hooks import (
-        default_hooks as default,
-        powerSGD_hook as powerSGD,
+        default_hooks as default,
+        powerSGD_hook as powerSGD,
     )

     model = MyModel()
@@ -694,6 +687,6 @@ Combine hooks for accumulated benefit:
             ),
             ddp_comm_hook=powerSGD.powerSGD_hook,
             ddp_comm_wrapper=default.fp16_compress_wrapper,
-        )
+        ),
     )
     trainer.fit(model)