Skip to content

Commit db3b467

Browse files
carmocca authored and lexierule committed
Fix pre-commit blacken-docs failures (#8624)
(cherry picked from commit 93784da)
1 parent 41ac9e2 commit db3b467

File tree

18 files changed

+551
-539
lines changed

18 files changed

+551
-539
lines changed

docs/source/advanced/advanced_gpu.rst

Lines changed: 52 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ To use Sharded Training, you need to first install FairScale using the command b
6767
.. code-block:: python
6868
6969
# train using Sharded DDP
70-
trainer = Trainer(plugins='ddp_sharded')
70+
trainer = Trainer(plugins="ddp_sharded")
7171
7272
Sharded Training can work across all DDP variants by adding the additional ``--plugins ddp_sharded`` flag.
7373

@@ -123,46 +123,32 @@ Below is an example of using both ``wrap`` and ``auto_wrap`` to create your mode
123123
from pytorch_lightning import Trainer
124124
from fairscale.nn import checkpoint_wrapper, auto_wrap, wrap
125125
126+
126127
class MyModel(pl.LightningModule):
127128
...
129+
128130
def configure_sharded_model(self):
129131
# Created within sharded model context, modules are instantly sharded across processes
130132
# as soon as they are wrapped with ``wrap`` or ``auto_wrap``
131133
132-
# Wraps the layer in a Fully Sharded Wrapper automatically
134+
# Wraps the layer in a Fully Sharded Wrapper automatically
133135
linear_layer = wrap(nn.Linear(32, 32))
134136
135137
# Wraps the module recursively
136138
# based on a minimum number of parameters (default 100M parameters)
137-
block = auto_wrap(
138-
nn.Sequential(
139-
nn.Linear(32, 32),
140-
nn.ReLU()
141-
)
142-
)
139+
block = auto_wrap(nn.Sequential(nn.Linear(32, 32), nn.ReLU()))
143140
144141
# For best memory efficiency,
145142
# add fairscale activation checkpointing
146-
final_block = auto_wrap(
147-
checkpoint_wrapper(
148-
nn.Sequential(
149-
nn.Linear(32, 32),
150-
nn.ReLU()
151-
)
152-
)
153-
)
154-
self.model = nn.Sequential(
155-
linear_layer,
156-
nn.ReLU(),
157-
block,
158-
final_block
159-
)
143+
final_block = auto_wrap(checkpoint_wrapper(nn.Sequential(nn.Linear(32, 32), nn.ReLU())))
144+
self.model = nn.Sequential(linear_layer, nn.ReLU(), block, final_block)
160145
161146
def configure_optimizers(self):
162147
return torch.optim.AdamW(self.model.parameters())
163148
149+
164150
model = MyModel()
165-
trainer = Trainer(gpus=4, plugins='fsdp', precision=16)
151+
trainer = Trainer(gpus=4, plugins="fsdp", precision=16)
166152
trainer.fit(model)
167153
168154
trainer.test()
@@ -246,7 +232,7 @@ As a result, benefits can also be seen on a single GPU. Do note that the default
246232
from pytorch_lightning import Trainer
247233
248234
model = MyModel()
249-
trainer = Trainer(gpus=4, plugins='deepspeed_stage_2', precision=16)
235+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_2", precision=16)
250236
trainer.fit(model)
251237
252238
.. code-block:: bash
@@ -270,7 +256,7 @@ Below we show an example of running `ZeRO-Offload <https://www.deepspeed.ai/tuto
270256
from pytorch_lightning.plugins import DeepSpeedPlugin
271257
272258
model = MyModel()
273-
trainer = Trainer(gpus=4, plugins='deepspeed_stage_2_offload', precision=16)
259+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_2_offload", precision=16)
274260
trainer.fit(model)
275261
276262
@@ -289,7 +275,9 @@ You can also modify the ZeRO-Offload parameters via the plugin as below.
289275
from pytorch_lightning.plugins import DeepSpeedPlugin
290276
291277
model = MyModel()
292-
trainer = Trainer(gpus=4, plugins=DeepSpeedPlugin(cpu_offload=True, allgather_bucket_size=5e8, reduce_bucket_size=5e8), precision=16)
278+
trainer = Trainer(
279+
gpus=4, plugins=DeepSpeedPlugin(cpu_offload=True, allgather_bucket_size=5e8, reduce_bucket_size=5e8), precision=16
280+
)
293281
trainer.fit(model)
294282
295283
@@ -310,14 +298,17 @@ For even more speed benefit, DeepSpeed offers an optimized CPU version of ADAM c
310298
from pytorch_lightning.plugins import DeepSpeedPlugin
311299
from deepspeed.ops.adam import DeepSpeedCPUAdam
312300
301+
313302
class MyModel(pl.LightningModule):
314303
...
304+
315305
def configure_optimizers(self):
316306
# DeepSpeedCPUAdam provides 5x to 7x speedup over torch.optim.adam(w)
317307
return DeepSpeedCPUAdam(self.parameters())
318308
309+
319310
model = MyModel()
320-
trainer = Trainer(gpus=4, plugins='deepspeed_stage_2_offload' precision=16)
311+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_2_offload", precision=16)
321312
trainer.fit(model)
322313
323314
@@ -347,13 +338,16 @@ Also please have a look at our :ref:`deepspeed-zero-stage-3-tips` which contains
347338
from pytorch_lightning.plugins import DeepSpeedPlugin
348339
from deepspeed.ops.adam import FusedAdam
349340
341+
350342
class MyModel(pl.LightningModule):
351343
...
344+
352345
def configure_optimizers(self):
353346
return FusedAdam(self.parameters())
354347
348+
355349
model = MyModel()
356-
trainer = Trainer(gpus=4, plugins='deepspeed_stage_3', precision=16)
350+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_3", precision=16)
357351
trainer.fit(model)
358352
359353
trainer.test()
@@ -377,8 +371,10 @@ This reduces the time taken to initialize very large models, as well as ensure w
377371
from pytorch_lightning.plugins import DeepSpeedPlugin
378372
from deepspeed.ops.adam import FusedAdam
379373
374+
380375
class MyModel(pl.LightningModule):
381376
...
377+
382378
def configure_sharded_model(self):
383379
# Created within sharded model context, modules are instantly sharded across processes
384380
# as soon as they are made.
@@ -387,8 +383,9 @@ This reduces the time taken to initialize very large models, as well as ensure w
387383
def configure_optimizers(self):
388384
return FusedAdam(self.parameters())
389385
386+
390387
model = MyModel()
391-
trainer = Trainer(gpus=4, plugins='deepspeed_stage_3', precision=16)
388+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_3", precision=16)
392389
trainer.fit(model)
393390
394391
trainer.test()
@@ -409,7 +406,7 @@ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to re
409406
410407
# Enable CPU Offloading
411408
model = MyModel()
412-
trainer = Trainer(gpus=4, plugins='deepspeed_stage_3_offload', precision=16)
409+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_3_offload", precision=16)
413410
trainer.fit(model)
414411
415412
# Enable CPU Offloading, and offload parameters to CPU
@@ -421,7 +418,7 @@ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to re
421418
offload_optimizer=True,
422419
offload_parameters=True,
423420
),
424-
precision=16
421+
precision=16,
425422
)
426423
trainer.fit(model)
427424
@@ -438,7 +435,7 @@ Additionally, DeepSpeed supports offloading to NVMe drives for even larger model
438435
439436
# Enable CPU Offloading
440437
model = MyModel()
441-
trainer = Trainer(gpus=4, plugins='deepspeed_stage_3_offload', precision=16)
438+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_3_offload", precision=16)
442439
trainer.fit(model)
443440
444441
# Enable CPU Offloading, and offload parameters to CPU
@@ -449,12 +446,12 @@ Additionally, DeepSpeed supports offloading to NVMe drives for even larger model
449446
stage=3,
450447
offload_optimizer=True,
451448
offload_parameters=True,
452-
remote_device='nvme',
453-
offload_params_device='nvme',
454-
offload_optimizer_device='nvme',
455-
nvme_path = '/local_nvme'
449+
remote_device="nvme",
450+
offload_params_device="nvme",
451+
offload_optimizer_device="nvme",
452+
nvme_path="/local_nvme",
456453
),
457-
precision=16
454+
precision=16,
458455
)
459456
trainer.fit(model)
460457
@@ -492,21 +489,17 @@ This saves memory when training larger models however requires using a checkpoin
492489
model = MyModel()
493490
494491
495-
trainer = Trainer(
496-
gpus=4,
497-
plugins='deepspeed_stage_3_offload',
498-
precision=16
499-
)
492+
trainer = Trainer(gpus=4, plugins="deepspeed_stage_3_offload", precision=16)
500493
501494
# Enable CPU Activation Checkpointing
502495
trainer = Trainer(
503496
gpus=4,
504497
plugins=DeepSpeedPlugin(
505498
stage=3,
506499
cpu_offload=True, # Enable CPU Offloading
507-
cpu_checkpointing=True # (Optional) offload activations to CPU
500+
cpu_checkpointing=True, # (Optional) offload activations to CPU
508501
),
509-
precision=16
502+
precision=16,
510503
)
511504
trainer.fit(model)
512505
@@ -549,23 +542,23 @@ In some cases you may want to define your own DeepSpeed Config, to access all pa
549542
"cuda_aware": True,
550543
},
551544
},
552-
'scheduler': {
545+
"scheduler": {
553546
"type": "WarmupLR",
554547
"params": {
555548
"last_batch_iteration": -1,
556549
"warmup_min_lr": 0,
557550
"warmup_max_lr": 3e-5,
558551
"warmup_num_steps": 100,
559-
}
552+
},
560553
},
561554
"zero_optimization": {
562-
"stage": 2, # Enable Stage 2 ZeRO (Optimizer/Gradient state partitioning)
563-
"cpu_offload": True, # Enable Offloading optimizer state/calculation to the host CPU
564-
"contiguous_gradients": True, # Reduce gradient fragmentation.
565-
"overlap_comm": True, # Overlap reduce/backward operation of gradients for speed.
566-
"allgather_bucket_size": 2e8, # Number of elements to all gather at once.
567-
"reduce_bucket_size": 2e8, # Number of elements we reduce/allreduce at once.
568-
}
555+
"stage": 2, # Enable Stage 2 ZeRO (Optimizer/Gradient state partitioning)
556+
"cpu_offload": True, # Enable Offloading optimizer state/calculation to the host CPU
557+
"contiguous_gradients": True, # Reduce gradient fragmentation.
558+
"overlap_comm": True, # Overlap reduce/backward operation of gradients for speed.
559+
"allgather_bucket_size": 2e8, # Number of elements to all gather at once.
560+
"reduce_bucket_size": 2e8, # Number of elements we reduce/allreduce at once.
561+
},
569562
}
570563
571564
model = MyModel()
@@ -634,8 +627,8 @@ Enable `FP16 Compress Hook for multi-node throughput improvement <https://pytorc
634627
from pytorch_lightning import Trainer
635628
from pytorch_lightning.plugins import DDPPlugin
636629
from torch.distributed.algorithms.ddp_comm_hooks import (
637-
default_hooks as default,
638-
powerSGD_hook as powerSGD,
630+
default_hooks as default,
631+
powerSGD_hook as powerSGD,
639632
)
640633
641634
model = MyModel()
@@ -664,7 +657,7 @@ Enable `PowerSGD for multi-node throughput improvement <https://pytorch.org/docs
664657
start_powerSGD_iter=5000,
665658
),
666659
ddp_comm_hook=powerSGD.powerSGD_hook,
667-
)
660+
),
668661
)
669662
trainer.fit(model)
670663
@@ -679,8 +672,8 @@ Combine hooks for accumulated benefit:
679672
from pytorch_lightning import Trainer
680673
from pytorch_lightning.plugins import DDPPlugin
681674
from torch.distributed.algorithms.ddp_comm_hooks import (
682-
default_hooks as default,
683-
powerSGD_hook as powerSGD,
675+
default_hooks as default,
676+
powerSGD_hook as powerSGD,
684677
)
685678
686679
model = MyModel()
@@ -694,6 +687,6 @@ Combine hooks for accumulated benefit:
694687
),
695688
ddp_comm_hook=powerSGD.powerSGD_hook,
696689
ddp_comm_wrapper=default.fp16_compress_wrapper,
697-
)
690+
),
698691
)
699692
trainer.fit(model)

0 commit comments

Comments
 (0)