
Commit 8c09284

Merge branch 'main' into update_seq2seq_tutorial
2 parents c1a1cca + 001e1a5 commit 8c09284

15 files changed, +940 -366 lines

.jenkins/validate_tutorials_built.py

Lines changed: 1 addition & 1 deletion
@@ -31,6 +31,7 @@
     "prototype_source/vmap_recipe",
     "prototype_source/torchscript_freezing",
     "prototype_source/nestedtensor",
+    "prototype_source/gpu_direct_storage",  # requires specific filesystem + GPUDirect Storage to be set up
     "recipes_source/recipes/saving_and_loading_models_for_inference",
     "recipes_source/recipes/saving_multiple_models_in_one_file",
     "recipes_source/recipes/tensorboard_with_pytorch",
@@ -52,7 +53,6 @@
     "intermediate_source/tensorboard_profiler_tutorial",  # reenable after 2.0 release.
     "advanced_source/semi_structured_sparse",  # reenable after 3303 is fixed.
     "intermediate_source/torchrec_intro_tutorial",  # reenable after 3302 is fixed
-    "intermediate_source/memory_format_tutorial",  # causes other tutorials like torch_logs to fail; "state" issue, resetting dynamo didn't help
 ]
 
 def tutorial_source_dirs() -> List[Path]:
(binary file changed, 40.8 KB; preview not available)

beginner_source/basics/README.txt

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ Learn the Basics
     Tensors
     https://pytorch.org/tutorials/beginner/basics/tensor_tutorial.html
 
-4. dataquickstart_tutorial.py
+4. data_tutorial.py
     Datasets & DataLoaders
     https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
 
beginner_source/examples_autograd/polynomial_autograd.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 PyTorch: Tensors and autograd
 -------------------------------

beginner_source/examples_autograd/polynomial_custom_function.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 PyTorch: Defining New autograd Functions
 ----------------------------------------

conf.py

Lines changed: 40 additions & 17 deletions
@@ -33,8 +33,6 @@
 sys.path.insert(0, os.path.abspath('./.jenkins'))
 import pytorch_sphinx_theme
 import torch
-import numpy
-import gc
 import glob
 import random
 import shutil
@@ -49,6 +47,46 @@
 pio.renderers.default = 'sphinx_gallery'
 
 
+import sphinx_gallery.gen_rst
+import multiprocessing
+
+# Monkey patch sphinx gallery to run each example in an isolated process so that
+# we don't need to worry about examples changing global state.
+#
+# Alt option 1: Parallelism was added to sphinx gallery (in a later version than
+# we are using) via joblib, but it seems to result in errors for us, and it has
+# no effect with parallel = 1 (it will not run each file in its own process), so
+# you need parallel >= 2, and there may be tutorials that cannot run in parallel.
+#
+# Alt option 2: Run sphinx gallery once per file (similar to how we shard in CI,
+# but with shard sizes of 1); however, each sphinx gallery run has a ~5 min
+# overhead, so the entire suite would take ~2x as long.
+def call_fn(func, args, kwargs, result_queue):
+    try:
+        result = func(*args, **kwargs)
+        result_queue.put((True, result))
+    except Exception as e:
+        result_queue.put((False, str(e)))
+
+def call_in_subprocess(func):
+    def wrapper(*args, **kwargs):
+        result_queue = multiprocessing.Queue()
+        p = multiprocessing.Process(
+            target=call_fn,
+            args=(func, args, kwargs, result_queue)
+        )
+        p.start()
+        p.join()
+        success, result = result_queue.get()
+        if success:
+            return result
+        else:
+            raise RuntimeError(f"Error in subprocess: {result}")
+    return wrapper
+
+sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst)
+
 try:
     import torchvision
 except ImportError:
@@ -97,20 +135,6 @@
 
 # -- Sphinx-gallery configuration --------------------------------------------
 
-def reset_seeds(gallery_conf, fname):
-    torch.cuda.empty_cache()
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-    torch._dynamo.reset()
-    torch._inductor.config.force_disable_caches = True
-    torch.manual_seed(42)
-    torch.set_default_device(None)
-    random.seed(10)
-    numpy.random.seed(10)
-    torch.set_grad_enabled(True)
-
-    gc.collect()
-
 sphinx_gallery_conf = {
     'examples_dirs': ['beginner_source', 'intermediate_source',
                       'advanced_source', 'recipes_source', 'prototype_source'],
@@ -121,7 +145,6 @@ def reset_seeds(gallery_conf, fname):
     'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n"
                             "# https://pytorch.org/tutorials/beginner/colab\n"
                             "%matplotlib inline"),
-    'reset_modules': (reset_seeds),
     'ignore_pattern': r'_torch_export_nightly_tutorial.py',
     'pypandoc': {'extra_args': ['--mathjax', '--toc'],
                  'filters': ['.jenkins/custom_pandoc_filter.py'],
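For context on the isolation pattern this patch applies to sphinx_gallery.gen_rst.generate_file_rst, here is a minimal standalone sketch. It is illustrative only: run_example and isolated are hypothetical names, and unlike the committed code the sketch reads the queue before join(), the usual guard against the multiprocessing.Queue feeder-thread deadlock on large results.

    import multiprocessing
    import random

    def call_fn(func, args, kwargs, result_queue):
        # Child process: report (success, payload) back to the parent.
        try:
            result_queue.put((True, func(*args, **kwargs)))
        except Exception as e:
            result_queue.put((False, str(e)))

    def call_in_subprocess(func):
        def wrapper(*args, **kwargs):
            result_queue = multiprocessing.Queue()
            p = multiprocessing.Process(
                target=call_fn,
                args=(func, args, kwargs, result_queue),
            )
            p.start()
            # Read before join() so a large result cannot block the feeder thread.
            success, result = result_queue.get()
            p.join()
            if success:
                return result
            raise RuntimeError(f"Error in subprocess: {result}")
        return wrapper

    def run_example():
        # Hypothetical stand-in for generate_file_rst: any global state mutated
        # here (seeds, default device, compiler caches, ...) dies with the child.
        random.seed(0)
        return random.random()

    if __name__ == "__main__":
        isolated = call_in_subprocess(run_example)  # same monkey-patch shape as conf.py
        print(isolated())        # deterministic: the child seeded itself
        print(random.random())   # parent RNG untouched by the child's seed(0)

Because each example runs and exits in its own process, the per-example reset_seeds hook above becomes unnecessary, which is why this commit removes it.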

index.rst

Lines changed: 3 additions & 3 deletions
@@ -766,14 +766,14 @@ Welcome to PyTorch Tutorials
    :tags: Parallel-and-Distributed-Training
 
 .. customcarditem::
-   :header: Getting Started with Fully Sharded Data Parallel(FSDP)
-   :card_description: Learn how to train models with Fully Sharded Data Parallel package.
+   :header: Getting Started with Fully Sharded Data Parallel (FSDP2)
+   :card_description: Learn how to train models with Fully Sharded Data Parallel (fully_shard) package.
    :image: _static/img/thumbnails/cropped/Getting-Started-with-FSDP.png
    :link: intermediate/FSDP_tutorial.html
    :tags: Parallel-and-Distributed-Training
 
 .. customcarditem::
-   :header: Advanced Model Training with Fully Sharded Data Parallel (FSDP)
+   :header: Advanced Model Training with Fully Sharded Data Parallel (FSDP1)
    :card_description: Explore advanced model training with Fully Sharded Data Parallel package.
    :image: _static/img/thumbnails/cropped/Getting-Started-with-FSDP.png
    :link: intermediate/FSDP_advanced_tutorial.html
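For orientation on the renamed card: FSDP2 is driven by the fully_shard API rather than FSDP1's FullyShardedDataParallel wrapper class. Below is a minimal sketch of the call shape, assuming torch >= 2.6 (where fully_shard is exported from torch.distributed.fsdp), CUDA GPUs, and a torchrun launch; it is not code taken from the tutorial itself.

    import torch
    import torch.distributed as dist
    import torch.nn as nn
    from torch.distributed.fsdp import fully_shard  # FSDP2 entry point (torch >= 2.6 assumed)

    def main():
        # Assumes `torchrun --nproc_per_node=<N> this_script.py` set up the env vars.
        dist.init_process_group("nccl")
        torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

        model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4)).cuda()
        # Shard parameter-holding submodules first, then the root module,
        # so parameters are grouped (and communicated) per layer.
        for layer in model:
            if sum(p.numel() for p in layer.parameters()) > 0:
                fully_shard(layer)
        fully_shard(model)

        optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        loss = model(torch.randn(8, 16, device="cuda")).sum()
        loss.backward()
        optim.step()

        dist.destroy_process_group()

    if __name__ == "__main__":
        main()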
