
Commit 698ca13

Merge branch 'main' into svekars-patch-28
2 parents b225a86 + 400f5d6 commit 698ca13

File tree

5 files changed: +10 -15 lines changed


beginner_source/basics/saveloadrun_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -57,8 +57,8 @@
 ########################
 # We can then load the model as demonstrated below.
 #
-# As described in `Saving and loading torch.nn.Modules <pytorch.org/docs/main/notes/serialization.html#saving-and-loading-torch-nn-modules>`__,
-# saving ``state_dict``s is considered the best practice. However,
+# As described in `Saving and loading torch.nn.Modules <https://pytorch.org/docs/main/notes/serialization.html#saving-and-loading-torch-nn-modules>`_,
+# saving ``state_dict`` is considered the best practice. However,
 # below we use ``weights_only=False`` because this involves loading the
 # model, which is a legacy use case for ``torch.save``.
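For reference, the practice the corrected comment describes looks roughly like this; a minimal sketch, assuming the tutorial's ``vgg16`` model and hypothetical checkpoint file names:

    import torch
    import torchvision.models as models

    model = models.vgg16(weights="IMAGENET1K_V1")

    # Best practice: persist only the learned parameters (the state_dict)
    # and load them back through the safe weights_only=True path.
    torch.save(model.state_dict(), "model_weights.pth")
    model.load_state_dict(torch.load("model_weights.pth", weights_only=True))

    # Legacy use case the diff refers to: pickling the whole module object,
    # which requires weights_only=False at load time; only do this with
    # checkpoints from a source you trust.
    torch.save(model, "model.pth")
    model = torch.load("model.pth", weights_only=False)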

beginner_source/introyt/modelsyt_tutorial.py

Lines changed: 1 addition & 3 deletions
@@ -311,9 +311,7 @@ def forward(self, sentence):
 # ``TransformerDecoder``) and subcomponents (``TransformerEncoderLayer``,
 # ``TransformerDecoderLayer``). For details, check out the
 # `documentation <https://pytorch.org/docs/stable/nn.html#transformer-layers>`__
-# on transformer classes, and the relevant
-# `tutorial <https://pytorch.org/tutorials/beginner/transformer_tutorial.html>`__
-# on pytorch.org.
+# on transformer classes.
 #
 # Other Layers and Functions
 # --------------------------

prototype_source/vulkan_workflow.rst

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+.. note::
+   PyTorch Vulkan Backend is no longer maintained. Please review the `ExecuTorch Vulkan Delegate <https://pytorch.org/executorch/stable/native-delegates-executorch-vulkan-delegate.html>`_ implementation instead.
+
 PyTorch Vulkan Backend User Workflow
 ====================================
 

recipes_source/distributed_checkpoint_recipe.rst

Lines changed: 3 additions & 9 deletions
@@ -82,7 +82,7 @@ Now, let's create a toy module, wrap it with FSDP, feed it with some dummy input
 
         def state_dict(self):
             # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT
-            model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer)
+            model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
             return {
                 "model": model_state_dict,
                 "optim": optimizer_state_dict
@@ -178,6 +178,7 @@ The reason that we need the ``state_dict`` prior to loading is:
     import torch
     import torch.distributed as dist
     import torch.distributed.checkpoint as dcp
+    from torch.distributed.checkpoint.stateful import Stateful
     from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
     import torch.multiprocessing as mp
     import torch.nn as nn
@@ -202,7 +203,7 @@ The reason that we need the ``state_dict`` prior to loading is:
 
         def state_dict(self):
             # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT
-            model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer)
+            model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
             return {
                 "model": model_state_dict,
                 "optim": optimizer_state_dict
@@ -252,13 +253,6 @@ The reason that we need the ``state_dict`` prior to loading is:
     optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
 
     state_dict = { "app": AppState(model, optimizer)}
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
-    # generates the state dict we will load into
-    model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer)
-    state_dict = {
-        "model": model_state_dict,
-        "optimizer": optimizer_state_dict
-    }
     dcp.load(
         state_dict=state_dict,
         checkpoint_id=CHECKPOINT_DIR,
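Taken together, these hunks make ``AppState.state_dict`` read the instance's own members, import the ``Stateful`` protocol that ``AppState`` implements, and drop the redundant manual state dict that shadowed the ``AppState`` wrapper before ``dcp.load``. A minimal sketch of the resulting pattern, assuming ``model``, ``optimizer``, and ``CHECKPOINT_DIR`` are set up as elsewhere in the recipe:

    import torch.distributed.checkpoint as dcp
    from torch.distributed.checkpoint.stateful import Stateful
    from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict

    class AppState(Stateful):
        """Lets DCP drive checkpointing of the model and optimizer together."""

        def __init__(self, model, optimizer):
            self.model = model
            self.optimizer = optimizer

        def state_dict(self):
            # get_state_dict manages FSDP FQNs and the sharded state dict type;
            # note the self. references that the commit fixes.
            model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
            return {"model": model_state_dict, "optim": optimizer_state_dict}

        def load_state_dict(self, state_dict):
            # sets the state back onto the wrapped model and optimizer in place
            set_state_dict(
                self.model,
                self.optimizer,
                model_state_dict=state_dict["model"],
                optim_state_dict=state_dict["optim"],
            )

    state_dict = {"app": AppState(model, optimizer)}
    dcp.save(state_dict=state_dict, checkpoint_id=CHECKPOINT_DIR)  # write the checkpoint
    dcp.load(state_dict=state_dict, checkpoint_id=CHECKPOINT_DIR)  # restore in place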

recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ def add_fn(x, y):
 # * **Tensor Subclasses:** Currently, there is no support for
 #   tensor subclasses and other advanced features.
 # * **Triton Features:** While ``triton.heuristics`` can be used either standalone or
-#   before ``triton.autotune``, it cannot be used after ```triton.autotune``. This
+#   before ``triton.autotune``, it cannot be used after ``triton.autotune``. This
 #   implies that if ``triton.heuristics`` and ``triton.autotune`` are to be used
 #   together, ``triton.heuristics`` must be used first.
 #
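The fix here only removes a stray third backtick, but the constraint it documents is worth illustrating: Python decorators apply bottom-up, so using ``triton.heuristics`` first means placing it below ``triton.autotune`` in the stack. A minimal sketch, with made-up autotune configs and a hypothetical ``EVEN_N`` heuristic:

    import torch
    import triton
    import triton.language as tl

    @triton.autotune(
        configs=[
            triton.Config({"BLOCK_SIZE": 128}, num_warps=4),
            triton.Config({"BLOCK_SIZE": 256}, num_warps=8),
        ],
        key=["n_elements"],
    )
    @triton.heuristics({"EVEN_N": lambda args: args["n_elements"] % args["BLOCK_SIZE"] == 0})
    @triton.jit  # applied first, then heuristics, then autotune
    def add_kernel(x_ptr, y_ptr, out_ptr, n_elements,
                   BLOCK_SIZE: tl.constexpr, EVEN_N: tl.constexpr):
        pid = tl.program_id(axis=0)
        offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        if EVEN_N:
            # boundary mask can be skipped when BLOCK_SIZE divides n_elements
            tl.store(out_ptr + offsets,
                     tl.load(x_ptr + offsets) + tl.load(y_ptr + offsets))
        else:
            mask = offsets < n_elements
            tl.store(out_ptr + offsets,
                     tl.load(x_ptr + offsets, mask=mask) + tl.load(y_ptr + offsets, mask=mask),
                     mask=mask)

    x, y = torch.randn(4096, device="cuda"), torch.randn(4096, device="cuda")
    out = torch.empty_like(x)
    grid = lambda meta: (triton.cdiv(x.numel(), meta["BLOCK_SIZE"]),)
    add_kernel[grid](x, y, out, x.numel())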
