|
147 | 147 | "Asynchronous vs. Synchronous Operations with `non_blocking=True` (CUDA `cudaMemcpyAsync`)\n",
|
148 | 148 | "-----------------------------------------------------------------------------------------\n",
|
149 | 149 | "\n",
|
150 |
| - "When executing a copy from a host (e.g., CPU) to a device (e.g., GPU),\n", |
151 |
| - "the CUDA toolkit offers modalities to do these operations synchronously\n", |
152 |
| - "or asynchronously with respect to the host.\n", |
| 150 | + "When executing a copy from a host (such as a CPU) to a device (such as\n", |
| 151 | + "a GPU), the CUDA toolkit provides ways to perform these operations\n", |
| 152 | + "synchronously or asynchronously with respect to the host.\n", |
153 | 153 | "\n",
|
154 | 154 | "In practice, when calling `~torch.Tensor.to`{.interpreted-text\n",
|
155 | 155 | "role=\"meth\"}, PyTorch always makes a call to\n",
|
|
696 | 696 | "Until now, we have operated under the assumption that asynchronous\n",
|
697 | 697 | "copies from the CPU to the GPU are safe. This is generally true because\n",
|
698 | 698 | "CUDA automatically handles synchronization to ensure that the data being\n",
|
699 |
| - "accessed is valid at read time. However, this guarantee does not extend\n", |
700 |
| - "to transfers in the opposite direction, from GPU to CPU. Without\n", |
701 |
| - "explicit synchronization, these transfers offer no assurance that the\n", |
702 |
| - "copy will be complete at the time of data access. Consequently, the data\n", |
703 |
| - "on the host might be incomplete or incorrect, effectively rendering it\n", |
704 |
| - "garbage:\n" |
| 699 | + "accessed is valid at read time **whenever the tensor is in pageable\n", |
| 700 | + "memory**.\n", |
| 701 | + "\n", |
| 702 | + "However, in other cases we cannot make the same assumption: when a\n", |
| 703 | + "tensor is placed in pinned memory, mutating the original copy after\n", |
| 704 | + "calling the host-to-device transfer may corrupt the data received on\n", |
| 705 | + "GPU. Similarly, when a transfer is achieved in the opposite direction,\n", |
| 706 | + "from GPU to CPU, or in any direction other than CPU to a CUDA-handled\n", |
| 707 | + "GPU (such as MPS), there is no guarantee that the data read on the\n", |
| 708 | + "target device is valid without explicit synchronization.\n", |
| 709 | + "\n", |
| 710 | + "In these scenarios, these transfers offer no assurance that the copy\n", |
| 711 | + "will be complete at the time of data access. Consequently, the data on\n", |
| 712 | + "the host might be incomplete or incorrect, effectively rendering it\n", |
| 713 | + "garbage.\n", |
| 714 | + "\n", |
| 715 | + "Let\\'s first demonstrate this with a pinned-memory tensor:\n" |
| 716 | + ] |
| 717 | + }, |
| 718 | + { |
| 719 | + "cell_type": "code", |
| 720 | + "execution_count": null, |
| 721 | + "metadata": { |
| 722 | + "collapsed": false |
| 723 | + }, |
| 724 | + "outputs": [], |
| 725 | + "source": [ |
| 726 | + "DELAY = 100000000\n", |
| 727 | + "try:\n", |
| 728 | + " i = -1\n", |
| 729 | + " for i in range(100):\n", |
| 730 | + "    # Create a tensor in pinned memory\n", |
| 731 | + " cpu_tensor = torch.ones(1024, 1024, pin_memory=True)\n", |
| 732 | + " torch.cuda.synchronize()\n", |
| 733 | + " # Send the tensor to CUDA\n", |
| 734 | + " cuda_tensor = cpu_tensor.to(\"cuda\", non_blocking=True)\n", |
| 735 | + " torch.cuda._sleep(DELAY)\n", |
| 736 | + " # Corrupt the original tensor\n", |
| 737 | + " cpu_tensor.zero_()\n", |
| 738 | + " assert (cuda_tensor == 1).all()\n", |
| 739 | + " print(\"No test failed with non_blocking and pinned tensor\")\n", |
| 740 | + "except AssertionError:\n", |
| 741 | + " print(f\"{i}th test failed with non_blocking and pinned tensor. Skipping remaining tests\")" |
| 742 | + ] |
| 743 | + }, |
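The corruption above can be avoided by synchronizing before mutating the source: once `torch.cuda.synchronize()` has returned, the host-to-device copy is complete and the pinned tensor can be reused safely. A minimal sketch (assuming a CUDA-capable PyTorch build; the script skips itself otherwise):

```python
# Sketch: synchronizing before mutating a pinned source tensor.
# Assumes a CUDA-capable PyTorch build; skips gracefully otherwise.
try:
    import torch
    HAS_CUDA = torch.cuda.is_available()
except ImportError:
    HAS_CUDA = False

failures = 0
if HAS_CUDA:
    for _ in range(100):
        cpu_tensor = torch.ones(1024, 1024, pin_memory=True)
        cuda_tensor = cpu_tensor.to("cuda", non_blocking=True)
        # Wait until the async copy has completed before touching the source
        torch.cuda.synchronize()
        cpu_tensor.zero_()
        if not (cuda_tensor == 1).all():
            failures += 1
    print(f"{failures} corrupted transfers after synchronizing")
else:
    print("CUDA not available; skipping demonstration")
```

A device-wide `torch.cuda.synchronize()` is coarse; in real code, recording a `torch.cuda.Event` after the copy and synchronizing on that event gives the same guarantee while stalling the host less.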
| 744 | + { |
| 745 | + "cell_type": "markdown", |
| 746 | + "metadata": {}, |
| 747 | + "source": [ |
| 748 | + "Using a pageable tensor always works:\n" |
| 749 | + ] |
| 750 | + }, |
| 751 | + { |
| 752 | + "cell_type": "code", |
| 753 | + "execution_count": null, |
| 754 | + "metadata": { |
| 755 | + "collapsed": false |
| 756 | + }, |
| 757 | + "outputs": [], |
| 758 | + "source": [ |
| 759 | + "i = -1\n", |
| 760 | + "for i in range(100):\n", |
| 761 | + "    # Create a (pageable) tensor\n", |
| 762 | + " cpu_tensor = torch.ones(1024, 1024)\n", |
| 763 | + " torch.cuda.synchronize()\n", |
| 764 | + " # Send the tensor to CUDA\n", |
| 765 | + " cuda_tensor = cpu_tensor.to(\"cuda\", non_blocking=True)\n", |
| 766 | + " torch.cuda._sleep(DELAY)\n", |
| 767 | + " # Corrupt the original tensor\n", |
| 768 | + " cpu_tensor.zero_()\n", |
| 769 | + " assert (cuda_tensor == 1).all()\n", |
| 770 | + "print(\"No test failed with non_blocking and pageable tensor\")" |
| 771 | + ] |
| 772 | + }, |
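As a side note, you can query whether a host tensor's storage is pinned with `Tensor.is_pinned()`. A quick sketch (the pinned case assumes CUDA is available, since pinning requires a CUDA context):

```python
# Sketch: checking whether a host tensor is in pinned or pageable memory.
import torch

pageable = torch.ones(4, 4)
print(pageable.is_pinned())  # ordinary pageable host memory -> False

if torch.cuda.is_available():
    pinned = torch.ones(4, 4, pin_memory=True)
    print(pinned.is_pinned())  # -> True
```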
| 773 | + { |
| 774 | + "cell_type": "markdown", |
| 775 | + "metadata": {}, |
| 776 | + "source": [ |
| 777 | + "Now let\\'s demonstrate that CUDA to CPU also fails to produce reliable\n", |
| 778 | + "outputs without synchronization:\n" |
705 | 779 | ]
|
706 | 780 | },
|
707 | 781 | {
|
|
747 | 821 | "cell_type": "markdown",
|
748 | 822 | "metadata": {},
|
749 | 823 | "source": [
|
750 |
| - "The same considerations apply to copies from the CPU to non-CUDA\n", |
751 |
| - "devices, such as MPS. Generally, asynchronous copies to a device are\n", |
752 |
| - "safe without explicit synchronization only when the target is a\n", |
753 |
| - "CUDA-enabled device.\n", |
| 824 | + "Generally, asynchronous copies to a device are safe without explicit\n", |
| 825 | + "synchronization only when the target is a CUDA-enabled device and the\n", |
| 826 | + "original tensor is in pageable memory.\n", |
754 | 827 | "\n",
|
755 | 828 | "In summary, copying data from CPU to GPU is safe when using\n",
|
756 | 829 | "`non_blocking=True`, but for any other direction, `non_blocking=True`\n",
|
|