amend

Vincent Moens · Vincent Moens · commit 8a6f90de83f1 · 2024-07-26T13:56:35.000+01:00
diff --git a/intermediate_source/pinmem_nonblock.py b/intermediate_source/pinmem_nonblock.py
@@ -487,7 +487,7 @@ def pin_copy_to_device_nonblocking(*tensors):
 #
 # Additionally, ``TensorDict.to()`` includes a ``non_blocking_pin`` option which initiates multiple threads to execute
 # ``pin_memory()`` before proceeding with ``to(device)``.
-# This approach can further accelerate data transfers, as demonstrated in the following example:
+# This approach can further accelerate data transfers, as demonstrated in the following example.
 #
 # .. code-block:: bash
 #
@@ -536,6 +536,11 @@ def pin_copy_to_device_nonblocking(*tensors):
 plt.show()
 
 ######################################################################
+# In this example, we are transferring many large tensors from the CPU to the GPU.
+# This scenario is ideal for utilizing multithreaded ``pin_memory()``, which can significantly enhance performance.
+# However, if the tensors are small, the overhead associated with multithreading may outweigh the benefits.
+# Similarly, if there are only a few tensors, the advantages of pinning tensors on separate threads become limited.
+#
 # As an additional note, while it might seem advantageous to create permanent buffers in pinned memory to shuttle
 # tensors from pageable memory before transferring them to the GPU, this strategy does not necessarily expedite
 # computation. The inherent bottleneck caused by copying data into pinned memory remains a limiting factor.

Original file line number	Diff line number	Diff line change
`@@ -487,7 +487,7 @@ def pin_copy_to_device_nonblocking(*tensors):`
`487`	`487`	`#`
`488`	`488`	# Additionally, ``TensorDict.to()`` includes a ``non_blocking_pin`` option which initiates multiple threads to execute
`489`	`489`	# ``pin_memory()`` before proceeding with ``to(device)``.
`490`		`-# This approach can further accelerate data transfers, as demonstrated in the following example:`
	`490`	`+# This approach can further accelerate data transfers, as demonstrated in the following example.`
`491`	`491`	`#`
`492`	`492`	`# .. code-block:: bash`
`493`	`493`	`#`
`@@ -536,6 +536,11 @@ def pin_copy_to_device_nonblocking(*tensors):`
`536`	`536`	`plt.show()`
`537`	`537`
`538`	`538`	`######################################################################`
	`539`	`+# In this example, we are transferring many large tensors from the CPU to the GPU.`
	`540`	+# This scenario is ideal for utilizing multithreaded ``pin_memory()``, which can significantly enhance performance.
	`541`	`+# However, if the tensors are small, the overhead associated with multithreading may outweigh the benefits.`
	`542`	`+# Similarly, if there are only a few tensors, the advantages of pinning tensors on separate threads become limited.`
	`543`	`+#`
`539`	`544`	`# As an additional note, while it might seem advantageous to create permanent buffers in pinned memory to shuttle`
`540`	`545`	`# tensors from pageable memory before transferring them to the GPU, this strategy does not necessarily expedite`
`541`	`546`	`# computation. The inherent bottleneck caused by copying data into pinned memory remains a limiting factor.`