@@ -512,12 +512,48 @@ def pin_copy_to_device_nonblocking(*tensors):
512
512
#
513
513
# Until now, we have operated under the assumption that asynchronous copies from the CPU to the GPU are safe.
514
514
# This is generally true because CUDA automatically handles synchronization to ensure that the data being accessed is
515
- # valid at read time.
516
- # However, this guarantee does not extend to transfers in the opposite direction, from GPU to CPU.
517
- # Without explicit synchronization, these transfers offer no assurance that the copy will be complete at the time of
518
- # data access. Consequently, the data on the host might be incomplete or incorrect, effectively rendering it garbage:
515
+ # valid at read time __whenever the tensor is in pageable memory__.
519
516
#
517
+ # However, in other cases we cannot make the same assumption: when a tensor is placed in pinned memory, mutating the
518
+ # original copy after calling the host-to-device transfer may corrupt the data received on GPU.
519
+ # Similarly, when a transfer is achieved in the opposite direction, from GPU to CPU, or from any device that is not CPU
520
+ # or GPU to any device that is not a CUDA-handled GPU (e.g., MPS), there is no guarantee that the data read on GPU is
521
+ # valid without explicit synchronization.
522
+ #
523
+ # In these scenarios, these transfers offer no assurance that the copy will be complete at the time of
524
+ # data access. Consequently, the data on the host might be incomplete or incorrect, effectively rendering it garbage.
525
+ #
526
+ # Let's first demonstrate this with a pinned-memory tensor:
527
# Demonstrate that mutating a *pinned* source tensor after a non-blocking
# host-to-device copy can corrupt the data received on the GPU: the copy may
# still be in flight when ``zero_()`` runs, so the assertion can fail.
try:
    i = -1
    for i in range(100):
        # Create a tensor in pinned memory
        cpu_tensor = torch.ones(1024, 1024, pin_memory=True)
        # Send the tensor to CUDA
        cuda_tensor = cpu_tensor.to("cuda", non_blocking=True)
        # Corrupt the original tensor
        cpu_tensor.zero_()
        assert (cuda_tensor == 1).all()
    print("No test failed with non_blocking")
except AssertionError:
    # ``i`` records the last attempted iteration (-1 if the loop never ran).
    print(f"{i}th test failed with non_blocking. Skipping remaining tests")
540
+
541
+ ######################################################################
542
+ # Using a pageable tensor always works:
543
+ #
544
+
545
# Counterpart demo: with a *pageable* source tensor, CUDA blocks the copy
# until the staging is safe, so mutating the source afterwards never corrupts
# the device copy — all 100 assertions pass.
i = -1
for i in range(100):
    # Create a tensor in pageable (non-pinned) memory
    cpu_tensor = torch.ones(1024, 1024)
    # Send the tensor to CUDA
    cuda_tensor = cpu_tensor.to("cuda", non_blocking=True)
    # Corrupt the original tensor
    cpu_tensor.zero_()
    assert (cuda_tensor == 1).all()
520
554
555
+ ######################################################################
556
+ # Now let's demonstrate that CUDA to CPU also fails to produce reliable outputs without synchronization:
521
557
522
558
tensor = (
523
559
torch .arange (1 , 1_000_000 , dtype = torch .double , device = "cuda" )
@@ -551,9 +587,8 @@ def pin_copy_to_device_nonblocking(*tensors):
551
587
552
588
553
589
######################################################################
554
- # The same considerations apply to copies from the CPU to non-CUDA devices, such as MPS.
555
590
# Generally, asynchronous copies to a device are safe without explicit synchronization only when the target is a
556
- # CUDA-enabled device.
591
+ # CUDA-enabled device and the original tensor is in pageable memory.
557
592
#
558
593
# In summary, copying data from CPU to GPU is safe when using ``non_blocking=True``, but for any other direction,
559
594
# ``non_blocking=True`` can still be used but the user must make sure that a device synchronization is executed before
0 commit comments