Skip to content

Commit 5ae66ec

Browse files
author
Vincent Moens
committed
amend
1 parent d706405 commit 5ae66ec

File tree

1 file changed

+93
-10
lines changed

1 file changed

+93
-10
lines changed

intermediate_source/pinmem_nonblock.py

Lines changed: 93 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,60 @@
163163
del tensor_pageable, tensor_pinned
164164
gc.collect()
165165

166+
######################################################################
167+
# Another size (TODO: Remove the one less convincing)
168+
tensor_pageable = torch.randn(1_000_000)
169+
170+
tensor_pinned = torch.randn(1_000_000, pin_memory=True)
171+
172+
print(
173+
"Regular to(device)",
174+
Timer("tensor_pageable.to('cuda:0')", globals=globals()).adaptive_autorange(),
175+
)
176+
print(
177+
"Pinned to(device)",
178+
Timer("tensor_pinned.to('cuda:0')", globals=globals()).adaptive_autorange(),
179+
)
180+
print(
181+
"pin_memory() along",
182+
Timer("tensor_pageable.pin_memory()", globals=globals()).adaptive_autorange(),
183+
)
184+
print(
185+
"pin_memory() + to(device)",
186+
Timer(
187+
"tensor_pageable.pin_memory().to('cuda:0')", globals=globals()
188+
).adaptive_autorange(),
189+
)
190+
del tensor_pageable, tensor_pinned
191+
gc.collect()
192+
193+
194+
######################################################################
195+
# Another size (TODO: Remove the one less convincing)
196+
tensor_pageable = torch.randn(10_000)
197+
198+
tensor_pinned = torch.randn(10_000, pin_memory=True)
199+
200+
print(
201+
"Regular to(device)",
202+
Timer("tensor_pageable.to('cuda:0')", globals=globals()).adaptive_autorange(),
203+
)
204+
print(
205+
"Pinned to(device)",
206+
Timer("tensor_pinned.to('cuda:0')", globals=globals()).adaptive_autorange(),
207+
)
208+
print(
209+
"pin_memory() along",
210+
Timer("tensor_pageable.pin_memory()", globals=globals()).adaptive_autorange(),
211+
)
212+
print(
213+
"pin_memory() + to(device)",
214+
Timer(
215+
"tensor_pageable.pin_memory().to('cuda:0')", globals=globals()
216+
).adaptive_autorange(),
217+
)
218+
del tensor_pageable, tensor_pinned
219+
gc.collect()
166220

167221
######################################################################
168222
# We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under
@@ -393,17 +447,46 @@ def pin_copy_to_device_nonblocking(*tensors):
393447
import torch
394448
from torch.utils.benchmark import Timer
395449

396-
td = TensorDict({str(i): torch.randn(1_000_000) for i in range(100)})
450+
for s0 in (100, 1000, 10_000, 1_000_000):
451+
for s1 in (10, 100, 1000):
452+
print("\n\n\n\n", s0, s1)
453+
td = TensorDict({str(i): torch.randn(s0) for i in range(s1)})
397454

398-
print(
399-
Timer("td.to('cuda:0', non_blocking=False)", globals=globals()).adaptive_autorange()
400-
)
401-
print(Timer("td.to('cuda:0')", globals=globals()).adaptive_autorange())
402-
print(
403-
Timer(
404-
"td.to('cuda:0', non_blocking=True, non_blocking_pin=True)", globals=globals()
405-
).adaptive_autorange()
406-
)
455+
print(
456+
Timer("td.to('cuda:0', non_blocking=False)", globals=globals()).adaptive_autorange()
457+
)
458+
print(Timer("td.to('cuda:0')", globals=globals()).adaptive_autorange())
459+
print(torch.get_num_threads())
460+
print(
461+
Timer(
462+
"td.to('cuda:0', non_blocking_pin=True, num_threads=2)", globals=globals()
463+
).adaptive_autorange()
464+
)
465+
print(
466+
Timer(
467+
"td.to('cuda:0', non_blocking_pin=True, num_threads=4)", globals=globals()
468+
).adaptive_autorange()
469+
)
470+
print(
471+
Timer(
472+
"td.to('cuda:0', non_blocking_pin=True, num_threads=8)", globals=globals()
473+
).adaptive_autorange()
474+
)
475+
print(
476+
Timer(
477+
"td.to('cuda:0', non_blocking_pin=True, num_threads=16)", globals=globals()
478+
).adaptive_autorange()
479+
)
480+
print(
481+
Timer(
482+
"td.to('cuda:0', non_blocking_pin=True, num_threads=32)", globals=globals()
483+
).adaptive_autorange()
484+
)
485+
print(
486+
Timer(
487+
"td.to('cuda:0', non_blocking_pin=True, num_threads=64)", globals=globals()
488+
).adaptive_autorange()
489+
)
407490

408491

409492
######################################################################

0 commit comments

Comments
 (0)