|
163 | 163 | del tensor_pageable, tensor_pinned
|
164 | 164 | gc.collect()
|
165 | 165 |
|
| 166 | +###################################################################### |
| 167 | +# Another tensor size (TODO: remove the less convincing one) |
| 168 | +tensor_pageable = torch.randn(1_000_000) |
| 169 | + |
| 170 | +tensor_pinned = torch.randn(1_000_000, pin_memory=True) |
| 171 | + |
| 172 | +print( |
| 173 | + "Regular to(device)", |
| 174 | + Timer("tensor_pageable.to('cuda:0')", globals=globals()).adaptive_autorange(), |
| 175 | +) |
| 176 | +print( |
| 177 | + "Pinned to(device)", |
| 178 | + Timer("tensor_pinned.to('cuda:0')", globals=globals()).adaptive_autorange(), |
| 179 | +) |
| 180 | +print( |
| 181 | +    "pin_memory() alone", |
| 182 | + Timer("tensor_pageable.pin_memory()", globals=globals()).adaptive_autorange(), |
| 183 | +) |
| 184 | +print( |
| 185 | + "pin_memory() + to(device)", |
| 186 | + Timer( |
| 187 | + "tensor_pageable.pin_memory().to('cuda:0')", globals=globals() |
| 188 | + ).adaptive_autorange(), |
| 189 | +) |
| 190 | +del tensor_pageable, tensor_pinned |
| 191 | +gc.collect() |
| 192 | + |
| 193 | + |
| 194 | +###################################################################### |
| 195 | +# Another tensor size (TODO: remove the less convincing one) |
| 196 | +tensor_pageable = torch.randn(10_000) |
| 197 | + |
| 198 | +tensor_pinned = torch.randn(10_000, pin_memory=True) |
| 199 | + |
| 200 | +print( |
| 201 | + "Regular to(device)", |
| 202 | + Timer("tensor_pageable.to('cuda:0')", globals=globals()).adaptive_autorange(), |
| 203 | +) |
| 204 | +print( |
| 205 | + "Pinned to(device)", |
| 206 | + Timer("tensor_pinned.to('cuda:0')", globals=globals()).adaptive_autorange(), |
| 207 | +) |
| 208 | +print( |
| 209 | +    "pin_memory() alone", |
| 210 | + Timer("tensor_pageable.pin_memory()", globals=globals()).adaptive_autorange(), |
| 211 | +) |
| 212 | +print( |
| 213 | + "pin_memory() + to(device)", |
| 214 | + Timer( |
| 215 | + "tensor_pageable.pin_memory().to('cuda:0')", globals=globals() |
| 216 | + ).adaptive_autorange(), |
| 217 | +) |
| 218 | +del tensor_pageable, tensor_pinned |
| 219 | +gc.collect() |
166 | 220 |
|
167 | 221 | ######################################################################
|
168 | 222 | # We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under
|
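The observation above rests on how pageable host memory reaches the device: the driver first stages the data in page-locked (pinned) memory, and only then can the DMA engine transfer it to the GPU. Below is a minimal sketch of that implicit two-step path, using only the `torch` calls already present in this diff; the helper name is purely illustrative, and a CUDA device is assumed, as in the rest of the tutorial.

import torch

def staged_copy_to_cuda(t: torch.Tensor) -> torch.Tensor:
    # Illustrative helper, not part of the tutorial or of the PyTorch API.
    # Step 1: copy the pageable tensor into a page-locked (pinned) staging buffer.
    staging = t.pin_memory()
    # Step 2: the DMA engine can move the pinned buffer to the device asynchronously.
    out = staging.to("cuda:0", non_blocking=True)
    torch.cuda.synchronize()  # wait for the asynchronous transfer to finish
    return out

x = torch.randn(1_000_000)
y = staged_copy_to_cuda(x)
assert y.is_cuda

Starting from an already pinned tensor skips step 1, which is why the "Pinned to(device)" timing beats the pageable one, while "pin_memory() + to(device)" pays for both steps explicitly.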
@@ -393,17 +447,46 @@ def pin_copy_to_device_nonblocking(*tensors):
|
393 | 447 | import torch
|
394 | 448 | from torch.utils.benchmark import Timer
|
395 | 449 |
|
396 |
| -td = TensorDict({str(i): torch.randn(1_000_000) for i in range(100)}) |
| 450 | +for s0 in (100, 1000, 10_000, 1_000_000): |
| 451 | + for s1 in (10, 100, 1000): |
| 452 | + print("\n\n\n\n", s0, s1) |
| 453 | + td = TensorDict({str(i): torch.randn(s0) for i in range(s1)}) |
397 | 454 |
|
398 |
| -print( |
399 |
| - Timer("td.to('cuda:0', non_blocking=False)", globals=globals()).adaptive_autorange() |
400 |
| -) |
401 |
| -print(Timer("td.to('cuda:0')", globals=globals()).adaptive_autorange()) |
402 |
| -print( |
403 |
| - Timer( |
404 |
| - "td.to('cuda:0', non_blocking=True, non_blocking_pin=True)", globals=globals() |
405 |
| - ).adaptive_autorange() |
406 |
| -) |
| 455 | + print( |
| 456 | + Timer("td.to('cuda:0', non_blocking=False)", globals=globals()).adaptive_autorange() |
| 457 | + ) |
| 458 | + print(Timer("td.to('cuda:0')", globals=globals()).adaptive_autorange()) |
| 459 | + print(torch.get_num_threads()) |
| 460 | + print( |
| 461 | + Timer( |
| 462 | + "td.to('cuda:0', non_blocking_pin=True, num_threads=2)", globals=globals() |
| 463 | + ).adaptive_autorange() |
| 464 | + ) |
| 465 | + print( |
| 466 | + Timer( |
| 467 | + "td.to('cuda:0', non_blocking_pin=True, num_threads=4)", globals=globals() |
| 468 | + ).adaptive_autorange() |
| 469 | + ) |
| 470 | + print( |
| 471 | + Timer( |
| 472 | + "td.to('cuda:0', non_blocking_pin=True, num_threads=8)", globals=globals() |
| 473 | + ).adaptive_autorange() |
| 474 | + ) |
| 475 | + print( |
| 476 | + Timer( |
| 477 | + "td.to('cuda:0', non_blocking_pin=True, num_threads=16)", globals=globals() |
| 478 | + ).adaptive_autorange() |
| 479 | + ) |
| 480 | + print( |
| 481 | + Timer( |
| 482 | + "td.to('cuda:0', non_blocking_pin=True, num_threads=32)", globals=globals() |
| 483 | + ).adaptive_autorange() |
| 484 | + ) |
| 485 | + print( |
| 486 | + Timer( |
| 487 | + "td.to('cuda:0', non_blocking_pin=True, num_threads=64)", globals=globals() |
| 488 | + ).adaptive_autorange() |
| 489 | + ) |
407 | 490 |
|
408 | 491 |
|
409 | 492 | ######################################################################
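A side note on the sweep in this hunk: the six `num_threads` measurements are verbatim copies of the same `print`/`Timer` block. A loop-based sketch (assuming the same `td` TensorDict and the `Timer` import from the hunk above) would produce the same measurements with less repetition and make the list of thread counts easy to extend:

# Sketch only: equivalent to the repeated blocks above.
for num_threads in (2, 4, 8, 16, 32, 64):
    stmt = f"td.to('cuda:0', non_blocking_pin=True, num_threads={num_threads})"
    print(num_threads, Timer(stmt, globals=globals()).adaptive_autorange())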
|
|