|
137 | 137 | import torch
|
138 | 138 | import gc
|
139 | 139 | from torch.utils.benchmark import Timer
|
| 140 | +import matplotlib.pyplot as plt |
140 | 141 |
|
def timer(cmd):
    """Run *cmd* with ``torch.utils.benchmark.Timer`` and return the
    median runtime in milliseconds.

    The statement is evaluated against this module's globals, so it can
    reference the benchmark tensors defined at top level.
    """
    measurement = Timer(cmd, globals=globals()).adaptive_autorange()
    return measurement.median * 1000
# Benchmark host-to-device casting for a pageable tensor, a pre-pinned
# tensor, pinning alone, and pin-then-copy. ``timer`` returns milliseconds.
pageable_tensor = torch.randn(1_000_000)

pinned_tensor = torch.randn(1_000_000, pin_memory=True)

pageable_to_device = timer("pageable_tensor.to('cuda:0')")
pinned_to_device = timer("pinned_tensor.to('cuda:0')")
pin_mem = timer("pageable_tensor.pin_memory()")
pin_mem_to_device = timer("pageable_tensor.pin_memory().to('cuda:0')")

# Speed-ups relative to the plain pageable copy (used in the plot labels).
r1 = pinned_to_device / pageable_to_device
r2 = pin_mem_to_device / pageable_to_device
192 | 157 |
|
# Visualize the three casting strategies as a bar chart, then release the
# benchmark tensors.
fig, ax = plt.subplots()

xlabels = ["Pageable Tensor", "Pinned tensor", "Pageable Tensor with pin"]
bar_labels = [
    "pageable_tensor.to(device) (1x)",
    f"pinned_tensor.to(device) ({r1:4.4f}x)",
    f"pageable_tensor.pin_memory().to(device) ({r2:4.4f}x)",
]
values = [pageable_to_device, pinned_to_device, pin_mem_to_device]

ax.bar(xlabels, values, label=bar_labels)

ax.set_ylabel("Runtime (ms)")
ax.set_title("Device casting runtime (pin-memory)")
ax.legend()

plt.show()

del pageable_tensor, pinned_tensor
gc.collect()

######################################################################
#
# We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under
# the hood, a pageable tensor must be copied to pinned memory before being sent to GPU.
#
|
@@ -253,16 +212,22 @@ def copy_to_device_nonblocking(*tensors, display_peak_mem=False):
|
# Compare a blocking batched copy against a non-blocking one over a list of
# small tensors, and chart the two runtimes side by side.
tensors = [torch.randn(1000) for _ in range(1000)]

to_device = timer("copy_to_device(*tensors)")
to_device_nonblocking = timer("copy_to_device_nonblocking(*tensors)")

fig, ax = plt.subplots()

xlabels = ["to(device)", "to(device, non_blocking=True)"]
bar_labels = xlabels
values = [to_device, to_device_nonblocking]

ax.bar(xlabels, values, label=bar_labels)

ax.set_ylabel("Runtime (ms)")
ax.set_title("Device casting runtime (non-blocking)")
ax.legend()

plt.show()


######################################################################
@@ -318,42 +283,44 @@ def pin_copy_to_device_nonblocking(*tensors):
|
318 | 283 | return result
|
# Benchmark three strategies (pageable copy, pinned copy, pin-then-copy),
# each in blocking and non-blocking variants, and plot them as grouped bars.
pin_and_copy = timer("pin_copy_to_device(*tensors)")
pin_and_copy_nb = timer("pin_copy_to_device_nonblocking(*tensors)")

# FIX: the timed statements had unbalanced parentheses (SyntaxError when the
# Timer evaluates them), and the non-blocking "pageable" measurement referenced
# `tensors_pinned`, which does not exist yet at this point. Both must time the
# pageable `tensors` list.
page_copy = timer("copy_to_device(*tensors)")
page_copy_nb = timer("copy_to_device_nonblocking(*tensors)")

tensors_pinned = [torch.randn(1000, pin_memory=True) for _ in range(1000)]

# FIX: the "pinned" measurements must time `tensors_pinned` (the blocking one
# timed the pageable list), and the parentheses are now balanced.
pinned_copy = timer("copy_to_device(*tensors_pinned)")
pinned_copy_nb = timer("copy_to_device_nonblocking(*tensors_pinned)")

strategies = ("pageable copy", "pinned copy", "pin and copy")
blocking = {
    "blocking": [page_copy, pinned_copy, pin_and_copy],
    "non-blocking": [page_copy_nb, pinned_copy_nb, pin_and_copy_nb],
}

x = [0, 1, 2]
width = 0.25
multiplier = 0


fig, ax = plt.subplots(layout="constrained")

for attribute, runtimes in blocking.items():
    offset = width * multiplier
    # FIX: `x` is a plain list, so `x + offset` raised TypeError (list + float);
    # offset each tick position element-wise instead.
    rects = ax.bar([pos + offset for pos in x], runtimes, width, label=attribute)
    ax.bar_label(rects, padding=3)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("Runtime (ms)")
ax.set_title("Runtime (pin-mem and non-blocking)")
# FIX: same list+float issue for the tick positions.
ax.set_xticks([pos + width for pos in x], strategies)
ax.legend(loc="upper left", ncols=3)
ax.set_ylim(0, 250)

plt.show()

del tensors, tensors_pinned
gc.collect()
@@ -447,47 +414,36 @@ def pin_copy_to_device_nonblocking(*tensors):
|
447 | 414 | import torch
|
448 | 415 | from torch.utils.benchmark import Timer
|
449 | 416 |
|
# Benchmark TensorDict.to() casting strategies: blocking copy, non-blocking
# copy, and non-blocking copy after pinning (single- and multi-threaded pin),
# then chart each runtime with its speed-up relative to the blocking copy.
td = TensorDict({str(i): torch.randn(1_000_000) for i in range(100)})

copy_blocking = timer("td.to('cuda:0', non_blocking=False)")
copy_non_blocking = timer("td.to('cuda:0')")
copy_pin_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=0)")
copy_pin_multithread_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=4)")


r1 = copy_non_blocking / copy_blocking
r2 = copy_pin_nb / copy_blocking
r3 = copy_pin_multithread_nb / copy_blocking

fig, ax = plt.subplots()

xlabels = [0, 1, 2, 3]
bar_labels = [
    "Blocking copy (1x)",
    f"Non-blocking copy ({r1:4.4f}x)",
    f"Blocking pin, non-blocking copy ({r2:4.4f}x)",
    f"Non-blocking pin, non-blocking copy ({r3:4.4f}x)",
]
values = [copy_blocking, copy_non_blocking, copy_pin_nb, copy_pin_multithread_nb]

ax.bar(xlabels, values, label=bar_labels)

ax.set_ylabel("Runtime (ms)")
ax.set_title("Device casting runtime")
ax.legend()

plt.show()
|
492 | 448 | ######################################################################
|
493 | 449 | # As a side note, it may be tempting to create everlasting buffers in pinned memory and copy tensors from pageable memory
|
|
0 commit comments