Skip to content

Commit dc86259

Browse files
author
Vincent Moens
committed
amend
1 parent 5ae66ec commit dc86259

File tree

1 file changed

+108
-152
lines changed

1 file changed

+108
-152
lines changed

intermediate_source/pinmem_nonblock.py

Lines changed: 108 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -137,88 +137,47 @@
137137
import torch
138138
import gc
139139
from torch.utils.benchmark import Timer
140+
import matplotlib.pyplot as plt
140141

141-
tensor_pageable = torch.randn(100_000)
142142

143-
tensor_pinned = torch.randn(100_000, pin_memory=True)
143+
def timer(cmd):
144+
return Timer(cmd, globals=globals()).adaptive_autorange().median * 1000
144145

145-
print(
146-
"Regular to(device)",
147-
Timer("tensor_pageable.to('cuda:0')", globals=globals()).adaptive_autorange(),
148-
)
149-
print(
150-
"Pinned to(device)",
151-
Timer("tensor_pinned.to('cuda:0')", globals=globals()).adaptive_autorange(),
152-
)
153-
print(
154-
"pin_memory() along",
155-
Timer("tensor_pageable.pin_memory()", globals=globals()).adaptive_autorange(),
156-
)
157-
print(
158-
"pin_memory() + to(device)",
159-
Timer(
160-
"tensor_pageable.pin_memory().to('cuda:0')", globals=globals()
161-
).adaptive_autorange(),
162-
)
163-
del tensor_pageable, tensor_pinned
164-
gc.collect()
165146

166-
######################################################################
167-
# Another size (TODO: Remove the one less concinving)
168-
tensor_pageable = torch.randn(1_000_000)
147+
pageable_tensor = torch.randn(1_000_000)
169148

170-
tensor_pinned = torch.randn(1_000_000, pin_memory=True)
149+
pinned_tensor = torch.randn(1_000_000, pin_memory=True)
171150

172-
print(
173-
"Regular to(device)",
174-
Timer("tensor_pageable.to('cuda:0')", globals=globals()).adaptive_autorange(),
175-
)
176-
print(
177-
"Pinned to(device)",
178-
Timer("tensor_pinned.to('cuda:0')", globals=globals()).adaptive_autorange(),
179-
)
180-
print(
181-
"pin_memory() along",
182-
Timer("tensor_pageable.pin_memory()", globals=globals()).adaptive_autorange(),
183-
)
184-
print(
185-
"pin_memory() + to(device)",
186-
Timer(
187-
"tensor_pageable.pin_memory().to('cuda:0')", globals=globals()
188-
).adaptive_autorange(),
189-
)
190-
del tensor_pageable, tensor_pinned
191-
gc.collect()
151+
pageable_to_device = timer("pageable_tensor.to('cuda:0')")
152+
pinned_to_device = timer("pinned_tensor.to('cuda:0')")
153+
pin_mem = timer("pageable_tensor.pin_memory()")
154+
pin_mem_to_device = timer("pageable_tensor.pin_memory().to('cuda:0')")
155+
r1 = pinned_to_device / pageable_to_device
156+
r2 = pin_mem_to_device / pageable_to_device
192157

158+
fig, ax = plt.subplots()
193159

194-
######################################################################
195-
# Another size (TODO: Remove the one less concinving)
196-
tensor_pageable = torch.randn(10_000)
160+
xlabels = ["Pageable Tensor", "Pinned tensor", "Pageable Tensor with pin"]
161+
bar_labels = [
162+
"pageable_tensor.to(device) (1x)",
163+
f"pinned_tensor.to(device) ({r1:4.4f}x)",
164+
f"pageable_tensor.pin_memory().to(device) ({r2:4.4f}x)",
165+
]
166+
values = [pageable_to_device, pinned_to_device, pin_mem_to_device]
197167

198-
tensor_pinned = torch.randn(10_000, pin_memory=True)
168+
ax.bar(xlabels, values, label=bar_labels)
199169

200-
print(
201-
"Regular to(device)",
202-
Timer("tensor_pageable.to('cuda:0')", globals=globals()).adaptive_autorange(),
203-
)
204-
print(
205-
"Pinned to(device)",
206-
Timer("tensor_pinned.to('cuda:0')", globals=globals()).adaptive_autorange(),
207-
)
208-
print(
209-
"pin_memory() along",
210-
Timer("tensor_pageable.pin_memory()", globals=globals()).adaptive_autorange(),
211-
)
212-
print(
213-
"pin_memory() + to(device)",
214-
Timer(
215-
"tensor_pageable.pin_memory().to('cuda:0')", globals=globals()
216-
).adaptive_autorange(),
217-
)
218-
del tensor_pageable, tensor_pinned
170+
ax.set_ylabel("Runtime (ms)")
171+
ax.set_title("Device casting runtime (pin-memory)")
172+
ax.legend()
173+
174+
plt.show()
175+
176+
del pageable_tensor, pinned_tensor
219177
gc.collect()
220178

221179
######################################################################
180+
#
222181
# We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under
223182
# the hood, a pageable tensor must be copied to pinned memory before being sent to GPU.
224183
#
@@ -253,16 +212,22 @@ def copy_to_device_nonblocking(*tensors, display_peak_mem=False):
253212

254213

255214
tensors = [torch.randn(1000) for _ in range(1000)]
256-
print(
257-
"Call to `to(device)`",
258-
Timer("copy_to_device(*tensors)", globals=globals()).adaptive_autorange(),
259-
)
260-
print(
261-
"Call to `to(device, non_blocking=True)`",
262-
Timer(
263-
"copy_to_device_nonblocking(*tensors)", globals=globals()
264-
).adaptive_autorange(),
265-
)
215+
to_device = timer("copy_to_device(*tensors)")
216+
to_device_nonblocking = timer("copy_to_device_nonblocking(*tensors)")
217+
218+
fig, ax = plt.subplots()
219+
220+
xlabels = ["to(device)", "to(device, non_blocking=True)"]
221+
bar_labels = xlabels
222+
values = [to_device, to_device_nonblocking]
223+
224+
ax.bar(xlabels, values, label=bar_labels)
225+
226+
ax.set_ylabel("Runtime (ms)")
227+
ax.set_title("Device casting runtime (non-blocking)")
228+
ax.legend()
229+
230+
plt.show()
266231

267232

268233
######################################################################
@@ -318,42 +283,44 @@ def pin_copy_to_device_nonblocking(*tensors):
318283
return result
319284

320285

321-
print("\nCall to `pin_memory()` + `to(device)`")
322-
print(
323-
"pin_memory().to(device)",
324-
Timer("pin_copy_to_device(*tensors)", globals=globals()).adaptive_autorange(),
325-
)
326-
print(
327-
"pin_memory().to(device, non_blocking=True)",
328-
Timer(
329-
"pin_copy_to_device_nonblocking(*tensors)", globals=globals()
330-
).adaptive_autorange(),
331-
)
286+
pin_and_copy = timer("pin_copy_to_device(*tensors)")
287+
pin_and_copy_nb = timer("pin_copy_to_device_nonblocking(*tensors)")
332288

333-
print("\nCall to `to(device)`")
334-
print(
335-
"to(device)",
336-
Timer("copy_to_device(*tensors)", globals=globals()).adaptive_autorange(),
337-
)
338-
print(
339-
"to(device, non_blocking=True)",
340-
Timer(
341-
"copy_to_device_nonblocking(*tensors)", globals=globals()
342-
).adaptive_autorange(),
343-
)
289+
page_copy = timer("copy_to_device(*tensors)")
290+
page_copy_nb = timer("copy_to_device_nonblocking(*tensors)")
344291

345-
print("\nCall to `to(device)` from pinned tensors")
346-
tensors_pinned = [torch.zeros(1000, pin_memory=True) for _ in range(1000)]
347-
print(
348-
"tensor_pinned.to(device)",
349-
Timer("copy_to_device(*tensors_pinned)", globals=globals()).adaptive_autorange(),
350-
)
351-
print(
352-
"tensor_pinned.to(device, non_blocking=True)",
353-
Timer(
354-
"copy_to_device_nonblocking(*tensors_pinned)", globals=globals()
355-
).adaptive_autorange(),
356-
)
292+
tensors_pinned = [torch.randn(1000, pin_memory=True) for _ in range(1000)]
293+
294+
pinned_copy = timer("copy_to_device(*tensors_pinned)")
295+
pinned_copy_nb = timer("copy_to_device_nonblocking(*tensors_pinned)")
296+
297+
strategies = ("pageable copy", "pinned copy", "pin and copy")
298+
blocking = {
299+
"blocking": [page_copy, pinned_copy, pin_and_copy],
300+
"non-blocking": [page_copy_nb, pinned_copy_nb, pin_and_copy_nb],
301+
}
302+
303+
x = torch.arange(3)
304+
width = 0.25
305+
multiplier = 0
306+
307+
308+
fig, ax = plt.subplots(layout="constrained")
309+
310+
for attribute, runtimes in blocking.items():
311+
offset = width * multiplier
312+
rects = ax.bar(x + offset, runtimes, width, label=attribute)
313+
ax.bar_label(rects, padding=3)
314+
multiplier += 1
315+
316+
# Add some text for labels, title and custom x-axis tick labels, etc.
317+
ax.set_ylabel("Runtime (ms)")
318+
ax.set_title("Runtime (pin-mem and non-blocking)")
319+
ax.set_xticks(x + width, strategies)
320+
ax.legend(loc="upper left", ncols=3)
321+
ax.set_ylim(0, 250)
322+
323+
plt.show()
357324

358325
del tensors, tensors_pinned
359326
gc.collect()
@@ -447,47 +414,36 @@ def pin_copy_to_device_nonblocking(*tensors):
447414
import torch
448415
from torch.utils.benchmark import Timer
449416

450-
for s0 in (100, 1000, 10_000, 1_000_000):
451-
for s1 in (10, 100, 1000):
452-
print("\n\n\n\n", s0, s1)
453-
td = TensorDict({str(i): torch.randn(s0) for i in range(s1)})
417+
td = TensorDict({str(i): torch.randn(1_000_000) for i in range(100)})
454418

455-
print(
456-
Timer("td.to('cuda:0', non_blocking=False)", globals=globals()).adaptive_autorange()
457-
)
458-
print(Timer("td.to('cuda:0')", globals=globals()).adaptive_autorange())
459-
print(torch.get_num_threads())
460-
print(
461-
Timer(
462-
"td.to('cuda:0', non_blocking_pin=True, num_threads=2)", globals=globals()
463-
).adaptive_autorange()
464-
)
465-
print(
466-
Timer(
467-
"td.to('cuda:0', non_blocking_pin=True, num_threads=4)", globals=globals()
468-
).adaptive_autorange()
469-
)
470-
print(
471-
Timer(
472-
"td.to('cuda:0', non_blocking_pin=True, num_threads=8)", globals=globals()
473-
).adaptive_autorange()
474-
)
475-
print(
476-
Timer(
477-
"td.to('cuda:0', non_blocking_pin=True, num_threads=16)", globals=globals()
478-
).adaptive_autorange()
479-
)
480-
print(
481-
Timer(
482-
"td.to('cuda:0', non_blocking_pin=True, num_threads=32)", globals=globals()
483-
).adaptive_autorange()
484-
)
485-
print(
486-
Timer(
487-
"td.to('cuda:0', non_blocking_pin=True, num_threads=64)", globals=globals()
488-
).adaptive_autorange()
489-
)
419+
copy_blocking = timer("td.to('cuda:0', non_blocking=False)")
420+
copy_non_blocking = timer("td.to('cuda:0')")
421+
copy_pin_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=0)")
422+
copy_pin_multithread_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=4)")
423+
424+
425+
r1 = copy_non_blocking / copy_blocking
426+
r2 = copy_pin_nb / copy_blocking
427+
r3 = copy_pin_multithread_nb / copy_blocking
428+
429+
fig, ax = plt.subplots()
430+
431+
xlabels = [0, 1, 2, 3]
432+
bar_labels = [
433+
"Blocking copy (1x)",
434+
f"Non-blocking copy ({r1:4.4f}x)",
435+
f"Blocking pin, non-blocking copy ({r2:4.4f}x)",
436+
f"Non-blocking pin, non-blocking copy ({r3:4.4f}x)",
437+
]
438+
values = [copy_blocking, copy_non_blocking, copy_pin_nb, copy_pin_multithread_nb]
439+
440+
ax.bar(xlabels, values, label=bar_labels)
441+
442+
ax.set_ylabel("Runtime (ms)")
443+
ax.set_title("Device casting runtime")
444+
ax.legend()
490445

446+
plt.show()
491447

492448
######################################################################
493449
# As a side note, it may be tempting to create everlasting buffers in pinned memory and copy tensors from pageable memory

0 commit comments

Comments
 (0)