Skip to content

Commit 1717910

Browse files
Merge pull request #15 from LukasHedegaard/cuda-sync
Cuda sync
2 parents 426f81d + 855972f commit 1717910

File tree

4 files changed

+30
-8
lines changed

4 files changed

+30
-8
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ From v1.0.0 and on, the project will adhere strictly to Semantic Versioning.
99

1010
## [Unreleased]
1111

12+
## [0.3.5] - 2023-08-08
13+
### Fixed
14+
- Updated on-GPU model benchmarking with best-practices on `cuda.Event` and `cuda.synchronize`.
15+
- FLOPs measurement error on CUDA.
16+
17+
1218
## [0.3.4] - 2022-02-22
1319

1420
### Fixed

pytorch_benchmark/benchmark.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ def warm_up(
108108
def measure_detailed_inference_timing(
109109
model, sample, model_device, transfer_to_device_fn=torch.Tensor.to
110110
):
111-
112111
try:
113112
with torch.autograd.profiler.profile(
114113
use_cuda=(model_device.type == "cuda"), profile_memory=True
@@ -135,7 +134,6 @@ def measure_repeated_inference_timing(
135134
num_runs=100,
136135
batch_size: int = None,
137136
):
138-
139137
t_c2d = []
140138
t_inf = []
141139
t_d2c = []
@@ -146,14 +144,29 @@ def measure_repeated_inference_timing(
146144
):
147145
start_on_cpu = time()
148146
device_sample = transfer_to_device_fn(sample, model_device)
149-
start_on_device = time()
147+
148+
if model_device.type == "cuda":
149+
start_event = torch.cuda.Event(enable_timing=True)
150+
stop_event = torch.cuda.Event(enable_timing=True)
151+
start_event.record() # For GPU timing
152+
start_on_device = time() # For CPU timing
153+
150154
device_result = model(device_sample)
151-
stop_on_device = time()
155+
156+
if model_device.type == "cuda":
157+
stop_event.record()
158+
torch.cuda.synchronize()
159+
elapsed_on_device = stop_event.elapsed_time(start_event)
160+
stop_on_device = time()
161+
else:
162+
stop_on_device = time()
163+
elapsed_on_device = stop_on_device - start_on_device
164+
152165
transfer_to_device_fn(device_result, "cpu")
153166
stop_on_cpu = time()
154167

155168
t_c2d.append(start_on_device - start_on_cpu)
156-
t_inf.append(stop_on_device - start_on_device)
169+
t_inf.append(elapsed_on_device)
157170
t_d2c.append(stop_on_cpu - stop_on_device)
158171
t_tot.append(stop_on_cpu - start_on_cpu)
159172

@@ -328,7 +341,11 @@ def benchmark(
328341
batch_size=1,
329342
)
330343

331-
flops = measure_flops(model, sample1, print_details)
344+
with torch.no_grad():
345+
flops = measure_flops(
346+
model, transfer_to_device_fn(sample1, model_device), print_details
347+
)
348+
332349
if _is_valid(flops):
333350
results["flops"] = flops
334351
print_fn(f"Model FLOPs: {flops} ({format_num(flops)})")

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def from_file(file_name: str = "requirements.txt", comment_char: str = "#"):
2525

2626
setup(
2727
name="pytorch-benchmark",
28-
version="0.3.4",
28+
version="0.3.5",
2929
description="Easily benchmark PyTorch model FLOPs, latency, throughput, max allocated memory and energy consumption in one go.",
3030
long_description=long_description(),
3131
long_description_content_type="text/markdown",

tests/test_example.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77

88
def test_example():
9-
109
model = efficientnet_b0()
1110

1211
if torch.cuda.is_available():

0 commit comments

Comments
 (0)