Updated on-gpu benchmarking with cuda event & sync

LukasHedegaard · LukasHedegaard · commit 3727ef8a8979 · 2023-08-08T11:53:51.000Z
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,11 @@ From v1.0.0 and on, the project will adherence strictly to Semantic Versioning.
 
 ## [Unreleased]
 
+## [0.3.5] - 2023-08-08
+### Fixed
+- Updated on-gpu model benchmarking with best practices on `cuda.Event` and `cuda.synchronize`.
+
+
 ## [0.3.4] - 2022-02-22
 
 ### Fixed
diff --git a/pytorch_benchmark/benchmark.py b/pytorch_benchmark/benchmark.py
@@ -146,14 +146,29 @@ def measure_repeated_inference_timing(
     ):
         start_on_cpu = time()
         device_sample = transfer_to_device_fn(sample, model_device)
-        start_on_device = time()
+
+        if model_device.type == "cuda":
+            start_event = torch.cuda.Event(enable_timing=True)
+            stop_event = torch.cuda.Event(enable_timing=True)
+            start_event.record()  # For GPU timing
+        start_on_device = time()  # For CPU timing
+
         device_result = model(device_sample)
-        stop_on_device = time()
+
+        if model_device.type == "cuda":
+            stop_event.record()
+            torch.cuda.synchronize()
+            elapsed_on_device = stop_event.elapsed_time(start_event)
+            stop_on_device = time()
+        else:
+            stop_on_device = time()
+            elapsed_on_device = stop_on_device - start_on_device
+
         transfer_to_device_fn(device_result, "cpu")
         stop_on_cpu = time()
 
         t_c2d.append(start_on_device - start_on_cpu)
-        t_inf.append(stop_on_device - start_on_device)
+        t_inf.append(elapsed_on_device)
         t_d2c.append(stop_on_cpu - stop_on_device)
         t_tot.append(stop_on_cpu - start_on_cpu)
 
diff --git a/setup.py b/setup.py
@@ -25,7 +25,7 @@ def from_file(file_name: str = "requirements.txt", comment_char: str = "#"):
 
 setup(
     name="pytorch-benchmark",
-    version="0.3.4",
+    version="0.3.5",
     description="Easily benchmark PyTorch model FLOPs, latency, throughput, max allocated memory and energy consumption in one go.",
     long_description=long_description(),
     long_description_content_type="text/markdown",