Skip to content

Commit 33a17db

Browse files
SkafteNicki and lantiga
authored and committed
Update throughput table to include H200 stats (#21119)
* add h200 support to throughput * add testing * changelog --------- Co-authored-by: Jirka Borovec <[email protected]> (cherry picked from commit e55650d)
1 parent 9288c1f commit 33a17db

File tree

3 files changed

+33
-1
lines changed

3 files changed

+33
-1
lines changed

src/lightning/fabric/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
66

7+
8+
## [unreleased] - YYYY-MM-DD
9+
10+
### Changed
11+
12+
- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#20913](https://github.com/Lightning-AI/pytorch-lightning/pull/21119))
13+
14+
715
---
816

917
## [2.5.3] - 2025-08-DD

src/lightning/fabric/utilities/throughput.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,23 @@ def measure_flops(
304304

305305
_CUDA_FLOPS: dict[str, dict[Union[str, torch.dtype], float]] = {
306306
# Hopper
307+
# source: https://nvdam.widen.net/s/nb5zzzsjdf/hpc-datasheet-sc23-h200-datasheet-3002446
308+
"h200 sxm1": {
309+
torch.float64: 3.4e13,
310+
torch.float32: 6.7e13,
311+
"tfloat32": 9.9e14,
312+
torch.bfloat16: 2.0e15,
313+
torch.float16: 2.0e15,
314+
torch.int8: 4.0e15,
315+
},
316+
"h200 nvl1": {
317+
torch.float64: 3.0e13,
318+
torch.float32: 6.0e13,
319+
"tfloat32": 8.4e14,
320+
torch.bfloat16: 1.7e15,
321+
torch.float16: 1.7e15,
322+
torch.int8: 3.3e15,
323+
},
307324
# source: https://resources.nvidia.com/en-us-tensor-core
308325
"h100 nvl": {
309326
torch.float64: 67e12,
@@ -536,7 +553,12 @@ def get_available_flops(device: torch.device, dtype: Union[torch.dtype, str]) ->
536553
if device.type == "cuda":
537554
device_name = torch.cuda.get_device_name(device)
538555
chip = device_name.lower()
539-
if "h100" in chip:
556+
if "h200" in chip:
557+
if "sxm1" in chip:
558+
chip = "h200 sxm1"
559+
elif "nvl1" in chip:
560+
chip = "h200 nvl1"
561+
elif "h100" in chip:
540562
if "hbm3" in chip:
541563
chip = "h100 sxm"
542564
elif "nvl" in chip:

tests/tests_fabric/utilities/test_throughput.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ def test_get_available_flops(xla_available):
6868
"device_name",
6969
[
7070
# Hopper
71+
"NVIDIA H200 SXM1",
72+
"NVIDIA H200 NVL1",
7173
"h100-nvl", # TODO: switch with `torch.cuda.get_device_name()` result
7274
"h100-hbm3", # TODO: switch with `torch.cuda.get_device_name()` result
7375
"NVIDIA H100 PCIe",

0 commit comments

Comments
 (0)