@@ -2,6 +2,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from contextlib import contextmanager
+from diffsynth_engine.utils.platform import DTYPE_FP8


 def enable_fp8_autocast(module: nn.Module, compute_dtype: torch.dtype = torch.bfloat16, use_fp8_linear: bool = False):
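The new DTYPE_FP8 constant is imported from diffsynth_engine.utils.platform, a module that is not part of this diff. A minimal sketch of how such a platform-dependent constant might be selected, assuming ROCm builds map to the e4m3fnuz variant while CUDA/CPU builds keep the OCP e4m3fn format:

import torch

# Hypothetical sketch only; the real diffsynth_engine.utils.platform is not shown.
# ROCm (CDNA3) hardware implements the "fnuz" FP8 variant (no negative zero,
# no inf, max 240); NVIDIA hardware implements OCP e4m3fn (max 448).
if torch.version.hip is not None:
    DTYPE_FP8 = torch.float8_e4m3fnuz
else:
    DTYPE_FP8 = torch.float8_e4m3fn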
@@ -51,7 +52,7 @@ def enable_fp8_linear(module: nn.Module):
 def _enable_fp8_linear(module: nn.Module):
     if isinstance(module, nn.Linear) and torch.is_floating_point(module.weight.data):
         # avoid conversion for int weights like GGUF
-        module.weight.data = module.weight.data.to(torch.float8_e4m3fn)
+        module.weight.data = module.weight.data.to(DTYPE_FP8)
     for submodule in module.children():
         _enable_fp8_linear(submodule)

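The torch.is_floating_point guard is what skips integer weights such as GGUF-quantized checkpoints, which must not be cast to FP8. A quick standalone illustration (the int8 assignment is contrived for demonstration):

import torch
import torch.nn as nn

lin = nn.Linear(4, 4)
lin.weight.data = torch.zeros(4, 4, dtype=torch.int8)  # stand-in for a GGUF int weight
print(torch.is_floating_point(lin.weight.data))        # False -> conversion skipped
print(torch.is_floating_point(nn.Linear(4, 4).weight.data))  # True -> cast to DTYPE_FP8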
@@ -71,16 +72,24 @@ def fp8_linear(
 ) -> torch.Tensor:
     device = input.device
     origin_dtype = input.dtype
-    input = input.to(torch.float8_e4m3fn)
-    weight = weight.to(torch.float8_e4m3fn)
+    scale_a = 1.0
+    # The maximum representable value of float8_e4m3fnuz (240) is roughly half
+    # that of float8_e4m3fn (448). To avoid overflow and ensure numerical
+    # compatibility during the FP8 matmul, we scale the input down by 2.0 in
+    # advance; the factor is compensated later when scaling the final result.
+    if DTYPE_FP8 == torch.float8_e4m3fnuz:
+        scale_a = 2.0
+        input = input / scale_a
+    input = input.to(DTYPE_FP8)
+    weight = weight.to(DTYPE_FP8)

     if len(input.shape) > 2:
         origin_shape = input.shape
         input = input.reshape(-1, origin_shape[-1])
         result = torch._scaled_mm(
             input,
             weight.T,
-            scale_a=torch.tensor(1.0).to(device=device),
+            scale_a=torch.tensor(scale_a).to(device=device),
             scale_b=torch.tensor(1.0).to(device=device),
             bias=bias,
             out_dtype=origin_dtype,
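Because the scales passed to torch._scaled_mm multiply the accumulated product, passing scale_a = 2.0 exactly undoes the earlier division by 2.0, as the comment in the hunk above describes. A standalone numeric check of that round trip (illustration only; it does not call _scaled_mm and needs a PyTorch build with float8_e4m3fnuz support):

import torch

# 320.0 fits in float8_e4m3fn (max 448) but overflows float8_e4m3fnuz (max 240).
x = torch.tensor([320.0])
halved = (x / 2.0).to(torch.float8_e4m3fnuz)  # 160.0 is exactly representable
restored = 2.0 * halved.to(torch.float32)     # the scale_a compensation
print(restored)                               # tensor([320.])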
@@ -91,7 +100,7 @@ def fp8_linear(
         result = torch._scaled_mm(
             input,
             weight.T,
-            scale_a=torch.tensor(1.0).to(device=device),
+            scale_a=torch.tensor(scale_a).to(device=device),
             scale_b=torch.tensor(1.0).to(device=device),
             bias=bias,
             out_dtype=origin_dtype,
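For context, a hypothetical end-to-end usage sketch. Only the function names and signatures come from the patched file; the import path, the model, and the assumption that enable_fp8_autocast with use_fp8_linear=True converts Linear weights and routes matmuls through fp8_linear are illustrative:

import torch
import torch.nn as nn

# Hypothetical import path; enable_fp8_autocast is defined in the file patched above.
from diffsynth_engine.models.utils.fp8_linear import enable_fp8_autocast

model = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64)).cuda()
enable_fp8_autocast(model, compute_dtype=torch.bfloat16, use_fp8_linear=True)
# Linear weights are now stored as DTYPE_FP8; forward matmuls go through
# fp8_linear, whose torch._scaled_mm call needs FP8-capable hardware
# (e.g. NVIDIA Hopper/Ada or AMD CDNA3).
y = model(torch.randn(2, 64, device="cuda", dtype=torch.bfloat16))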