Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7c72ee0
Initial refactor
Micky774 Sep 5, 2025
45e6236
Merge branch 'dev' into zain/triton-dispatch
Micky774 Sep 5, 2025
5b2ea1c
Minor API correction
Micky774 Sep 5, 2025
077b8fc
Corrected atomic behavior
Micky774 Sep 5, 2025
0011e5f
API update
Micky774 Sep 5, 2025
5ada1bd
Merge branch 'dev' into zain/triton-dispatch
Micky774 Sep 8, 2025
26298c8
Formatting
Micky774 Sep 8, 2025
cc02444
Merge branch 'dev' into zain/triton-dispatch
Micky774 Jan 28, 2026
18eb6e7
Added skip for failing HIP kernels
Micky774 Jan 28, 2026
a72b507
Updated to account for alignment args
Micky774 Jan 28, 2026
a64b5f1
Updated CI script for MI350 runs, minor code cleaning
Micky774 Jan 28, 2026
1d2554c
Streamlined implementation
Micky774 Jan 28, 2026
fd59057
Corrected alignment calculation
Micky774 Jan 28, 2026
bbd4240
Add copyright
Micky774 Jan 28, 2026
92aecf2
Updated alignment calculation
Micky774 Jan 28, 2026
12bb156
Corrected FP8_CS handling
Micky774 Jan 29, 2026
1925039
Corrected layernorm memory access bug
Micky774 Jan 30, 2026
6f9b6c5
Corrected amax dims
Micky774 Jan 30, 2026
dc3ed87
Adjusted amax init
Micky774 Jan 30, 2026
24fbcc4
Updated file names, and copyright
Micky774 Feb 6, 2026
0d6d00f
Corrected MXFP8 testing behavior
Micky774 Feb 6, 2026
499d14b
Update copyright, clarify test, clean imports
Micky774 Feb 9, 2026
c526c8d
Updated test script to respect renaming
Micky774 Feb 9, 2026
26cd12c
Merge branch 'dev' into zain/triton-dispatch
Micky774 Feb 9, 2026
d031266
Update copyrights, clean import
Micky774 Feb 10, 2026
8a5a786
Copyrights
Micky774 Feb 10, 2026
5fc9c49
Merge branch 'dev' into zain/triton-dispatch
Micky774 Feb 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions tests/pytorch/triton_kernels/test_norms.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,11 @@
)
from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer, Float8Tensor
from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Quantizer, MXFP8Tensor
from transformer_engine.pytorch.triton_kernels.rmsnorm import (
te_rmsnorm_bwd_triton,
te_rmsnorm_fwd_triton,
)
from transformer_engine.pytorch.triton_kernels.layernorm import (
from transformer_engine.pytorch.triton_kernels.norms import (
te_layernorm_bwd_triton,
te_layernorm_fwd_triton,
te_rmsnorm_bwd_triton,
te_rmsnorm_fwd_triton,
)
from test_common import dtype_tols, te_compare_results, str_to_torch_dtype, fill_uniform

Expand Down
8 changes: 6 additions & 2 deletions transformer_engine/pytorch/module/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,12 @@
_use_cudnn_mxfp8_norm = bool(int(os.getenv("NVTE_CUDNN_MXFP8_NORM", "0")))

if IS_HIP_EXTENSION:
from ..triton_kernels.layernorm import te_layernorm_fwd_triton, te_layernorm_bwd_triton
from ..triton_kernels.rmsnorm import te_rmsnorm_bwd_triton, te_rmsnorm_fwd_triton
from ..triton_kernels.norms import (
te_layernorm_fwd_triton,
te_layernorm_bwd_triton,
te_rmsnorm_fwd_triton,
te_rmsnorm_bwd_triton
)

def _get_normalization_func(normalization: str, forward: bool):
use_rmsnorm_triton = bool( int(os.environ.get('NVTE_USE_RMSNORM_TRITON', '0')) ) and IS_HIP_EXTENSION
Expand Down
3 changes: 1 addition & 2 deletions transformer_engine/pytorch/module/layernorm_linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@
)

if IS_HIP_EXTENSION:
from ..triton_kernels.layernorm import te_layernorm_bwd_triton
from ..triton_kernels.rmsnorm import te_rmsnorm_bwd_triton
from ..triton_kernels.norms import te_layernorm_bwd_triton, te_rmsnorm_bwd_triton

from ..rocm_utils import create_fp8_weight_transpose_cache, clear_fp8_weight_transpose_cache

Expand Down
3 changes: 1 addition & 2 deletions transformer_engine/pytorch/module/layernorm_mlp.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copyright date

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, along with a few others I missed.

Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@
)

if IS_HIP_EXTENSION:
from ..triton_kernels.layernorm import te_layernorm_bwd_triton
from ..triton_kernels.rmsnorm import te_rmsnorm_bwd_triton
from ..triton_kernels.norms import te_layernorm_bwd_triton, te_rmsnorm_bwd_triton

from ..rocm_utils import create_fp8_weight_transpose_cache, clear_fp8_weight_transpose_cache

Expand Down
2 changes: 1 addition & 1 deletion transformer_engine/pytorch/ops/basic/layer_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from transformer_engine_torch import layernorm_bwd, layernorm_fwd
from torch.utils.cpp_extension import IS_HIP_EXTENSION
if IS_HIP_EXTENSION:
from ...triton_kernels.layernorm import te_layernorm_fwd_triton, te_layernorm_bwd_triton
from ...triton_kernels.norms import te_layernorm_fwd_triton, te_layernorm_bwd_triton
from ...fp8 import FP8GlobalStateManager
from ...tensor import QuantizedTensor
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it needed for this PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it was left over by accident. Removed.

from ...constants import TE_DType
Expand Down
2 changes: 1 addition & 1 deletion transformer_engine/pytorch/ops/basic/rmsnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from transformer_engine_torch import rmsnorm_bwd, rmsnorm_fwd
from torch.utils.cpp_extension import IS_HIP_EXTENSION
if IS_HIP_EXTENSION:
from ...triton_kernels.rmsnorm import (
from ...triton_kernels.norms import (
te_rmsnorm_bwd_triton,
te_rmsnorm_fwd_triton
)
Expand Down
Loading