ROCm
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
Lines changed: 3 additions & 0 deletions
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
Lines changed: 5 additions & 5 deletions
diff --git a/include/ck/utility/mxf4_utils.hpp b/include/ck/utility/mxf4_utils.hpp
Lines changed: 4 additions & 6 deletions
diff --git a/include/ck/utility/mxfp_utils.hpp b/include/ck/utility/mxfp_utils.hpp
Lines changed: 2 additions & 2 deletions
diff --git a/include/ck/utility/scaled_type_convert.hpp b/include/ck/utility/scaled_type_convert.hpp
Lines changed: 26 additions & 204 deletions
@@ -245,6 +245,9 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
 
+// workaround: compiler issue on gfx950
+#define CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION 1
+
 // denorm test fix, necessary for gfx90a
 #ifndef CK_GFX90A_DENORM_WORKAROUND
 #define CK_GFX90A_DENORM_WORKAROUND 0
 
@@ -36,22 +36,22 @@ struct f4x2_pk_t
 {
     using type = uint8_t;
     type data;
-    f4x2_pk_t() : data{type{}} {}
-    f4x2_pk_t(type init) : data{init} {}
+    __host__ __device__ f4x2_pk_t() : data{type{}} {}
+    __host__ __device__ f4x2_pk_t(type init) : data{init} {}
 
     template <index_t I>
     __host__ __device__ inline type unpack(Number<I>) const
     {
         static_assert(I < 2, "Index is out of range.");
         if constexpr(I == 0)
-            return data & 0b00001111;
-        else
             return (data >> 4);
+        else
+            return data & 0b00001111;
     }
 
     __host__ __device__ inline type pack(const type x0, const type x1)
     {
-        return (x1 << 4) | (x0 & 0b00001111);
+        return (x0 << 4) | (x1 & 0b00001111);
     }
 };
 
 
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef CK_CODE_GEN_RTC
 #pragma once
@@ -14,7 +14,7 @@ __host__ __device__ inline bool is_nan<f4_t>(e8m0_bexp_t const scale,
                                              f4_t const dataBytes [[maybe_unused]])
 {
     // no need to check for data as it does not have NaN representation
-    return scale == NumericLimits<e8m0_bexp_t>::QuietNaN();
+    return scale.is_nan();
 }
 
 // no infinity representation in ocp_e2m1_mxfp4 will always return false
@@ -27,11 +27,9 @@ __host__ __device__ inline bool is_inf<f4_t>(e8m0_bexp_t const scale [[maybe_unu
 }
 
 template <>
-__host__ __device__ inline bool is_zero<f4_t>(e8m0_bexp_t const scale, f4_t const data)
+__host__ __device__ inline bool is_zero<f4_t>(e8m0_bexp_t const scale [[maybe_unused]],
+                                              f4_t const data)
 {
-    if(is_nan<f4_t>(scale, data))
-        return false;
-
     // no need to check for scale as it does not have a 0 representation
     f4_t result = (data & 0b00001111) & NumericUtils<f4_t>::set_sign_mask;
 
 
@@ -99,7 +99,7 @@ template <typename T>
 __host__ __device__ T sat_convert_to_type_sr(float value, uint32_t seed);
 
 template <typename T>
-inline T convert_to_type(float value)
+__host__ __device__ inline T convert_to_type(float value)
 {
     using bitwise_type = typename NumericUtils<T>::bitwise_type;
 
@@ -258,7 +258,7 @@ inline T convert_to_type(float value)
 }
 
 template <typename T>
-inline T convert_to_type_sr(float value, uint32_t seed)
+__host__ __device__ inline T convert_to_type_sr(float value, uint32_t seed)
 {
     if(std::abs(value) > NumericLimits<T>::Max())
     {
 
@@ -377,12 +377,15 @@ inline __host__ __device__ float2_t scaled_type_convert<float2_t, f4x2_t>(e8m0_b
         f4x2_t f4x2_array[4];
     } value{};
     value.f4x2_array[0] = x;
-    return __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.bitwise, type_convert<float>(scale), 0);
+    float2_t tmp =
+        __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.bitwise, type_convert<float>(scale), 0);
+    // permute high bits and low bits to match the order of the original vector
+    return float2_t{tmp[1], tmp[0]};
 #else
     float2_t ret{utils::to_float<f4_t>(
-                     scale, x.template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{})),
+                     scale, x.template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{})),
                  utils::to_float<f4_t>(
-                     scale, x.template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}))};
+                     scale, x.template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}))};
     return ret;
 #endif
 }
@@ -398,109 +401,16 @@ inline __host__ __device__ float32_t scaled_type_convert<float32_t, f4x32_t>(e8m
         f4x32_t f4x32_array;
         f4x2_t fp4x2[16];
     } value{x};
-    union
-    {
-        uint32_t bitwise;
-        f4x2_t f4x2_array[4];
-    } bitwise_value{};
     float2_t op;
     float32_t ret;
-    // TODO: pack in a loop
-    bitwise_value.f4x2_array[0] = value.fp4x2[0];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[0] = op[0];
-    ret[1] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[1];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[2] = op[0];
-    ret[3] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[2];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[4] = op[0];
-    ret[5] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[3];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[6] = op[0];
-    ret[7] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[4];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[8] = op[0];
-    ret[9] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[5];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[10] = op[0];
-    ret[11] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[6];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[12] = op[0];
-    ret[13] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[7];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[14] = op[0];
-    ret[15] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[8];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[16] = op[0];
-    ret[17] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[9];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[18] = op[0];
-    ret[19] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[10];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[20] = op[0];
-    ret[21] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[11];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[22] = op[0];
-    ret[23] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[12];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[24] = op[0];
-    ret[25] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[13];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[26] = op[0];
-    ret[27] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[14];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[28] = op[0];
-    ret[29] = op[1];
-
-    bitwise_value.f4x2_array[0] = value.fp4x2[15];
-    op                          = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
-        bitwise_value.bitwise, type_convert<float>(scale), 0);
-    ret[30] = op[0];
-    ret[31] = op[1];
+    float f_scale = type_convert<float>(scale);
+
+    ck::static_for<0, 32 / 2, 1>{}([&](auto idx) {
+        op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[idx], f_scale, 0);
+        // permute high bits and low bits to match the order of the original vector
+        ret[2 * idx]     = op[1];
+        ret[2 * idx + 1] = op[0];
+    });
 
     return ret;
 #else
@@ -515,106 +425,18 @@ inline __host__ __device__ float32_t scaled_type_convert<float32_t, f4x32_t>(e8m
         f4x2_t f4x2_array[16];
         f4x32_t f4x32_array;
     } f4_values{bit_cast<__uint128_t>(x)};
-    // TODO: pack in a loop
-    float_values.float_array[0] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[0].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[1] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[0].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[2] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[1].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[3] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[1].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[4] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[2].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[5] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[2].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[6] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[3].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[7] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[3].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-
-    float_values.float_array[0] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[4].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[1] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[4].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[2] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[5].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[3] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[5].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[4] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[6].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[5] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[6].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[6] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[7].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[7] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[7].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-
-    float_values.float_array[0] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[8].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[1] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[8].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[2] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[9].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[3] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[9].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[4] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[10].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[5] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[10].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[6] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[11].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[7] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[11].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-
-    float_values.float_array[0] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[12].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[1] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[12].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[2] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[13].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[3] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[13].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[4] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[14].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[5] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[14].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
-    float_values.float_array[6] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[15].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}));
-    float_values.float_array[7] = utils::to_float<f4_t>(
-        scale,
-        f4_values.f4x2_array[15].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{}));
+
+    ck::static_for<0, 32 / 2, 1>{}([&](auto idx) {
+        float_values.float_array[2 * idx] = utils::to_float<f4_t>(
+            scale,
+            f4_values.f4x2_array[idx].template AsType<f4x2_pk_t>()[Number<0>{}].template unpack<>(
+                Number<0>{}));
+
+        float_values.float_array[2 * idx + 1] = utils::to_float<f4_t>(
+            scale,
+            f4_values.f4x2_array[idx].template AsType<f4x2_pk_t>()[Number<0>{}].template unpack<>(
+                Number<1>{}));
+    });
 
     return float_values.float32_array;
 #endif
Original file line number	Diff line number	Diff line change
`@@ -36,22 +36,22 @@ struct f4x2_pk_t`
`36`	`36`	`{`
`37`	`37`	`using type = uint8_t;`
`38`	`38`	`type data;`
`39`		`- f4x2_pk_t() : data{type{}} {}`
`40`		`- f4x2_pk_t(type init) : data{init} {}`
	`39`	`+ __host__ __device__ f4x2_pk_t() : data{type{}} {}`
	`40`	`+ __host__ __device__ f4x2_pk_t(type init) : data{init} {}`
`41`	`41`
`42`	`42`	`template <index_t I>`
`43`	`43`	`__host__ __device__ inline type unpack(Number<I>) const`
`44`	`44`	`{`
`45`	`45`	`static_assert(I < 2, "Index is out of range.");`
`46`	`46`	`if constexpr(I == 0)`
`47`		`- return data & 0b00001111;`
`48`		`- else`
`49`	`47`	`return (data >> 4);`
	`48`	`+ else`
	`49`	`+ return data & 0b00001111;`
`50`	`50`	`}`
`51`	`51`
`52`	`52`	`__host__ __device__ inline type pack(const type x0, const type x1)`
`53`	`53`	`{`
`54`		`- return (x1 << 4) \| (x0 & 0b00001111);`
	`54`	`+ return (x0 << 4) \| (x1 & 0b00001111);`
`55`	`55`	`}`
`56`	`56`	`};`
`57`	`57`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`// SPDX-License-Identifier: MIT`
`2`		`-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.`
	`2`	`+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.`
`3`	`3`
`4`	`4`	`#ifndef CK_CODE_GEN_RTC`
`5`	`5`	`#pragma once`
`@@ -14,7 +14,7 @@ __host__ __device__ inline bool is_nan<f4_t>(e8m0_bexp_t const scale,`
`14`	`14`	`f4_t const dataBytes [[maybe_unused]])`
`15`	`15`	`{`
`16`	`16`	`// no need to check for data as it does not have NaN representation`
`17`		`- return scale == NumericLimits<e8m0_bexp_t>::QuietNaN();`
	`17`	`+ return scale.is_nan();`
`18`	`18`	`}`
`19`	`19`
`20`	`20`	`// no infinity representation in ocp_e2m1_mxfp4 will always return false`
`@@ -27,11 +27,9 @@ __host__ __device__ inline bool is_inf<f4_t>(e8m0_bexp_t const scale [[maybe_unu`
`27`	`27`	`}`
`28`	`28`
`29`	`29`	`template <>`
`30`		`-__host__ __device__ inline bool is_zero<f4_t>(e8m0_bexp_t const scale, f4_t const data)`
	`30`	`+__host__ __device__ inline bool is_zero<f4_t>(e8m0_bexp_t const scale [[maybe_unused]],`
	`31`	`+ f4_t const data)`
`31`	`32`	`{`
`32`		`- if(is_nan<f4_t>(scale, data))`
`33`		`- return false;`
`34`		`-`
`35`	`33`	`// no need to check for scale as it does not have a 0 representation`
`36`	`34`	`f4_t result = (data & 0b00001111) & NumericUtils<f4_t>::set_sign_mask;`
`37`	`35`
Original file line number	Diff line number	Diff line change
`@@ -99,7 +99,7 @@ template <typename T>`
`99`	`99`	`__host__ __device__ T sat_convert_to_type_sr(float value, uint32_t seed);`
`100`	`100`
`101`	`101`	`template <typename T>`
`102`		`-inline T convert_to_type(float value)`
	`102`	`+__host__ __device__ inline T convert_to_type(float value)`
`103`	`103`	`{`
`104`	`104`	`using bitwise_type = typename NumericUtils<T>::bitwise_type;`
`105`	`105`
`@@ -258,7 +258,7 @@ inline T convert_to_type(float value)`
`258`	`258`	`}`
`259`	`259`
`260`	`260`	`template <typename T>`
`261`		`-inline T convert_to_type_sr(float value, uint32_t seed)`
	`261`	`+__host__ __device__ inline T convert_to_type_sr(float value, uint32_t seed)`
`262`	`262`	`{`
`263`	`263`	`if(std::abs(value) > NumericLimits<T>::Max())`
`264`	`264`	`{`