 # fmt: on
 
 
-def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int:
-    """Exact port of ov::f16_to_f8e4m3_bits for a single float16 bit-pattern."""
+def f16_to_f8e4m3_bits_numpy(x: np.ndarray) -> np.ndarray:
+    """
+    Convert an array of f16 bit patterns (uint16) to f8e4m3 bit
+    patterns (uint8) using a fully vectorized NumPy port of
+    _f16_to_f8e4m3_bits_scalar.
+    """
     # f16 layout
-    f16_s_mask = 0x8000
-    f16_e_mask = 0x7C00
+    f16_s_mask = np.uint16(0x8000)
+    f16_e_mask = np.uint16(0x7C00)
     f16_e_bias = 15
     f16_e_size = 5
-    f16_m_mask = 0x03FF
+    f16_m_mask = np.uint16(0x03FF)
     f16_m_size = 10
 
     # f8 e4m3 layout
     f8e4m3_e_size = 4
-    f8e4m3_e_mask = 0x78
+    f8e4m3_e_mask = np.uint16(0x78)
     f8e4m3_e_bias = 7
     f8e4m3_e_max = 0x0F
     f8e4m3_m_size = 3
-    f8e4m3_m_mask = 0x07
+    f8e4m3_m_mask = np.uint16(0x07)
 
     byte_shift = 8
 
     # f8 masks in uint16 domain
-    f8_e_mask = f8e4m3_e_mask << byte_shift  # 0x7800
-    f8_m_mask = f8e4m3_m_mask << byte_shift  # 0x0700
-    f8_m_hidden_one_mask = 0x0800  # hidden 1 for subnormals
+    f8_e_mask = np.uint16(f8e4m3_e_mask << byte_shift)  # 0x7800
+    f8_m_mask = np.uint16(f8e4m3_m_mask << byte_shift)  # 0x0700
+    f8_m_hidden_one_mask = np.uint16(0x0800)  # hidden 1 for subnormals
 
-    # rounding constants (same as C++)
-    round_half = 0x01FF
-    round_norm = 0x007F
-    round_even = 0x0080
-    round_odd = 0x0180
+    # rounding constants
+    round_half = np.uint16(0x01FF)
+    round_norm = np.uint16(0x007F)
+    round_even = np.uint16(0x0080)
+    round_odd = np.uint16(0x0180)
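+    # Together these implement round-half-to-even in the uint16 domain:
+    # the kept f8 mantissa sits in bits 8..10, bit 7 (round_even) is the
+    # half/tie bit, and adding round_even carries into bit 8 only when
+    # bit 7 is already set; (frac & round_half) == round_odd detects an
+    # exact tie with an odd mantissa LSB, round_norm any bits below the
+    # tie bit.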
 
     # min exponent for which subnormals are representable
     f8_e_subnormal_min = -10
 
-    inp = int(h_bits) & 0xFFFF
-
     # sign bit: f16 sign -> f8 sign position (bit 15 -> bit 7)
-    f8_bits = (inp & f16_s_mask) >> byte_shift
-
-    f16_e_field = inp & f16_e_mask
-
-    if f16_e_field == f16_e_mask:
-        # f16 NaN / Inf -> f8 NaN (no Inf)
-        f8_bits |= f8e4m3_e_mask | f8e4m3_m_mask
-    elif f16_e_field != 0:
-        # normalized f16
-        f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias)
-        # *** IMPORTANT FIX: shift by (f16_e_size - f8e4m3_e_size) = 5 - 4 = 1 ***
-        fractional = (inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)
-
-        # normalized f8 part (exp >= 0)
-        if f8_biased_exp >= 0:
-            if (fractional & round_half) == round_odd or (fractional & round_norm) != 0:
-                fractional += round_even
-            if (fractional & f8_e_mask) != 0:
-                f8_biased_exp += 1
-            fractional &= f8_m_mask
-
-        # now set exponent & mantissa
-        if f8_biased_exp > f8e4m3_e_max:
-            # overflow -> NaN (no Inf)
-            f8_bits |= f8e4m3_e_mask | f8e4m3_m_mask
-        elif f8_biased_exp > 0:
-            # normalized f8
-            exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size
-            f8_bits |= exp_field
-            f8_bits |= fractional >> byte_shift
-        else:
-            # subnormal f8
-            fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size))
-            f8_exp = f8_biased_exp - f8e4m3_e_bias
-            shift = 1 - f8_exp
-            sticky_mask = 0 if f8_exp < f8_e_subnormal_min else ((1 << shift) - 1)
-            sticky = 1 if (fractional & sticky_mask) != 0 else 0
-
-            fractional = 0 if f8_exp < f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp))
-
-            if (
-                ((fractional & round_half) == round_odd and sticky == 0)
-                or (fractional & round_norm) != 0
-                or sticky != 0
-            ):
-                fractional += round_even
-
-            f8_bits |= fractional >> byte_shift
-    else:
-        # f16 zero / subnormal -> sign + zero exponent/mantissa
-        # (f8_bits already contains the sign)
-        pass
-
-    return f8_bits & 0xFF
-
-
-_f16_to_f8e4m3_bits_vec = np.vectorize(_f16_to_f8e4m3_bits_scalar, otypes=[np.uint8])
-
-
-def fp32_to_fp8e4m3_values(x: np.ndarray) -> np.ndarray:
+    f8_bits = ((x & f16_s_mask) >> byte_shift).astype(np.uint16)
+
+    f16_e_field = x & f16_e_mask
+    is_naninf = f16_e_field == f16_e_mask
+    is_zero = f16_e_field == 0
+    is_normal = (~is_naninf) & (~is_zero)
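+    # is_naninf / is_zero / is_normal partition every lane exactly once;
+    # each case below writes only its own lanes via np.where and leaves
+    # the rest untouched.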
+
+    nan_pattern = np.uint16(f8e4m3_e_mask | f8e4m3_m_mask)
+
+    # --- Case 1: f16 NaN / Inf -> f8 NaN (no Inf) ---
+    f8_bits = np.where(is_naninf, f8_bits | nan_pattern, f8_bits)
+
+    # --- Case 2: normalized f16 ---
+    # f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias)
+    f8_biased_exp = (f16_e_field >> f16_m_size).astype(np.int32) - (f16_e_bias - f8e4m3_e_bias)
+
+    # fractional = (inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)
+    fractional_norm = ((x & f16_m_mask) << (f16_e_size - f8e4m3_e_size)).astype(np.uint16)
+
+    exp_ge0 = (f8_biased_exp >= 0) & is_normal
+
+    # Rounding for normalized part (exp >= 0)
+    # if (fractional & round_half) == round_odd or (fractional & round_norm) != 0:
+    cond_round_norm = (((fractional_norm & round_half) == round_odd) | ((fractional_norm & round_norm) != 0)) & exp_ge0
+
+    # fractional += round_even where cond_round_norm
+    frac_tmp = fractional_norm.astype(np.uint32) + np.where(cond_round_norm, round_even, np.uint16(0)).astype(np.uint32)
+    fractional_norm = (frac_tmp & 0xFFFF).astype(np.uint16)
+
+    # if (fractional & f8_e_mask) != 0: f8_biased_exp += 1
+    exp_inc = np.where(exp_ge0 & ((fractional_norm & f8_e_mask) != 0), 1, 0).astype(np.int32)
+    f8_biased_exp_after = f8_biased_exp + exp_inc
+
+    # fractional &= f8_m_mask
+    fractional_norm &= f8_m_mask
+
+    # Overflow / normalized / subnormal classification
+    overflow_mask = is_normal & (f8_biased_exp_after > f8e4m3_e_max)
+    normal_mask = is_normal & (f8_biased_exp_after > 0) & (~overflow_mask)
+    # For subnormals the scalar code uses f8_biased_exp after a possible increment,
+    # but the increment is only applied when exp >= 0, so the exp <= 0 path is unchanged.
+    subnormal_mask = is_normal & (f8_biased_exp_after <= 0) & (~overflow_mask)
+
+    # --- Overflow -> NaN ---
+    f8_bits = np.where(overflow_mask, f8_bits | nan_pattern, f8_bits)
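+    # e.g. the largest finite f8e4m3 magnitude is 448.0, so any lane that
+    # rounds past it lands here as NaN (the format has no Inf encoding)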
+
+    # --- Normalized f8 ---
+    # exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size
+    exp_field = ((f8_biased_exp_after & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size).astype(np.uint16)
+    mant_norm = (fractional_norm >> byte_shift).astype(np.uint16)
+
+    f8_bits_norm = f8_bits | exp_field | mant_norm
+    f8_bits = np.where(normal_mask, f8_bits_norm, f8_bits)
+
+    # --- Subnormal f8 ---
+    # fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size))
+    fractional_sub = f8_m_hidden_one_mask | ((x & f16_m_mask) << (f16_e_size - f8e4m3_e_size))
+
+    # f8_exp = f8_biased_exp - f8e4m3_e_bias
+    f8_exp = (f8_biased_exp_after - f8e4m3_e_bias).astype(np.int32)
+
+    # shift = 1 - f8_exp
+    shift = 1 - f8_exp
+
+    # sticky_mask = 0 if f8_exp < f8_e_subnormal_min else ((1 << shift) - 1)
+    # we avoid invalid shifts by clipping / masking
+    valid_sub = f8_exp >= f8_e_subnormal_min
+    shift_pos = np.maximum(shift, 0)
+    sticky_mask32 = np.where(valid_sub, (np.uint32(1) << shift_pos) - 1, 0).astype(np.uint32)
+    sticky_mask16 = (sticky_mask32 & np.uint32(0xFFFF)).astype(np.uint16)
+
+    # sticky = 1 if (fractional & sticky_mask) != 0 else 0
+    sticky = ((fractional_sub & sticky_mask16) != 0) & valid_sub
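+    # sticky records low-order ones that the shift below pushes at or
+    # below the tie position; it matters only for tie disambiguation,
+    # since adding round_even carries into the mantissa only when the
+    # half bit is already set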
+
+    # fractional = 0 if f8_exp < f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp))
+    shift2 = 1 - f8_biased_exp_after
+    shift2_pos = np.maximum(shift2, 0)
+    frac_shifted = (fractional_sub.astype(np.uint32) >> shift2_pos).astype(np.uint16)
+    frac_shifted = np.where(valid_sub, frac_shifted, np.uint16(0))
+
+    # Rounding for subnormal:
+    # if (((fractional & round_half) == round_odd and sticky == 0)
+    #     or (fractional & round_norm) != 0
+    #     or sticky != 0):
+    cond_round_sub = (
+        (((frac_shifted & round_half) == round_odd) & (~sticky)) | ((frac_shifted & round_norm) != 0) | sticky
+    ) & subnormal_mask
+
+    frac_tmp_sub = frac_shifted.astype(np.uint32) + np.where(cond_round_sub, round_even, np.uint16(0)).astype(np.uint32)
+    fractional_sub_final = (frac_tmp_sub & 0xFFFF).astype(np.uint16)
+
+    mant_sub = (fractional_sub_final >> byte_shift).astype(np.uint16)
+    f8_bits = np.where(subnormal_mask, f8_bits | mant_sub, f8_bits)
+
+    # Case: f16 zero / subnormal -> sign + zero exponent/mantissa.
+    # Already handled by initialization: is_zero entries are never touched.
+
+    return (f8_bits & np.uint16(0x00FF)).astype(np.uint8)
+
+
+def fp32_to_fp8e4m3(x: np.ndarray) -> np.ndarray:
     """
     Bit-exact to ov::float8_e4m3(float):
     float32 -> float16 -> f8e4m3 bits -> float via LUT
@@ -141,7 +184,7 @@ def fp32_to_fp8e4m3_values(x: np.ndarray) -> np.ndarray:
     x_f16 = x.astype(np.float16)
     h_bits = x_f16.view(np.uint16)
 
-    f8_bits = _f16_to_f8e4m3_bits_vec(h_bits)
+    f8_bits = f16_to_f8e4m3_bits_numpy(h_bits)
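+    # one vectorized pass over the whole array, replacing the former
+    # per-element np.vectorize loop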
 
     # Decode exactly like C++: LUT for magnitude + sign bit
     idx = f8_bits & 0x7F
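
Since the port is meant to be bit-exact, the cheapest regression test is an
exhaustive sweep over all 65536 f16 bit patterns, comparing against the scalar
reference. A minimal sketch, assuming a copy of the removed
_f16_to_f8e4m3_bits_scalar is kept alongside the new function:

    import numpy as np

    all_bits = np.arange(0x10000, dtype=np.uint16)  # every possible f16 pattern
    vec = f16_to_f8e4m3_bits_numpy(all_bits)
    ref = np.array([_f16_to_f8e4m3_bits_scalar(int(b)) for b in all_bits], dtype=np.uint8)
    assert np.array_equal(vec, ref)  # all NaNs map to one payload, so plain equality works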