
Commit 45e7dba

Simplify arith.scaling_truncf to just do division and truncation. Denorm flushing on the input should be carried out using the specified fastMath flag. Scales are assumed to be normalized and clamped.
1 parent 229f6b8 commit 45e7dba

3 files changed: +42 −132 lines changed


mlir/include/mlir/Dialect/Arith/IR/ArithOps.td

Lines changed: 13 additions & 15 deletions
@@ -1354,6 +1354,10 @@ def Arith_ScalingTruncFOp
     both scales and the input operand to be of the same shape and, therefore,
     makes the operation elementwise. Scales are usually calculated per block
     following the OCP MXFP spec as described in https://arxiv.org/abs/2310.10537.
+    Users are required to normalize and clamp the scales as necessary before
+    passing them to this operation. The OCP MXFP spec also flushes denorms on
+    the input operand, which should be handled during lowering by passing an
+    appropriate fastMath flag to this operation.
 
     If scales are calculated per block where blockSize != 1, scales may require
     broadcasting to make this operation elementwise. For example, let's say the
@@ -1369,23 +1373,17 @@ def Arith_ScalingTruncFOp
     broadcasted to `<dim1 x dim2 x dim3 ... (dimN/blockSize) x blockSize>`. Note
     that there could be multiple quantization axes. Internally,
     `arith.scaling_truncf` would perform the following:
-
+
     ```
-    scaleETy = get_type(scale)
-    inputETy = get_type(input)
-    resultETy = get_type(result)
-    // prepare Scale values with normalization and clamping
-    scale.exponent = arith.truncf(scale) : scaleETy to f8E8M0
-    scale.extf = arith.extf(scale.exponent) : f8E8M0 to inputETy
-    // emax is calculated as exponent of the largest normal value in quantized type.
-    scale.normalize = arith.divf(scale.extf, emax)
-    scale.clamped = clamp(scale.normalize) // clamp underflows
-    input.flused = flush_denorms(input)
-    result = arith.divf(input.flushed, scale.clamped)
-    result.cast = arith.truncf(result, resultETy)
+    scaleTy = get_type(scale)
+    inputTy = get_type(input)
+    resultTy = get_type(result)
+    assert(scaleTy.shape() == inputTy.shape() == resultTy.shape())
+    scale.exponent = arith.truncf(scale) : scaleTy to f8E8M0
+    scale.extf = arith.extf(scale.exponent) : f8E8M0 to inputTy
+    result = arith.divf(input, scale.extf)
+    result.cast = arith.truncf(result, resultTy)
     ```
-    Flushing of denorms in input and scale normalization with emax is added as per
-    the OCP MXFP spec.
   }];
   let hasVerifier = 1;
   let assemblyFormat =
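
For illustration, a minimal usage sketch of the op under the simplified semantics. The assembly syntax is assumed to match the existing expand-ops.mlir tests (the function and value names here are hypothetical); the scale is expected to be already normalized and clamped by the caller.

```mlir
// Hypothetical example: truncate an f32 value to f4E2M1FN using a
// pre-normalized, pre-clamped f8E8M0FNU scale. With the simplified
// semantics this is just result = truncf(input / 2^scale); denorm
// flushing, if desired, is requested via the fastMath flag at lowering.
func.func @example_scaling_truncf(%input : f32, %scale : f8E8M0FNU) -> f4E2M1FN {
  %0 = arith.scaling_truncf %input, %scale : f32, f8E8M0FNU to f4E2M1FN
  return %0 : f4E2M1FN
}
```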

mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp

Lines changed: 24 additions & 79 deletions
@@ -13,8 +13,6 @@
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "llvm/ADT/APFloat.h"
-#include <cstdint>
 
 namespace mlir {
 namespace arith {
@@ -25,16 +23,6 @@ namespace arith {
 
 using namespace mlir;
 
-static Value createFloatConst(Location loc, Type type, float value,
-                              PatternRewriter &rewriter) {
-  auto attr = rewriter.getFloatAttr(getElementTypeOrSelf(type), value);
-  if (auto shapedTy = dyn_cast<ShapedType>(type)) {
-    return rewriter.create<arith::ConstantOp>(
-        loc, DenseElementsAttr::get(shapedTy, attr));
-  }
-  return rewriter.create<arith::ConstantOp>(loc, attr);
-}
-
 /// Create an integer or index constant.
 static Value createConst(Location loc, Type type, int value,
                          PatternRewriter &rewriter) {
@@ -368,7 +356,8 @@ struct F8E8M0ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
     f32Bits = b.create<arith::SelectOp>(isNan, cF32NaN, f32Bits);
     Value result = b.create<arith::BitcastOp>(f32Ty, f32Bits);
     if (resultETy.getIntOrFloatBitWidth() < 32) {
-      result = b.create<arith::TruncFOp>(resultTy, result);
+      result = b.create<arith::TruncFOp>(resultTy, result, nullptr,
+                                         op.getFastmathAttr());
     } else if (resultETy.getIntOrFloatBitWidth() > 32) {
       result = b.create<arith::ExtFOp>(resultTy, result);
     }
@@ -406,9 +395,10 @@ struct F8E8M0TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
     Type f32Ty = cloneToShapedType(operandTy, b.getF32Type());
 
     if (operandETy.getIntOrFloatBitWidth() < 32) {
-      operand = b.create<arith::ExtFOp>(f32Ty, operand);
+      operand = b.create<arith::ExtFOp>(f32Ty, operand, op.getFastmathAttr());
     } else if (operandETy.getIntOrFloatBitWidth() > 32) {
-      operand = b.create<arith::TruncFOp>(f32Ty, operand);
+      operand = b.create<arith::TruncFOp>(
+          f32Ty, operand, op.getRoundingmodeAttr(), op.getFastmathAttr());
     }
     Value f32Bits = b.create<arith::BitcastOp>(i32Ty, operand);
     Value cF32MantissaWidth = createConst(op->getLoc(), i32Ty, 23, rewriter);
@@ -431,7 +421,8 @@ struct ScalingExtFOpConverter : public OpRewritePattern<arith::ScalingExtFOp> {
     // allow implicit exponent extraction from 16/32 bits floats
     if (scaleETy.getIntOrFloatBitWidth() >= 16) {
       scaleETy = b.getF8E8M0Type();
-      scaleOperand = b.create<arith::TruncFOp>(scaleETy, scaleOperand);
+      scaleOperand = b.create<arith::TruncFOp>(scaleETy, scaleOperand, nullptr,
+                                               op.getFastmathAttr());
     }
     if (!llvm::isa<Float8E8M0FNUType>(scaleETy)) {
       return rewriter.notifyMatchFailure(
@@ -441,14 +432,22 @@ struct ScalingExtFOpConverter : public OpRewritePattern<arith::ScalingExtFOp> {
     Type resultTy = op.getType();
     // extf on scale will essentially create floating point number
     // of type resulTy that is 2^scale and will also propagate NaNs
-    Value scaleExt = b.create<arith::ExtFOp>(resultTy, scaleOperand);
-    Value inputExt = b.create<arith::ExtFOp>(resultTy, inputOperand);
-    Value result = b.create<arith::MulFOp>(inputExt, scaleExt);
+    Value scaleExt =
+        b.create<arith::ExtFOp>(resultTy, scaleOperand, op.getFastmathAttr());
+    Value inputExt =
+        b.create<arith::ExtFOp>(resultTy, inputOperand, op.getFastmathAttr());
+    Value result =
+        b.create<arith::MulFOp>(inputExt, scaleExt, op.getFastmathAttr());
     rewriter.replaceOp(op, result);
     return success();
   }
 };
 
+/*
+Expands arith.ScalingTruncFOp(in, scale) into
+  scale = arith.truncf(scale) : scaleTy -> f8E8M0FNU
+  result = arith.truncf(in / (2^scale))
+*/
 struct ScalingTruncFOpConverter
     : public OpRewritePattern<arith::ScalingTruncFOp> {
   using OpRewritePattern::OpRewritePattern;
@@ -470,68 +469,14 @@ struct ScalingTruncFOpConverter
           op, "scaling_truncf is using scales type which can not be converted "
               "to f8E8M0FNU");
     }
-
     Type resultTy = op.getType();
-    Type resultETy = getElementTypeOrSelf(op.getOut());
-
     Type inputTy = inputOperand.getType();
-    Type inputETy = getElementTypeOrSelf(inputOperand);
-
-    Type i8Ty = cloneToShapedType(resultTy, b.getI8Type());
-    Type i32Ty = cloneToShapedType(resultTy, b.getI32Type());
-    Type f32Ty = cloneToShapedType(resultTy, b.getF32Type());
-
-    if (inputETy.getIntOrFloatBitWidth() < 32) {
-      inputOperand = b.create<arith::ExtFOp>(f32Ty, inputOperand);
-    } else if (inputETy.getIntOrFloatBitWidth() > 32) {
-      inputOperand = b.create<arith::TruncFOp>(f32Ty, inputOperand);
-    }
-    inputTy = inputOperand.getType();
-    inputETy = getElementTypeOrSelf(inputOperand);
-
-    // normalize scale by exponent of the max normal value (emax) in result type
-    // as per the OCP MXFP spec
-    // https://github.com/microsoft/microxcaling/blob/7bc41952de394f5cc5e782baf132e7c7542eb4e4/mx/mx_ops.py#L277
-    // here this normalization is carried in f32. Therefore instead of
-    // subtraction it does the DivFOp
-    const llvm::fltSemantics &resultFltSemantics =
-        llvm::cast<FloatType>(resultETy).getFloatSemantics();
-    int maxExponent = APFloat::semanticsMaxExponent(resultFltSemantics);
-    Value cEmax = createConst(op->getLoc(), i32Ty, maxExponent, rewriter);
-    Value c1 = createConst(op->getLoc(), i32Ty, 1, rewriter);
-    Value cPow2 = b.create<arith::ShLIOp>(c1, cEmax);
-    Value cPow2F32 = b.create<arith::SIToFPOp>(f32Ty, cPow2);
-    Value scaleF32 = b.create<arith::ExtFOp>(f32Ty, scaleOperand);
-    // note that spec also does the clamping but it should only be done for
-    // underflows because dividing by 2^emax will only make it smaller.
-    // https://github.com/microsoft/microxcaling/blob/7bc41952de394f5cc5e782baf132e7c7542eb4e4/mx/mx_ops.py#L282
-    Value scaleNormalizedF32 = b.create<arith::DivFOp>(scaleF32, cPow2F32);
-    // If it has underflown then scale will be a denorm FP32 number after
-    // division. Clamp underflows to 2^-127 as per the spec implementation
-    Value scaleNormalizedExponentF8 =
-        b.create<arith::TruncFOp>(scaleTy, scaleNormalizedF32);
-    Value scaleNormalizedExponentU8 =
-        b.create<arith::BitcastOp>(i8Ty, scaleNormalizedExponentF8);
-    Value cI8Zero = createConst(op.getLoc(), i8Ty, 0x00, rewriter);
-    Value scaleClampCond = b.create<arith::CmpIOp>(
-        arith::CmpIPredicate::eq, cI8Zero, scaleNormalizedExponentU8);
-    // 5.8e-39 is 2^-127, it is a denorm value in f32
-    float clampValue = 5.87747e-39;
-    Value scaleClampValue =
-        createFloatConst(op.getLoc(), f32Ty, clampValue, rewriter);
-    Value clampedScale = b.create<arith::SelectOp>(
-        scaleClampCond, scaleClampValue, scaleNormalizedF32);
-    // flush denorms by checking if exponent part of input operand is zero
-    // or not.
-    Value inputExponent = b.create<arith::TruncFOp>(scaleTy, inputOperand);
-    Value inputExponentU8 = b.create<arith::BitcastOp>(i8Ty, inputExponent);
-    Value inputFlushCond = b.create<arith::CmpIOp>(arith::CmpIPredicate::eq,
-                                                   cI8Zero, inputExponentU8);
-    Value inputTyZero = createFloatConst(op.getLoc(), inputTy, 0, rewriter);
-    Value flushedInput =
-        b.create<arith::SelectOp>(inputFlushCond, inputTyZero, inputOperand);
-    Value result = b.create<arith::DivFOp>(flushedInput, clampedScale);
-    // propagate rounding mode and fast math attributes
+    // this will create a floating point number of type
+    // inputTy that is 2^scale and will also propagate NaNs
+    scaleOperand =
+        b.create<arith::ExtFOp>(inputTy, scaleOperand, op.getFastmathAttr());
+    Value result = b.create<arith::DivFOp>(inputOperand, scaleOperand,
+                                           op.getFastmathAttr());
     Value resultCast = b.create<arith::TruncFOp>(
         resultTy, result, op.getRoundingmodeAttr(), op.getFastmathAttr());
     rewriter.replaceOp(op, resultCast);
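
With the simplified converter, the expansion reduces to an extf of the scale to the input type, a divf, and a final truncf, all carrying the op's fastmath (and, for the truncf, rounding mode) attributes. A sketch of the lowered IR for the scalar f32 to f4E2M1FN case, consistent with the updated SCHECK expectations below (SSA names are illustrative):

```mlir
// Expansion of: %r = arith.scaling_truncf %arg0, %arg1 : f32, f8E8M0FNU to f4E2M1FN
%scale_f32 = arith.extf %arg1 : f8E8M0FNU to f32        // reinterpret the E8M0 scale as 2^scale in f32
%quotient  = arith.divf %arg0, %scale_f32 : f32         // divide the input by the scale
%result    = arith.truncf %quotient : f32 to f4E2M1FN   // truncate to the target type
```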

mlir/test/Dialect/Arith/expand-ops.mlir

Lines changed: 5 additions & 38 deletions
@@ -316,24 +316,8 @@ func.func @scaling_truncf_f32_to_f4E2M1FN(%arg0 : f32, %arg1: f8E8M0FNU) -> f4E2
 }
 
 // SCHECK-LABEL: @scaling_truncf_f32_to_f4E2M1FN
-// SCHECK: %[[C2:.+]] = arith.constant 2 : i32
-// SCHECK: %[[C1:.+]] = arith.constant 1 : i32
-// SCHECK: %[[EMAX:.+]] = arith.shli %[[C1]], %[[C2]] : i32
-// SCHECK: %[[EMAXF32:.+]] = arith.sitofp %[[EMAX]] : i32 to f32
 // SCHECK: %[[SCALEF32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
-// SCHECK: %[[SCALEDIV:.+]] = arith.divf %[[SCALEF32]], %[[EMAXF32]] : f32
-// SCHECK: %[[SCALEDIVF8:.+]] = arith.truncf %[[SCALEDIV]] : f32 to f8E8M0FNU
-// SCHECK: %[[SCALEDIVI8:.+]] = arith.bitcast %[[SCALEDIVF8]] : f8E8M0FNU to i8
-// SCHECK: %[[C0:.+]] = arith.constant 0 : i8
-// SCHECK: %[[UFLOWCOND:.+]] = arith.cmpi eq, %[[C0]], %[[SCALEDIVI8]] : i8
-// SCHECK: %[[CLAMPVAL:.+]] = arith.constant 5.877470e-39 : f32
-// SCHECK: %[[CLAMP:.+]] = arith.select %[[UFLOWCOND]], %[[CLAMPVAL]], %[[SCALEDIV]] : f32
-// SCHECK: %[[INPUTEXP:.+]] = arith.truncf %arg0 : f32 to f8E8M0FNU
-// SCHECK: %[[INPUTEXPI8:.+]] = arith.bitcast %[[INPUTEXP]] : f8E8M0FNU to i8
-// SCHECK: %[[FLUSHCOND:.+]] = arith.cmpi eq, %[[C0]], %[[INPUTEXPI8]] : i8
-// SCHECK: %[[CF0:.+]] = arith.constant 0.000000e+00 : f32
-// SCHECK: %[[FLUSHINPUT:.+]] = arith.select %[[FLUSHCOND]], %[[CF0]], %arg0 : f32
-// SCHECK: %[[DIVF:.+]] = arith.divf %[[FLUSHINPUT]], %[[CLAMP]] : f32
+// SCHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEF32]] : f32
 // SCHECK: %[[RESULT:.+]] = arith.truncf %[[DIVF]] : f32 to f4E2M1FN
 // SCHECK: return %[[RESULT]]
 
@@ -345,26 +329,9 @@ func.func @scaling_truncf_vector_f16_to_f6E3M2FN(%arg0 : vector<4xf16>, %arg1: v
 }
 
 // SCHECK-LABEL: @scaling_truncf_vector_f16_to_f6E3M2FN
-// SCHECK: %[[INPUTF32:.+]] = arith.extf %arg0 : vector<4xf16> to vector<4xf32>
-// SCHECK: %[[C2:.+]] = arith.constant dense<4> : vector<4xi32>
-// SCHECK: %[[C1:.+]] = arith.constant dense<1> : vector<4xi32>
-// SCHECK: %[[EMAX:.+]] = arith.shli %[[C1]], %[[C2]] : vector<4xi32>
-// SCHECK: %[[EMAXF32:.+]] = arith.sitofp %[[EMAX]] : vector<4xi32> to vector<4xf32>
-// SCHECK: %[[SCALEF32:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf32>
-// SCHECK: %[[SCALEDIV:.+]] = arith.divf %[[SCALEF32]], %[[EMAXF32]] : vector<4xf32>
-// SCHECK: %[[SCALEDIVF8:.+]] = arith.truncf %[[SCALEDIV]] : vector<4xf32> to vector<4xf8E8M0FNU>
-// SCHECK: %[[SCALEDIVI8:.+]] = arith.bitcast %[[SCALEDIVF8]] : vector<4xf8E8M0FNU> to vector<4xi8>
-// SCHECK: %[[C0:.+]] = arith.constant dense<0> : vector<4xi8>
-// SCHECK: %[[UFLOWCOND:.+]] = arith.cmpi eq, %[[C0]], %[[SCALEDIVI8]] : vector<4xi8>
-// SCHECK: %[[CLAMPVAL:.+]] = arith.constant dense<5.877470e-39> : vector<4xf32>
-// SCHECK: %[[CLAMP:.+]] = arith.select %[[UFLOWCOND]], %[[CLAMPVAL]], %[[SCALEDIV]] : vector<4xi1>, vector<4xf32>
-// SCHECK: %[[INPUTEXP:.+]] = arith.truncf %[[INPUTF32]] : vector<4xf32> to vector<4xf8E8M0FNU>
-// SCHECK: %[[INPUTEXPI8:.+]] = arith.bitcast %[[INPUTEXP]] : vector<4xf8E8M0FNU> to vector<4xi8>
-// SCHECK: %[[FLUSHCOND:.+]] = arith.cmpi eq, %[[C0]], %[[INPUTEXPI8]] : vector<4xi8>
-// SCHECK: %[[CF0:.+]] = arith.constant dense<0.000000e+00> : vector<4xf32>
-// SCHECK: %[[FLUSHINPUT:.+]] = arith.select %[[FLUSHCOND]], %[[CF0]], %[[INPUTF32]] : vector<4xi1>, vector<4xf32>
-// SCHECK: %[[DIVF:.+]] = arith.divf %[[FLUSHINPUT]], %[[CLAMP]] : vector<4xf32>
-// SCHECK: %[[RESULT:.+]] = arith.truncf %[[DIVF]] : vector<4xf32> to vector<4xf6E3M2FN>
+// SCHECK: %[[SCALEF16:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf16>
+// SCHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEF16]] : vector<4xf16>
+// SCHECK: %[[RESULT:.+]] = arith.truncf %[[DIVF]] : vector<4xf16> to vector<4xf6E3M2FN>
 // SCHECK: return %[[RESULT]] : vector<4xf6E3M2FN>
 
 // -----
@@ -374,7 +341,7 @@ func.func @scaling_truncf_propagate_rounding_mode(%arg0 : vector<4xf16>, %arg1:
   return %0 : vector<4xf6E3M2FN>
 }
 // SCHECK-LABEL: @scaling_truncf_propagate_rounding_mode
-// SCHECK: %[[TRUNCF:.+]] = arith.truncf [[_:%[a-zA-Z0-9_]+]] to_nearest_even : vector<4xf32> to vector<4xf6E3M2FN>
+// SCHECK: %[[TRUNCF:.+]] = arith.truncf [[_:%[a-zA-Z0-9_]+]] to_nearest_even : vector<4xf16> to vector<4xf6E3M2FN>
 // SCHECK: return %[[TRUNCF]] : vector<4xf6E3M2FN>
 
 // -----
