[mlir][arith] Add arith-expand tests and fix F4E2M1 extf/truncf expansion issues revealed by them

Muzammiluddin-Syed-ECE · Muzammiluddin-Syed-ECE · commit b3c497753107 · 2025-06-19T20:48:21.000Z
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -59,6 +59,9 @@ void populateCeilFloorDivExpandOpsPatterns(RewritePatternSet &patterns);
 /// Add patterns to expand Arith bf16 patterns to lower level bitcasts/shifts.
 void populateExpandBFloat16Patterns(RewritePatternSet &patterns);
 
+/// Add patterns to expand Arith f4e2m1 patterns to lower level bitcasts/shifts.
+void populateExpandF4E2M1Patterns(RewritePatternSet &patterns);
+
 /// Add patterns to expand Arith f8e8m0 patterns to lower level bitcasts/shifts.
 void populateExpandF8E8M0Patterns(RewritePatternSet &patterns);
 
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
@@ -19,6 +19,8 @@ def ArithExpandOpsPass : Pass<"arith-expand"> {
               "Enable the BF16 expansion patterns">,
        Option<"includeF8E8M0", "include-f8e8m0", "bool", /*default=*/"false",
               "Enable the F8E8M0 expansion patterns">,
+       Option<"includeF4E2M1", "include-f4e2m1", "bool", /*default=*/"false",
+              "Enable the F4E2M1 expansion patterns">,
   ];
 }
 
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -345,9 +345,8 @@ struct F4E2M1ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
     Type operandETy = getElementTypeOrSelf(operandTy);
     Type resultETy = getElementTypeOrSelf(resultTy);
 
-    if (!llvm::isa<Float4E2M1FNType>(operandETy) ||
-        !llvm::isa<Float32Type>(resultETy)) {
-      return rewriter.notifyMatchFailure(op, "not a ext of F4E2M1FN to F32");
+    if (!isa<Float4E2M1FNType>(operandETy)) {
+      return rewriter.notifyMatchFailure(op, "not a ext of F4E2M1FN");
     }
 
     Type i4Ty = cloneToShapedType(operandTy, b.getI4Type());
@@ -357,8 +356,9 @@ struct F4E2M1ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
     Value bitcast = b.create<arith::BitcastOp>(i4Ty, operand);
 
     Value c0x1 = createConst(op->getLoc(), i4Ty, 1, rewriter);
-    Value c0x0000001c = createConst(op->getLoc(), i32Ty, 28, rewriter);
     Value c0x00000014 = createConst(op->getLoc(), i32Ty, 22, rewriter);
+    Value c0x00000015 = createConst(op->getLoc(), i32Ty, 23, rewriter);
+    Value c0x0000001c = createConst(op->getLoc(), i32Ty, 28, rewriter);
     Value cZero =
         createFloatConst(op->getLoc(), f32Ty, APFloat(0.0f), rewriter);
     Value cHalf =
@@ -370,29 +370,33 @@ struct F4E2M1ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
 
     Value f4SignBit = b.create<arith::AndIOp>(bitcast, signBitmask);
     Value f32Bits = b.create<arith::ExtUIOp>(i32Ty, f4SignBit);
-    f32Bits = b.create<arith::ShRUIOp>(f32Bits, c0x0000001c);
+    f32Bits = b.create<arith::ShLIOp>(f32Bits, c0x0000001c);
 
     Value biasAdjustment = createConst(op.getLoc(), i32Ty, 126, rewriter);
     Value f4ExpBits = b.create<arith::AndIOp>(bitcast, exponentBitmask);
     f4ExpBits = b.create<arith::ShRUIOp>(f4ExpBits, c0x1);
     Value f32ExpBits = b.create<arith::ExtUIOp>(i32Ty, f4ExpBits);
     f32ExpBits = b.create<arith::AddIOp>(f32ExpBits, biasAdjustment);
-    f32ExpBits = b.create<arith::ShLIOp>(f32ExpBits, c0x00000014);
-    f32Bits = b.create<arith::AddIOp>(f32Bits, f32ExpBits);
+    Value f32Exp = b.create<arith::ShLIOp>(f32ExpBits, c0x00000015);
+    f32Bits = b.create<arith::AddIOp>(f32Bits, f32Exp);
 
     Value f4ManBit = b.create<arith::AndIOp>(bitcast, mantissaBitmask);
     Value f32ManBit = b.create<arith::ExtUIOp>(i32Ty, f4ManBit);
+    f32ManBit = b.create<arith::ShLIOp>(f32ManBit, c0x00000014);
     f32Bits = b.create<arith::AddIOp>(f32Bits, f32ManBit);
 
-    // Special consideration for subnormal exp (exp == 0).
+    // Special consideration for subnormal exponent (exp == 00).
     Value isSubnormal = b.create<arith::CmpIOp>(arith::CmpIPredicate::eq,
                                                 f32ExpBits, biasAdjustment);
     Value isManSet =
         b.create<arith::CmpIOp>(arith::CmpIPredicate::eq, f4ManBit, c0x1);
     Value subnormalVal = b.create<arith::SelectOp>(isManSet, cHalf, cZero);
-    f32Bits = b.create<arith::SelectOp>(isSubnormal, subnormalVal, f32Bits);
 
     Value result = b.create<arith::BitcastOp>(f32Ty, f32Bits);
+    result = b.create<arith::SelectOp>(isSubnormal, subnormalVal, result);
+    // The expansion computes an f32; narrow it for any other result type.
+    if (!isa<Float32Type>(resultETy))
+      result = b.create<arith::TruncFOp>(resultTy, result);
     rewriter.replaceOp(op, result);
     return success();
   }
@@ -481,8 +485,11 @@ struct F4E2M1TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
     Type operandETy = getElementTypeOrSelf(operandTy);
     Type resultETy = getElementTypeOrSelf(resultTy);
 
-    if (!isa<Float32Type>(operandETy) || !isa<Float4E2M1FNType>(resultETy)) {
-      return rewriter.notifyMatchFailure(op, "not a trunc of F32 to F4E2M1FN");
+    if (!isa<Float4E2M1FNType>(resultETy)) // Match before mutating the IR.
+      return rewriter.notifyMatchFailure(op, "not a trunc of F4E2M1FN");
+    if (!isa<Float32Type>(operandETy)) {
+      Type extTy = cloneToShapedType(operandTy, b.getF32Type()); // keep shape
+      operand = b.create<arith::ExtFOp>(extTy, operand);
     }
 
     Type i4Ty = cloneToShapedType(operandTy, b.getI4Type());
@@ -491,20 +498,28 @@ struct F4E2M1TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
     Type f32Ty = cloneToShapedType(operandTy, b.getF32Type());
 
     Value c0x1 = createConst(op->getLoc(), i4Ty, 1, rewriter);
+    Value c0x3 = createConst(op->getLoc(), i4Ty, 3, rewriter);
     Value c0x00000016 = createConst(op->getLoc(), i32Ty, 22, rewriter);
     Value c0x00 = createConst(op.getLoc(), i8Ty, 0x00, rewriter);
     Value c0xff = createConst(op.getLoc(), i8Ty, 0xff, rewriter);
     Value c0x00000000 = createConst(op.getLoc(), i32Ty, 0, rewriter);
 
-    // Step 1: Clamp to bounds.
+    // Step 0: Clamp to bounds.
     Value cHigherBound =
         createFloatConst(op->getLoc(), f32Ty, APFloat(6.0f), rewriter);
     Value cLowerBound =
         createFloatConst(op->getLoc(), f32Ty, APFloat(-6.0f), rewriter);
-    Value operandClamped = b.create<arith::MinimumFOp>(cLowerBound, operand);
-    operandClamped = b.create<arith::MaximumFOp>(cHigherBound, operandClamped);
+    Value operandClamped = b.create<arith::MinimumFOp>(cHigherBound, operand);
+    operandClamped = b.create<arith::MaximumFOp>(cLowerBound, operandClamped);
     Value f32Bits = b.create<arith::BitcastOp>(i32Ty, operandClamped);
 
+    // Step 1: Move the f32 sign bit (bit 31) into the f4 sign bit (bit 3).
+    Value cF32ExpManWidth =
+        createConst(op->getLoc(), i32Ty, 31, rewriter); // 31
+    Value f32Sign = b.create<arith::ShRUIOp>(f32Bits, cF32ExpManWidth);
+    Value f4Sign = b.create<arith::TruncIOp>(i4Ty, f32Sign);
+    Value f4Bits = b.create<arith::ShLIOp>(f4Sign, c0x3);
+
     // Step 2: Convert exponent by adjusting bias.
     Value biasAdjustment = createConst(op.getLoc(), i32Ty, 0x7e, rewriter);
     Value cF4MantissaWidth = c0x1; // 1
@@ -513,16 +528,17 @@ struct F4E2M1TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
     Value f32SignExp = b.create<arith::ShRUIOp>(f32Bits, cF32MantissaWidth);
     Value biasAdjustedSignExp =
         b.create<arith::SubIOp>(f32SignExp, biasAdjustment);
-    Value f4SignExp = b.create<arith::TruncIOp>(i4Ty, biasAdjustedSignExp);
-    f4SignExp = b.create<arith::ShLIOp>(f4SignExp, cF4MantissaWidth);
+    Value f4Exp = b.create<arith::TruncIOp>(i4Ty, biasAdjustedSignExp);
+    f4Exp = b.create<arith::ShLIOp>(f4Exp, cF4MantissaWidth);
+    f4Bits = b.create<arith::AddIOp>(f4Bits, f4Exp);
 
     // Step 3: Set mantissa to first bit.
     Value cF32FirstBitMask =
         createConst(op.getLoc(), i32Ty, 0x400000, rewriter);
     Value man1Bit = b.create<arith::AndIOp>(f32Bits, cF32FirstBitMask);
     man1Bit = b.create<arith::ShRUIOp>(man1Bit, c0x00000016);
     Value f4Man = b.create<arith::TruncIOp>(i4Ty, man1Bit);
-    Value f4Bits = b.create<arith::AddIOp>(f4SignExp, f4Man);
+    f4Bits = b.create<arith::AddIOp>(f4Bits, f4Man);
 
     // Step 4: Special consideration for conversion to 0.5.
     Value cF32MantissaMask =
@@ -538,7 +554,6 @@ struct F4E2M1TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
     Value roundToHalf = b.create<arith::AndIOp>(isNegOneExp, isNonZeroMan);
     Value isZeroExp =
         b.create<arith::CmpIOp>(arith::CmpIPredicate::eq, f8Exp, c0x00);
-
     Value subnormalF4Bits = createConst(op->getLoc(), i4Ty, 0xf, rewriter);
     Value halfF4Bits = createConst(op->getLoc(), i4Ty, 0x0, rewriter);
     Value subResult =
@@ -719,16 +734,24 @@ struct ArithExpandOpsPass
     if (includeF8E8M0) {
       arith::populateExpandF8E8M0Patterns(patterns);
     }
+    if (includeF4E2M1) {
+      arith::populateExpandF4E2M1Patterns(patterns);
+    }
 
     target.addDynamicallyLegalOp<arith::ExtFOp>(
       [=](arith::ExtFOp op) {
         Type inETy = getElementTypeOrSelf(op.getOperand().getType());
         Type outETy = getElementTypeOrSelf(op.getType());
         bool legalTypes = true;
-        if (includeBf16) 
+        if (includeBf16) {
           legalTypes &= !(inETy.isBF16() && outETy.isF32());
-        if (includeF8E8M0)
+        } 
+        if (includeF8E8M0) {
           legalTypes &= !llvm::isa<Float8E8M0FNUType>(inETy);
+        } 
+        if (includeF4E2M1) {
+          legalTypes &= !llvm::isa<Float4E2M1FNType>(inETy);
+        }
         return legalTypes;
       });
 
@@ -737,10 +760,15 @@ struct ArithExpandOpsPass
         Type inETy = getElementTypeOrSelf(op.getOperand().getType());
         Type outETy = getElementTypeOrSelf(op.getType());
         bool legalTypes = true;
-        if (includeBf16) 
+        if (includeBf16) {
           legalTypes &= !(inETy.isF32() && outETy.isBF16());
-        if (includeF8E8M0) 
+        }
+        if (includeF8E8M0) {
           legalTypes &= !(llvm::isa<Float8E8M0FNUType>(outETy)); 
+        }
+        if (includeF4E2M1) {
+          legalTypes &= !llvm::isa<Float4E2M1FNType>(outETy);
+        }
         return legalTypes;
       });
 
@@ -765,6 +793,11 @@ void mlir::arith::populateExpandBFloat16Patterns(RewritePatternSet &patterns) {
       patterns.getContext());
 }
 
+void mlir::arith::populateExpandF4E2M1Patterns(RewritePatternSet &patterns) {
+  MLIRContext *ctx = patterns.getContext();
+  patterns.add<F4E2M1ExtFOpConverter, F4E2M1TruncFOpConverter>(ctx);
+}
+
 void mlir::arith::populateExpandF8E8M0Patterns(RewritePatternSet &patterns) {
   patterns.add<F8E8M0ExtFOpConverter, F8E8M0TruncFOpConverter>(
       patterns.getContext());
diff --git a/mlir/test/Dialect/Arith/expand-ops-scale.mlir b/mlir/test/Dialect/Arith/expand-ops-scale.mlir
@@ -0,0 +1,159 @@
+// RUN: mlir-opt %s -arith-expand -split-input-file -verify-diagnostics | FileCheck %s
+
+func.func @scaling_truncf_f32_to_f4E2M1FN(%arg0 : f32, %arg1: f8E8M0FNU) -> f4E2M1FN {
+    %0 = arith.scaling_truncf %arg0, %arg1 : f32, f8E8M0FNU to f4E2M1FN
+    return %0 : f4E2M1FN
+}
+
+// CHECK-LABEL: @scaling_truncf_f32_to_f4E2M1FN
+// CHECK: %[[SCALEF32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// CHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEF32]] : f32
+// CHECK: %[[RESULT:.+]] = arith.truncf %[[DIVF]] : f32 to f4E2M1FN
+// CHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_truncf_vector_f16_to_f6E3M2FN(%arg0 : vector<4xf16>, %arg1: vector<4xf8E8M0FNU>) -> vector<4xf6E3M2FN> {
+    %0 = arith.scaling_truncf %arg0, %arg1 : vector<4xf16>, vector<4xf8E8M0FNU> to vector<4xf6E3M2FN>
+    return %0 : vector<4xf6E3M2FN>
+}
+
+// CHECK-LABEL: @scaling_truncf_vector_f16_to_f6E3M2FN
+// CHECK: %[[SCALEF16:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf16>
+// CHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEF16]] : vector<4xf16>
+// CHECK: %[[RESULT:.+]] = arith.truncf %[[DIVF]] : vector<4xf16> to vector<4xf6E3M2FN>
+// CHECK: return %[[RESULT]] : vector<4xf6E3M2FN>
+
+// -----
+
+func.func @scaling_truncf_propagate_rounding_mode_fast_math(%arg0 : vector<4xf16>, %arg1: vector<4xf16>) -> vector<4xf6E3M2FN> {
+    %0 = arith.scaling_truncf %arg0, %arg1 to_nearest_even fastmath<fast> : vector<4xf16>, vector<4xf16> to vector<4xf6E3M2FN>
+    return %0 : vector<4xf6E3M2FN>
+}
+// CHECK-LABEL: @scaling_truncf_propagate_rounding_mode_fast_math
+// CHECK: %[[SCALEF8:.+]] = arith.truncf %arg1 fastmath<fast> : vector<4xf16> to vector<4xf8E8M0FNU>
+// CHECK: %[[SCALEINTY:.+]] = arith.extf %[[SCALEF8]] fastmath<fast> : vector<4xf8E8M0FNU> to vector<4xf16>
+// CHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEINTY]] fastmath<fast> : vector<4xf16>
+// CHECK: %[[TRUNCF:.+]] = arith.truncf [[_:%[a-zA-Z0-9_]+]] to_nearest_even fastmath<fast> : vector<4xf16> to vector<4xf6E3M2FN>
+// CHECK: return %[[TRUNCF]] : vector<4xf6E3M2FN>
+
+// -----
+
+func.func @scaling_truncf_f16_to_f4E2M1FN_using_f16_scales(%arg0: f16, %arg1 : f16) -> f4E2M1FN {
+    %0 = arith.scaling_truncf %arg0, %arg1 : f16, f16 to f4E2M1FN
+    return %0 : f4E2M1FN
+}
+// CHECK-LABEL: @scaling_truncf_f16_to_f4E2M1FN_using_f16_scales
+// CHECK: %[[SCALETRUNCF:.+]] = arith.truncf %arg1 : f16 to f8E8M0FNU
+// CHECK: return
+
+// -----
+func.func @scaling_truncf_vector_f16_to_f4E2M1FN_using_f16_scales(%arg0: vector<4xf16>, %arg1 : vector<4xf16>) -> vector<4xf4E2M1FN> {
+    %0 = arith.scaling_truncf %arg0, %arg1 : vector<4xf16>, vector<4xf16> to vector<4xf4E2M1FN>
+    return %0 : vector<4xf4E2M1FN>
+}
+// CHECK-LABEL: @scaling_truncf_vector_f16_to_f4E2M1FN_using_f16_scales
+// CHECK: %[[SCALETRUNCF:.+]] = arith.truncf %arg1 : vector<4xf16> to vector<4xf8E8M0FNU>
+// CHECK: return
+
+// -----
+
+func.func @scaling_extf_to_f32(%arg0: f4E2M1FN, %arg1 : f8E8M0FNU) -> f32 {
+    %0 = arith.scaling_extf %arg0, %arg1 : f4E2M1FN, f8E8M0FNU to f32
+    return %0 : f32 
+}
+
+// CHECK-LABEL: @scaling_extf_to_f32
+// CHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// CHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : f4E2M1FN to f32
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : f32
+// CHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_to_f32_using_f16_scales(%arg0: f4E2M1FN, %arg1 : f16) -> f32 {
+    %0 = arith.scaling_extf %arg0, %arg1 : f4E2M1FN, f16 to f32
+    return %0 : f32 
+}
+
+// CHECK-LABEL: @scaling_extf_to_f32_using_f16_scales
+// CHECK: %[[TRUNCF_SCALE:.+]] = arith.truncf %arg1 : f16 to f8E8M0FNU
+// CHECK: %[[EXT_SCALE:.+]] = arith.extf %[[TRUNCF_SCALE]] : f8E8M0FNU to f32
+// CHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : f4E2M1FN to f32
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : f32
+// CHECK: return %[[RESULT]]
+
+// -----
+
+func.func @invalid_scaling_extf_to_f32(%arg0: f4E2M1FN, %arg1 : f8E5M2FNUZ) -> f32 {
+    // expected-error@+1 {{failed to legalize operation 'arith.scaling_extf' that was explicitly marked illegal}}
+    %0 = arith.scaling_extf %arg0, %arg1 : f4E2M1FN, f8E5M2FNUZ to f32
+    return %0 : f32
+}
+
+// -----
+
+func.func @scaling_extf_vector_to_f32(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf8E8M0FNU>) -> vector<4xf32> {
+    %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf8E8M0FNU> to vector<4xf32>
+    return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: @scaling_extf_vector_to_f32
+// CHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf32>
+// CHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xf32>
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xf32> 
+// CHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_f16(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf8E8M0FNU>) -> vector<4xf16> {
+    %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf8E8M0FNU> to vector<4xf16>
+    return %0 : vector<4xf16>
+}
+
+// CHECK-LABEL: @scaling_extf_vector_to_f16
+// CHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf16>
+// CHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xf16>
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xf16> 
+// CHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_bf16(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf8E8M0FNU>) -> vector<4xbf16> {
+    %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf8E8M0FNU> to vector<4xbf16>
+    return %0 : vector<4xbf16>
+}
+
+// CHECK-LABEL: @scaling_extf_vector_to_bf16
+// CHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xbf16>
+// CHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xbf16>
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xbf16> 
+// CHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_f32_using_f16_scales(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf16>) -> vector<4xf32> {
+    %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf16> to vector<4xf32>
+    return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: @scaling_extf_vector_to_f32_using_f16_scales
+// CHECK: %[[TRUNCF_SCALE:.+]] = arith.truncf %arg1 : vector<4xf16> to vector<4xf8E8M0FNU>
+// CHECK: %[[EXT_SCALE:.+]] = arith.extf %[[TRUNCF_SCALE]] : vector<4xf8E8M0FNU> to vector<4xf32>
+// CHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xf32>
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xf32>
+// CHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_f32_using_f16_scales_fastmath(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf16>) -> vector<4xf32> {
+    %0 = arith.scaling_extf %arg0, %arg1 fastmath<fast> : vector<4xf4E2M1FN>, vector<4xf16> to vector<4xf32>
+    return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: @scaling_extf_vector_to_f32_using_f16_scales_fastmath
+// CHECK: %[[TRUNCF_SCALE:.+]] = arith.truncf %arg1 fastmath<fast> : vector<4xf16> to vector<4xf8E8M0FNU>
+// CHECK: %[[EXT_SCALE:.+]] = arith.extf %[[TRUNCF_SCALE]] fastmath<fast> : vector<4xf8E8M0FNU> to vector<4xf32>
+// CHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 fastmath<fast> : vector<4xf4E2M1FN> to vector<4xf32>
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] fastmath<fast> : vector<4xf32>
+// CHECK: return %[[RESULT]]
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,8 @@ def ArithExpandOpsPass : Pass<"arith-expand"> {`
`19`	`19`	`"Enable the BF16 expansion patterns">,`
`20`	`20`	`Option<"includeF8E8M0", "include-f8e8m0", "bool", /*default=*/"false",`
`21`	`21`	`"Enable the F8E8M0 expansion patterns">,`
	`22`	`+ Option<"includeF4E2M1", "include-f4e2m1", "bool", /*default=*/"false",`
	`23`	`+ "Enable the F4E2M1 expansion patterns">,`
`22`	`24`	`];`
`23`	`25`	`}`
`24`	`26`