Vectorize TritonGEN::FToTf32Op (#4811)

anmyachev · web-flow · commit 2c8f3e27daea · 2025-07-31T15:11:11.000Z
This change reduces the `.llir` file of one of the largest Flex Attn kernels
by ~16k lines of code: 79094 -> 63276. It seems vectorization on our side is
necessary if we want to reduce compilation time.

---------

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
diff --git a/test/Conversion/intel/tritongpu_to_gen_dot.mlir b/test/Conversion/intel/tritongpu_to_gen_dot.mlir
@@ -76,15 +76,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
   // CHECK-SAME:  %[[A:.*]]: !llvm.struct<(f32, f32, f32, f32)>, %[[B:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
   tt.func @dot_f32_tf32_tf32_f32_1(%a: tensor<8x8xf32, #dot_operand_a>, %b: tensor<8x16xf32, #dot_operand_b>, %c: tensor<8x16xf32, #dpas>) {
     // COM: To simplify, only check RTNE and its usage for the last element of A, B, C
-    // CHECK: %[[A_LAST_VAL:.*]] = llvm.extractvalue %[[A]][3]
-    // CHECK: %[[A_RTNE_VAL:.*]] = llvm.call spir_funccc @_Z25__spirv_RoundFToTF32INTELf(%[[A_LAST_VAL]])
-    // CHECK: %[[A_0:.*]] = llvm.insertelement %[[A_RTNE_VAL]], %{{.*}}{{\[}}%{{.*}} : i32] : vector<4xf32>
-    // CHECK: %[[B_LAST_VAL:.*]] = llvm.extractvalue %[[B]][7]
-    // CHECK: %[[B_RTNE_VAL:.*]] = llvm.call spir_funccc @_Z25__spirv_RoundFToTF32INTELf(%[[B_LAST_VAL]])
-    // CHECK: %[[B_0:.*]] = llvm.insertelement %[[B_RTNE_VAL]], %{{.*}}{{\[}}%{{.*}} : i32] : vector<8xf32>
+    // CHECK: %[[A_EXTR_LAST_VAL:.*]] = llvm.extractvalue %[[A]][3]
+    // CHECK: %[[A_LAST_VAL:.*]] = llvm.insertelement %[[A_EXTR_LAST_VAL]], %{{.*}} : vector<4xf32>
+    // CHECK: %[[A_RTNE_VAL:.*]] = llvm.call spir_funccc @_Z25__spirv_RoundFToTF32INTELDv4_f(%[[A_LAST_VAL]])
+    // CHECK: %[[B_EXTR_LAST_VAL:.*]] = llvm.extractvalue %[[B]][7]
+    // CHECK: %[[B_LAST_VAL:.*]] = llvm.insertelement %[[B_EXTR_LAST_VAL]], %{{.*}} : vector<8xf32>
+    // CHECK: %[[B_RTNE_VAL:.*]] = llvm.call spir_funccc @_Z25__spirv_RoundFToTF32INTELDv8_f(%[[B_LAST_VAL]])
     // CHECK: %[[C_LAST_VAL:.*]] = llvm.extractvalue %[[C]][7]
     // CHECK: %[[C_0:.*]] = llvm.insertelement %[[C_LAST_VAL]], %{{.*}}{{\[}}%{{.*}} : i32] : vector<8xf32>
-    // CHECK: llvm.call spir_funccc @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_fDv8_fS0_i(%{{.*}}, %[[A_0]], %[[B_0]], %[[C_0]], %{{.*}}} : (i32, vector<4xf32>, vector<8xf32>, vector<8xf32>, i32) -> vector<8xf32>
+    // CHECK: llvm.call spir_funccc @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_fDv8_fS0_i(%{{.*}}, %[[A_RTNE_VAL]], %[[B_RTNE_VAL]], %[[C_0]], %{{.*}}} : (i32, vector<4xf32>, vector<8xf32>, vector<8xf32>, i32) -> vector<8xf32>
     %0 = tt.dot %a, %b, %c, inputPrecision = tf32 : tensor<8x8xf32, #dot_operand_a> * tensor<8x16xf32, #dot_operand_b> -> tensor<8x16xf32, #dpas>
     tt.return
   }
diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td
@@ -394,8 +394,8 @@ def TritonGEN_FToTf32Op
     a 32-bit floating point type to TF32 with rounding to the nearest even.
   }];
 
-  let arguments = (ins F32:$val);
-  let results = (outs F32:$res);
+  let arguments = (ins LLVM_ScalarOrVectorOf<F32>:$val);
+  let results = (outs LLVM_ScalarOrVectorOf<F32>:$res);
   let assemblyFormat = [{
     $val attr-dict `:` type($val)
   }];
diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
@@ -923,13 +923,16 @@ struct TritonFToTf32OpLowering
     auto b = TritonLLVMOpBuilder(loc, rewriter);
 
     Value value = op->getOperand(0);
-    SmallVector<Type> argTypes{f32_ty};
+    Type valueType = value.getType();
+
+    SmallVector<Type> argTypes{valueType};
     SmallVector<Value> args{value};
 
-    const StringLiteral funcName = "_Z25__spirv_RoundFToTF32INTELf";
-    auto retType = f32_ty;
+    std::string fnName = "__spirv_RoundFToTF32INTEL";
+    fnName = intel::mangle(fnName, argTypes);
+    auto retType = valueType;
     auto callOp = intel::createDeviceFunctionCall(
-        rewriter, funcName, retType, {argTypes}, {args}, {},
+        rewriter, fnName, retType, {argTypes}, {args}, {},
         intel::noUnwindWillReturnAttrs);
     rewriter.replaceOp(op, callOp);
     return success();
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
@@ -352,21 +352,18 @@ class DotOpDPASConversionHelper {
             for (int repInner = 0; repInner < repClusterInner; ++repInner) {
               Value matVal = rewriter.create<LLVM::UndefOp>(loc, dotOpTy);
               for (int k = 0; k < numElemsPerOperand; ++k) {
-                if (isFToTF32Enabled) {
-                  Value f32Val = elems[offset++];
-                  auto t32Val =
-                      rewriter.create<TritonGEN::FToTf32Op>(loc, f32Val)
-                          .getResult();
-                  matVal =
-                      tb.insert_element(dotOpTy, matVal, t32Val, tb.i32_val(k));
-
-                } else {
-                  matVal = tb.insert_element(dotOpTy, matVal, elems[offset++],
-                                             tb.i32_val(k));
-                }
+                matVal = tb.insert_element(dotOpTy, matVal, elems[offset++],
+                                           tb.i32_val(k));
+              }
+              if (isFToTF32Enabled) {
+                auto t32Val = rewriter.create<TritonGEN::FToTf32Op>(loc, matVal)
+                                  .getResult();
+                vals[{b, i * repClusterOuter + repOuter,
+                      j * repClusterInner + repInner}] = t32Val;
+              } else {
+                vals[{b, i * repClusterOuter + repOuter,
+                      j * repClusterInner + repInner}] = matVal;
               }
-              vals[{b, i * repClusterOuter + repOuter,
-                    j * repClusterInner + repInner}] = matVal;
             }
           }
         }