Commit ca477af

XeGPUToVC: improvement to exp lowering. (#831)
The previous lowering was failing for f16 when creating the log2e vector constant:

error: 'arith.constant' op failed to verify that all of {value, result} have same type
    %2 = math.exp %v1 : vector<16xf16>
         ^
note: see current operation: %2 = "arith.constant"() <{value = dense<1.44269502> : vector<16xf32>}> : () -> vector<16xf16>
1 parent 14bfdba commit ca477af
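
For orientation, this is roughly the IR shape the updated lowering should produce for the f16 case. It is a sketch inferred from the new FileCheck expectations below, with made-up SSA names (%log2e, %log2e_vec, %scaled), not output copied from the compiler:

    %log2e = arith.constant 1.442695e+00 : f16
    %log2e_vec = vector.broadcast %log2e : f16 to vector<16xf16>
    %scaled = arith.mulf %v1, %log2e_vec : vector<16xf16>
    // ... %scaled then feeds the func.call to the llvm.genx.exp intrinsic

The scalar constant is created with the vector's element type and then broadcast, so the same code path works for both f32 and f16 element types.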

2 files changed: +29 -8 lines

lib/Conversion/XeGPUToVC/XeGPUToVC.cpp

Lines changed: 5 additions & 4 deletions
@@ -1378,16 +1378,17 @@ struct ElementwiseToVCPattern : public OpConversionPattern<MOp> {
     auto loc = op.getLoc();
     // This lowering pattern is needed only for spirv ops with large vector
     // lengths.
-    auto vecSize = vecTy.getNumElements();
     // for larger vector lengths, "llvm.genx.exp" returns the base 2
     // exponentiation of the input. To get the base e exponentiation, we need to
     // scale the input by log2(e)
     auto operands = adaptor.getOperands();
     SmallVector<Value> args{operands};
     if (isExpOp) {
-      SmallVector<float> log2e(vecSize, 1.442695040888963);
-      auto log2eConstVec = rewriter.create<arith::ConstantOp>(
-          op.getLoc(), vecTy, rewriter.getF32VectorAttr(log2e));
+      auto log2e = rewriter.create<arith::ConstantOp>(
+          loc,
+          rewriter.getFloatAttr(vecTy.getElementType(), 1.442695040888963));
+      auto log2eConstVec =
+          rewriter.create<vector::BroadcastOp>(loc, vecTy, log2e);
       auto input = operands[0];
       auto scaledInput =
           rewriter.create<arith::MulFOp>(op.getLoc(), input, log2eConstVec);
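
For context on the scaling mentioned in the comment above: e^x = 2^(x * log2(e)), and log2(e) ≈ 1.442695040888963, so multiplying the input by log2(e) before the base-2 exponentiation performed by llvm.genx.exp yields the base-e result.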
Lines changed: 24 additions & 4 deletions
@@ -1,4 +1,4 @@
-// RUN: imex-opt -convert-xegpu-to-vc='enable-vc-intrinsic=true useRawSend=true' -cse %s | FileCheck %s --check-prefixes=CHECK
+// RUN: imex-opt -convert-xegpu-to-vc='enable-vc-intrinsic=true useRawSend=true' -cse --split-input-file %s | FileCheck %s --check-prefixes=CHECK
 module @gemm attributes {gpu.container_module} {
   gpu.module @test_kernel {

@@ -7,10 +7,12 @@ module @gemm attributes {gpu.container_module} {
       %c0 = arith.constant 0 : index
       %cv1 = arith.constant dense<1.0> : vector<16xf32>
       %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf32>, vector<16xf32>
-      // CHECK: arith.mulf
-      // CHECK-NEXT: func.call @llvm.genx.exp.v16f32
+      // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f32
+      // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f32 to vector<16xf32>
+      // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
+      // CHECK-NEXT: func.call @llvm.genx.exp.v16f32(%[[MULF]])
       %1 = math.exp %v1 fastmath<nnan> : vector<16xf32>
-      // CHECK-NEXT: func.call @llvm.genx.exp.v16f32
+      // CHECK-NEXT: func.call @llvm.genx.exp.v16f32(%[[MULF]])
       %2 = math.exp %v1 : vector<16xf32>
       // CHECK-NEXT: func.call @llvm.genx.fmax.v16f32
       %4 = arith.maximumf %v1, %cv1 fastmath<nnan> : vector<16xf32>

@@ -20,3 +22,21 @@ module @gemm attributes {gpu.container_module} {
     }
   }
 }
+
+// -----
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @exp_f16 {
+    // CHECK-LABEL: gpu.func @exp_f16
+    gpu.func @exp_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %c0 = arith.constant 0 : index
+      %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f16
+      // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f16 to vector<16xf16>
+      // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
+      // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[MULF]])
+      %2 = math.exp %v1 : vector<16xf16>
+      gpu.return
+    }
+  }
+}
