new x86 avx instructions: vbcstnebf162ps, vcvtneebf162ps, vcvtneobf162ps

arun-thmn · arun-thmn · commit 860ccf78f149 · 2025-04-10T01:30:12.000-07:00
diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td
@@ -408,4 +408,110 @@ def DotOp : AVX_LowOp<"dot", [Pure,
   }];
 }
 
+
+//----------------------------------------------------------------------------//
+// AVX: Convert packed BF16 even-indexed/odd-indexed elements into packed F32
+//----------------------------------------------------------------------------//
+
+def CvtPackedEvenIndexedBF16ToF32Op : AVX_Op<"cvt.packed.even.indexed.bf16_to_f32", [Pure,
+  DeclareOpInterfaceMethods<OneToOneIntrinsicOpInterface>]> {
+  let summary = "AVX: Convert packed BF16 even-indexed elements into packed F32 Data.";
+  let description = [{
+    #### From the Intel Intrinsics Guide:
+
+    Convert packed BF16 (16-bit) floating-point even-indexed elements stored at
+    memory locations starting at location `__A` to packed single-precision
+    (32-bit) floating-point elements, and store the results in `dst`.
+
+    Example:
+    ```mlir
+    %dst = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xbf16>
+    ```
+  }];
+  let arguments = (ins LLVM_AnyPointer:$a);
+  let results = (outs VectorOfLengthAndType<[4, 8], [F32]>:$dst);
+  let assemblyFormat =
+    "$a  attr-dict`:` type($a)`->` type($dst)";
+
+  let extraClassDefinition = [{
+    std::string $cppClass::getIntrinsicName() {
+      std::string intr = "llvm.x86.vcvtneebf162ps";
+      VectorType vecType = getDst().getType();
+      unsigned elemBitWidth = vecType.getElementTypeBitWidth();
+      unsigned opBitWidth = vecType.getShape()[0] * elemBitWidth;
+      intr += std::to_string(opBitWidth);
+      return intr;
+    }
+  }];
+}
+
+def CvtPackedOddIndexedBF16ToF32Op : AVX_Op<"cvt.packed.odd.indexed.bf16_to_f32", [Pure,
+  DeclareOpInterfaceMethods<OneToOneIntrinsicOpInterface>]> {
+  let summary = "AVX: Convert packed BF16 odd-indexed elements into packed F32 Data.";
+  let description = [{
+    #### From the Intel Intrinsics Guide:
+
+    Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at
+    memory locations starting at location `__A` to packed single-precision
+    (32-bit) floating-point elements, and store the results in `dst`.
+
+    Example:
+    ```mlir
+    %dst = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xbf16>
+    ```
+  }];
+  let arguments = (ins LLVM_AnyPointer:$a);
+  let results = (outs VectorOfLengthAndType<[4, 8], [F32]>:$dst);
+  let assemblyFormat =
+    "$a  attr-dict`:` type($a)`->` type($dst)";
+
+  let extraClassDefinition = [{
+    std::string $cppClass::getIntrinsicName() {
+      std::string intr = "llvm.x86.vcvtneobf162ps";
+      VectorType vecType = getDst().getType();
+      unsigned elemBitWidth = vecType.getElementTypeBitWidth();
+      unsigned opBitWidth = vecType.getShape()[0] * elemBitWidth;
+      intr += std::to_string(opBitWidth);
+      return intr;
+    }
+  }];
+}
+
+//----------------------------------------------------------------------------//
+// AVX: Convert BF16 to F32 and broadcast into packed F32
+//----------------------------------------------------------------------------//
+
+def BcstBF16ToPackedF32Op : AVX_Op<"bcst.bf16_to_f32.packed", [Pure,
+  DeclareOpInterfaceMethods<OneToOneIntrinsicOpInterface>]> {
+  let summary = "AVX: Broadcasts BF16 into packed F32 Data.";
+  let description = [{
+    #### From the Intel Intrinsics Guide:
+
+    Convert scalar BF16 (16-bit) floating-point element stored at memory locations
+    starting at location `__A` to a single-precision (32-bit) floating-point,
+    broadcast it to packed single-precision (32-bit) floating-point elements,
+    and store the results in `dst`.
+
+    Example:
+    ```mlir
+    %dst = x86vector.avx.bcst.bf16_to_f32.packed %a : !llvm.ptr -> vector<8xbf16>
+    ```
+  }];
+  let arguments = (ins LLVM_AnyPointer:$a);
+  let results = (outs VectorOfLengthAndType<[4, 8], [F32]>:$dst);
+  let assemblyFormat =
+    "$a  attr-dict`:` type($a)`->` type($dst)";
+
+  let extraClassDefinition = [{
+    std::string $cppClass::getIntrinsicName() {
+      std::string intr = "llvm.x86.vbcstnebf162ps";
+      VectorType vecType = getDst().getType();
+      unsigned elemBitWidth = vecType.getElementTypeBitWidth();
+      unsigned opBitWidth = vecType.getShape()[0] * elemBitWidth;
+      intr += std::to_string(opBitWidth);
+      return intr;
+    }
+  }];
+}
+
 #endif // X86VECTOR_OPS
diff --git a/mlir/include/mlir/Dialect/X86Vector/X86VectorDialect.h b/mlir/include/mlir/Dialect/X86Vector/X86VectorDialect.h
@@ -21,6 +21,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 
 /// Include the generated interface declarations.
 #include "mlir/Dialect/X86Vector/X86VectorInterfaces.h.inc"
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
@@ -115,6 +115,7 @@ void mlir::populateX86VectorLegalizeForLLVMExportPatterns(
 void mlir::configureX86VectorLegalizeForExportTarget(
     LLVMConversionTarget &target) {
   target.addIllegalOp<MaskCompressOp, MaskRndScaleOp, MaskScaleFOp,
-                      Vp2IntersectOp, DotBF16Op, CvtPackedF32ToBF16Op, RsqrtOp,
+                      Vp2IntersectOp, DotBF16Op, CvtPackedF32ToBF16Op, CvtPackedEvenIndexedBF16ToF32Op,
+		      CvtPackedOddIndexedBF16ToF32Op, BcstBF16ToPackedF32Op, RsqrtOp,
                       DotOp>();
 }
diff --git a/mlir/test/Dialect/X86Vector/bcst-avx-bf16-to-f32-packed.mlir b/mlir/test/Dialect/X86Vector/bcst-avx-bf16-to-f32-packed.mlir
@@ -0,0 +1,22 @@
+// REQUIRES: target=x86{{.*}}
+
+// RUN: mlir-opt %s \
+// RUN:   -convert-vector-to-llvm="enable-x86vector" -convert-to-llvm \
+// RUN:   -reconcile-unrealized-casts | \
+// RUN: mlir-translate --mlir-to-llvmir | \
+// RUN: llc -mcpu=sierraforest | \
+// RUN: FileCheck %s
+
+func.func @avxbf16_bcst_bf16_to_f32_packed_128(%arg0: !llvm.ptr) -> vector<4xf32> {
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %arg0 : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+// CHECK-LABEL: avxbf16_bcst_bf16_to_f32_packed_128:
+// CHECK: vbcstnebf162ps{{.*}}%xmm
+
+func.func @avxbf16_bcst_bf16_to_f32_packed_256(%arg0: !llvm.ptr) -> vector<8xf32> {
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %arg0 : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+// CHECK-LABEL: avxbf16_bcst_bf16_to_f32_packed_256:
+// CHECK: vbcstnebf162ps{{.*}}%ymm
diff --git a/mlir/test/Dialect/X86Vector/cvt-packed-avx-bf16-to-f32.mlir b/mlir/test/Dialect/X86Vector/cvt-packed-avx-bf16-to-f32.mlir
@@ -0,0 +1,48 @@
+// REQUIRES: target=x86{{.*}}
+
+// RUN: mlir-opt %s \
+// RUN:   -convert-vector-to-llvm="enable-x86vector" -convert-to-llvm \
+// RUN:   -reconcile-unrealized-casts | \
+// RUN: mlir-translate --mlir-to-llvmir | \
+// RUN: llc -mcpu=sierraforest | \
+// RUN: FileCheck %s
+
+func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128(%arg0: memref<8xbf16>) -> vector<4xf32> {
+  %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<8xbf16> -> index
+  %0 = arith.index_cast %intptr : index to i32
+  %1 = llvm.inttoptr %0 : i32 to !llvm.ptr
+  %2 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %1 : !llvm.ptr -> vector<4xf32>
+  return %2 : vector<4xf32>
+}
+// CHECK-LABEL: avxbf16_cvt_packed_even_indexed_bf16_to_f32_128:
+// CHECK: vcvtneebf162ps{{.*}}%xmm
+
+func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256(%arg0: memref<16xbf16>) -> vector<8xf32> {
+  %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<16xbf16> -> index
+  %0 = arith.index_cast %intptr : index to i32
+  %1 = llvm.inttoptr %0 : i32 to !llvm.ptr
+  %2 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %1 : !llvm.ptr -> vector<8xf32>
+  return %2 : vector<8xf32>
+}
+// CHECK-LABEL: avxbf16_cvt_packed_even_indexed_bf16_to_f32_256:
+// CHECK: vcvtneebf162ps{{.*}}%ymm
+
+func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128(%arg0: memref<8xbf16>) -> vector<4xf32> {
+  %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<8xbf16> -> index
+  %0 = arith.index_cast %intptr : index to i32
+  %1 = llvm.inttoptr %0 : i32 to !llvm.ptr
+  %2 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %1 : !llvm.ptr -> vector<4xf32>
+  return %2 : vector<4xf32>
+}
+// CHECK-LABEL: avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128:
+// CHECK: vcvtneobf162ps{{.*}}%xmm
+
+func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256(%arg0: memref<16xbf16>) -> vector<8xf32> {
+  %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<16xbf16> -> index
+  %0 = arith.index_cast %intptr : index to i32
+  %1 = llvm.inttoptr %0 : i32 to !llvm.ptr
+  %2 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %1 : !llvm.ptr -> vector<8xf32>
+  return %2 : vector<8xf32>
+}
+// CHECK-LABEL: avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256:
+// CHECK: vcvtneobf162ps{{.*}}%ymm
diff --git a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir
@@ -95,6 +95,60 @@ func.func @avx512bf16_cvt_packed_f32_to_bf16_512(
   return %0 : vector<16xbf16>
 }
 
+// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128
+func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneebf162ps128"
+  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256
+func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneebf162ps256"
+  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128
+func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneobf162ps128"
+  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256
+func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneobf162ps256"
+  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_bsct_bf16_to_f32_packed_128
+func.func @avxbf16_bsct_bf16_to_f32_packed_128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: llvm.call_intrinsic "llvm.x86.vbcstnebf162ps128"
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_bsct_bf16_to_f32_packed_256
+func.func @avxbf16_bsct_bf16_to_f32_packed_256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: llvm.call_intrinsic "llvm.x86.vbcstnebf162ps256"
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
 // CHECK-LABEL: func @avx_rsqrt
 func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>)
 {
diff --git a/mlir/test/Dialect/X86Vector/roundtrip.mlir b/mlir/test/Dialect/X86Vector/roundtrip.mlir
@@ -94,6 +94,66 @@ func.func @avx512bf16_cvt_packed_f32_to_bf16_512(
   return %0 : vector<16xbf16>
 }
 
+// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128
+func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 {{.*}} :
+  // CHECK-SAME: !llvm.ptr -> vector<4xf32>
+  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256
+func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 {{.*}} :
+  // CHECK-SAME: !llvm.ptr -> vector<8xf32>
+  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128
+func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 {{.*}} :
+  // CHECK-SAME: !llvm.ptr -> vector<4xf32>
+  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256
+func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 {{.*}} :
+  // CHECK-SAME: !llvm.ptr -> vector<8xf32>
+  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_bcst_bf16_to_f32_128
+func.func @avxbf16_bcst_bf16_to_f32_128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: x86vector.avx.bcst.bf16_to_f32.packed {{.*}} :
+  // CHECK-SAME: !llvm.ptr -> vector<4xf32>
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: func @avxbf16_bcst_bf16_to_f32_256
+func.func @avxbf16_bcst_bf16_to_f32_256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: x86vector.avx.bcst.bf16_to_f32.packed {{.*}} :
+  // CHECK-SAME: !llvm.ptr -> vector<8xf32>
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
 // CHECK-LABEL: func @avx_rsqrt
 func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>)
 {
diff --git a/mlir/test/Target/LLVMIR/x86vector.mlir b/mlir/test/Target/LLVMIR/x86vector.mlir
@@ -109,6 +109,60 @@ func.func @LLVM_x86_avx512bf16_cvtneps2bf16_512(
   return %0 : vector<16xbf16>
 }
 
+// CHECK-LABEL: define <4 x float> @LLVM_x86_avxbf16_vcvtneebf162ps128
+func.func @LLVM_x86_avxbf16_vcvtneebf162ps128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: call <4 x float> @llvm.x86.vcvtneebf162ps128(
+  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: define <8 x float> @LLVM_x86_avxbf16_vcvtneebf162ps256
+func.func @LLVM_x86_avxbf16_vcvtneebf162ps256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: call <8 x float> @llvm.x86.vcvtneebf162ps256(
+  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: define <4 x float> @LLVM_x86_avxbf16_vcvtneobf162ps128
+func.func @LLVM_x86_avxbf16_vcvtneobf162ps128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: call <4 x float> @llvm.x86.vcvtneobf162ps128(
+  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: define <8 x float> @LLVM_x86_avxbf16_vcvtneobf162ps256
+func.func @LLVM_x86_avxbf16_vcvtneobf162ps256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: call <8 x float> @llvm.x86.vcvtneobf162ps256(
+  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: define <4 x float> @LLVM_x86_avxbf16_vbcstnebf162ps128
+func.func @LLVM_x86_avxbf16_vbcstnebf162ps128(
+  %a: !llvm.ptr) -> vector<4xf32>
+{
+  // CHECK: call <4 x float> @llvm.x86.vbcstnebf162ps128(
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : !llvm.ptr -> vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: define <8 x float> @LLVM_x86_avxbf16_vbcstnebf162ps256
+func.func @LLVM_x86_avxbf16_vbcstnebf162ps256(
+  %a: !llvm.ptr) -> vector<8xf32>
+{
+  // CHECK: call <8 x float> @llvm.x86.vbcstnebf162ps256(
+  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : !llvm.ptr -> vector<8xf32>
+  return %0 : vector<8xf32>
+}
+
 // CHECK-LABEL: define <8 x float> @LLVM_x86_avx_rsqrt_ps_256
 func.func @LLVM_x86_avx_rsqrt_ps_256(%a: vector <8xf32>) -> vector<8xf32>
 {

Original file line number	Diff line number	Diff line change
`@@ -115,6 +115,7 @@ void mlir::populateX86VectorLegalizeForLLVMExportPatterns(`
`115`	`115`	`void mlir::configureX86VectorLegalizeForExportTarget(`
`116`	`116`	`LLVMConversionTarget &target) {`
`117`	`117`	`target.addIllegalOp<MaskCompressOp, MaskRndScaleOp, MaskScaleFOp,`
`118`		`- Vp2IntersectOp, DotBF16Op, CvtPackedF32ToBF16Op, RsqrtOp,`
	`118`	`+ Vp2IntersectOp, DotBF16Op, CvtPackedF32ToBF16Op, CvtPackedEvenIndexedBF16ToF32Op,`
	`119`	`+ CvtPackedOddIndexedBF16ToF32Op, BcstBF16ToPackedF32Op, RsqrtOp,`
`119`	`120`	`DotOp>();`
`120`	`121`	`}`