Commit a9e087d

[μKernels]: lowering support for i8 type on ARL (#1073)
This patch adds lowering support for the `i8` type via `Int8DotOp` (`VPDPBSSD`) on `ARL`-class machines.
1 parent ea2e7f0 commit a9e087d
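
For context, `VPDPBSSD` (surfaced here as the `x86vector` dialect's `Int8DotOp`) multiplies groups of four signed bytes from each source and accumulates the 32-bit sums into the destination lanes, which is why the accumulator tile for `i8` switches to `i32` below. A minimal scalar sketch of the per-lane semantics (illustrative only; `lanes` and the buffer layout are assumptions, not the pass's actual tile sizes):

```cpp
#include <cstdint>

// Scalar model of one VPDPBSSD-style update: per 32-bit lane, four signed
// i8 products are summed and added into the i32 accumulator.
void dotInt8(int32_t *acc, const int8_t *a, const int8_t *b, int lanes) {
  for (int i = 0; i < lanes; ++i) {
    int32_t sum = 0;
    for (int j = 0; j < 4; ++j)
      sum += static_cast<int32_t>(a[4 * i + j]) *
             static_cast<int32_t>(b[4 * i + j]);
    acc[i] += sum;
  }
}
```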

10 files changed (+277, -47 lines changed)

lib/TPP/Transforms/VectorContractToMicroKernels.cpp

Lines changed: 94 additions & 47 deletions
@@ -60,7 +60,8 @@ static bool isTransposedMatrix(vector::ContractionOp contractOp,
   AffineMap mapB = contractMaps[1];
 
   bool isF32 = elementType.isF32();
-  bool isF16_BF16 = (elementType.isF16() || elementType.isBF16());
+  bool isPackedType = (elementType.isF16() || elementType.isBF16() ||
+                       elementType.isSignlessInteger(8));
 
   auto resultsMapA = mapA.getNumResults();
   auto resultsMapB = mapB.getNumResults();
@@ -70,7 +71,7 @@ static bool isTransposedMatrix(vector::ContractionOp contractOp,
            "Result dim map for A and B should be 3");
   }
 
-  if (isF16_BF16) {
+  if (isPackedType) {
     assert(resultsMapA == 4 && resultsMapB == 4 &&
            "Result dim map for A and B should be 4");
   }
@@ -83,7 +84,7 @@ static bool isTransposedMatrix(vector::ContractionOp contractOp,
            "Input dim map for A and B should be 4");
   }
 
-  if (isF16_BF16) {
+  if (isPackedType) {
     assert(inputsMapA == 5 && inputsMapB == 5 &&
            "Input dim map for A and B should be 5");
   }
@@ -95,7 +96,7 @@ static bool isTransposedMatrix(vector::ContractionOp contractOp,
     auto affineExpr =
         dyn_cast<AffineDimExpr>(mlir::getAffineDimExpr(i, mapA.getContext()));
 
-    if (isF16_BF16) {
+    if (isPackedType) {
       auto vnniDim = dyn_cast<AffineDimExpr>(mapA.getResult(3));
       if (affineExpr != vnniDim && affineExpr != dimBR)
         listMxNxK.push_back(affineExpr);
@@ -129,7 +130,8 @@ static bool permutationCheck(vector::ContractionOp contractOp,
   AffineMap mapB = contractMaps[1];
 
   bool isF32 = elementType.isF32();
-  bool isF16_BF16 = (elementType.isF16() || elementType.isBF16());
+  bool isPackedType = (elementType.isF16() || elementType.isBF16() ||
+                       elementType.isSignlessInteger(8));
 
   auto inputsMapA = mapA.getNumInputs();
   SmallVector<AffineDimExpr> inputDims;
@@ -148,7 +150,7 @@ static bool permutationCheck(vector::ContractionOp contractOp,
     outputDimsA.push_back(affineExpr);
   }
 
-  if (isF16_BF16) {
+  if (isPackedType) {
     // We match the pattern {Batch-reduction, vnni, M, N, K} or
     // {Batch-reduction, M, N, K, vnni} -> {Batch-reduction, M, K, vnni}
     auto c1 = inputDims[0] == outputDimsA[0];
@@ -178,7 +180,7 @@ static bool permutationCheck(vector::ContractionOp contractOp,
     outputDimsB.push_back(affineExpr);
   }
 
-  if (isF16_BF16) {
+  if (isPackedType) {
     // We match the pattern {Batch-reduction, vnni, M, N, K} or
     // {Batch-reduction, M, N, K, vnni} -> {Batch-reduction, K, N, vnni}
     auto c4 = inputDims[0] == outputDimsB[0];
@@ -290,16 +292,20 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
     bool isF32 = elementType.isF32();
     bool isF16 = elementType.isF16();
     bool isBF16 = elementType.isBF16();
+    bool isI8 = elementType.isSignlessInteger(8);
 
-    if (!(isF32 || isF16 || isBF16))
-      return rewriter.notifyMatchFailure(contractOp,
-                                         "The type is not F32 or F16 or BF16");
+    bool isPackedType = isF16 || isBF16 || isI8;
+    int64_t vnniFactor = (isBF16 || isF16) ? 2 : isI8 ? 4 : 0;
+
+    if (!(isF32 || isPackedType))
+      return rewriter.notifyMatchFailure(
+          contractOp, "The type is not F32 or F16 or BF16 or I8");
 
     bool bf16dp = false;
     bool srf = false;
     bool fallback = false;
 
-    if (isBF16 || isF16) {
+    if (isPackedType) {
      auto cpuName = vnni::utils::getTargetArchName();
      if (cpuName == "SRF")
        srf = true;
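
The `vnniFactor` introduced in this hunk records how many reduction elements are packed into the innermost (VNNI) dimension: 2 for f16/bf16 and 4 for i8. A rough sketch of that relayout for a B operand, assuming the usual `[K][N] -> [K/vnniFactor][N][vnniFactor]` packing (names and the divisibility assumption are illustrative, not the pass's own packing code):

```cpp
#include <cstdint>
#include <vector>

// Pack a row-major KxN i8 matrix into VNNI-4 layout: K is split into groups
// of 4 that become the innermost dimension, so one VPDPBSSD consumes a full
// group per 32-bit accumulator lane. Assumes K is a multiple of 4.
std::vector<int8_t> packVnni4(const std::vector<int8_t> &b, int K, int N) {
  const int vnniFactor = 4; // would be 2 for bf16/f16
  std::vector<int8_t> packed(static_cast<size_t>(K) * N);
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      packed[((k / vnniFactor) * N + n) * vnniFactor + (k % vnniFactor)] =
          b[k * N + n];
  return packed;
}
```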
@@ -311,9 +317,9 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
        fallback = true;
     }
 
-    if (isF16 && !(srf))
+    if ((isF16 || isI8) && !(srf))
       return rewriter.notifyMatchFailure(
-          contractOp, "F16 type is supported only for SRF kind of machines");
+          contractOp, "F16/I8 type is supported only for SRF kind of machines");
 
     // Check the operation type MatMul, B-MatMul, or BR-MatMul
     SmallVector<vector::IteratorType> contractIteratorTypes =
@@ -328,7 +334,7 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
       return rewriter.notifyMatchFailure(
           contractOp, "Batch matmul operation not supported yet");
 
-    if (isBF16 || isF16) {
+    if (isPackedType) {
      if (reductionCount == 2)
        return rewriter.notifyMatchFailure(
            contractOp, "Batch reduce matmul operation without vnni layout");
@@ -360,14 +366,11 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
     int64_t K = 0;
     int64_t vnni = 0;
 
-    if (isBF16 || isF16) {
+    if (isPackedType) {
       M = lhsType.getDimSize(lhsType.getRank() - 3);
       N = rhsType.getDimSize(lhsType.getRank() - 2);
       K = lhsType.getDimSize(lhsType.getRank() - 2);
       vnni = lhsType.getDimSize(lhsType.getRank() - 1);
-      if (K != (vnni / 2))
-        return rewriter.notifyMatchFailure(
-            contractOp, "K tile size should be equal to VNNI layout");
 
       // TODO: We need the N tile size to be divisible by 16 for avx2
       // fallback case. So that it ensures, LLVM find a pattern and lowers to
@@ -376,9 +379,17 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
        return rewriter.notifyMatchFailure(
            contractOp, "N tile size divisible by 16 are only supported");
 
-      if (vnni != 2)
+      if (vnni != 2 && isBF16)
+        return rewriter.notifyMatchFailure(
+            contractOp, "Only VNNI layout=2 is supported for bf16, now");
+
+      if (vnni != 4 && isI8)
         return rewriter.notifyMatchFailure(
-            contractOp, "Only VNNI layout=2 is supported, now");
+            contractOp, "Only VNNI layout=4 is supported for i8, now");
+
+      if (K != (vnni / vnniFactor))
+        return rewriter.notifyMatchFailure(
+            contractOp, "K tile size should be equal to VNNI layout");
     }
 
     if (isF32) {
@@ -412,8 +423,8 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
     // matrix then broadcast A ony-by-one + FMA.
     // If N > M: perform opposite. Broadcast A matrix then load B one-by-
     // one + FMA.
-    // Following this kind of lowering, we reduce the register loads by 
-    // stacking the less B loads or less A broadcasts and do the larger B 
+    // Following this kind of lowering, we reduce the register loads by
+    // stacking the less B loads or less A broadcasts and do the larger B
     // loads or A broadcast in a LIFO manner. Finally, it helps in reducing
     // the probablity of register spills.
     bool mDriven = true;
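
The comment above captures the register-blocking heuristic: the operand that needs fewer vector registers is hoisted and kept live, the other is streamed and combined one step at a time, and the live values are consumed in LIFO order to limit spills. A scalar sketch of the M-driven order (illustrative names; the real pass emits vector loads, broadcasts, and dot/FMA ops, and `acc` is indexed as `i + j * M` exactly like the iteration arguments below):

```cpp
#include <vector>

// Scalar model of the "M-driven" step for one k: the N/sizeFactor B tiles
// are loaded once and kept live, then every A element is broadcast and
// combined with each live B tile.
void mDrivenStep(std::vector<float> &acc, const std::vector<float> &aCol,
                 const std::vector<float> &bRow, int M, int N,
                 int sizeFactor) {
  int nTiles = N / sizeFactor;
  for (int i = 0; i < M; ++i) {      // many A broadcasts, streamed
    float aBcast = aCol[i];
    for (int j = 0; j < nTiles; ++j) // few B tiles, reused from registers
      for (int l = 0; l < sizeFactor; ++l)
        acc[(i + j * M) * sizeFactor + l] +=
            aBcast * bRow[j * sizeFactor + l];
  }
}
```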
@@ -491,7 +502,7 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
       }
     }
 
-    if (outsElementType.isF32()) {
+    if (outsElementType.isF32() || outsElementType.isSignlessInteger(32)) {
       for (int j = 0; j < N; j = j + sizeFactor) {
         for (int i = 0; i < M; i++) {
           Value indexOp_A = rewriter.create<arith::ConstantIndexOp>(
@@ -562,12 +573,22 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
           auto i1Mask_2 = rewriter.create<arith::ConstantOp>(
               kForOp.getLoc(), VectorType::get(2, rewriter.getI1Type()),
               boolAttr_2);
-          auto zeroAttr = rewriter.getFloatAttr(elementType, 0.0);
+
+          // ZeroAttr is not needed for i8 type lowering on the ARL machine;
+          // it may be needed in future for lowering on other machines.
+          FloatAttr zeroAttr;
+          if (!isI8) {
+            zeroAttr = rewriter.getFloatAttr(elementType, 0.0);
+          }
 
           // Destination type
           mlir::VectorType dstType =
               mlir::VectorType::get(sizeFactor, rewriter.getF32Type());
 
+          if (isI8)
+            dstType =
+                mlir::VectorType::get(sizeFactor, rewriter.getI32Type());
+
           llvm::SmallVector<OpFoldResult> strides = {
               rewriter.getIndexAttr(1), rewriter.getIndexAttr(1),
               rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
@@ -664,15 +685,16 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
 
           // bf16 type + avx512. uKernel lowering for machines like
           // cpx (zen5) to target avx512bf16dp.
-          if (bf16dp && isBF16) {
+          if (bf16dp || isI8) {
 
            if (mDriven) { // M -> N
              // Load elements of B matrix and store in a DS
              for (int j = 0; j < N; j = j + sizeFactor) {
                Value indexOp_j = rewriter.create<arith::ConstantIndexOp>(
                    reductionForOp.getLoc(), j);
                auto valueRow = rewriterNewKForOp.create<vector::LoadOp>(
-                   kForOp.getLoc(), VectorType::get(32, elementType),
+                   kForOp.getLoc(),
+                   VectorType::get({sizeFactor * vnni}, elementType),
                    rhsClone->getResult(0),
                    ValueRange{indexOp_c0, indexOp_c0, indexOp_j,
                               indexOp_c0});
@@ -700,15 +722,27 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
                auto valuef32 =
                    rewriterNewKForOp.create<vector::BitCastOp>(
                        kForOp.getLoc(),
-                       VectorType::get(32,
-                                       rewriterNewKForOp.getBF16Type()),
+                       VectorType::get({sizeFactor * vnni}, elementType),
                        bcst_i32);
-               for (int j = 0; j < (N / sizeFactor); j++) {
-                 auto dp = rewriter.create<mlir::x86vector::DotBF16Op>(
-                     kForOp.getLoc(), dstType,
-                     iterArgsNewKForOp[i + (j * M)], valuef32,
-                     matf32[j]);
-                 oddFMAs.push_back(dp);
+
+               if (isBF16) {
+                 for (int j = 0; j < (N / sizeFactor); j++) {
+                   auto dp = rewriter.create<mlir::x86vector::DotBF16Op>(
+                       kForOp.getLoc(), dstType,
+                       iterArgsNewKForOp[i + (j * M)], valuef32,
+                       matf32[j]);
+                   oddFMAs.push_back(dp);
+                 }
+               }
+
+               if (isI8) {
+                 for (int j = 0; j < (N / sizeFactor); j++) {
+                   auto dp = rewriter.create<mlir::x86vector::DotInt8Op>(
+                       kForOp.getLoc(), dstType,
+                       iterArgsNewKForOp[i + (j * M)], valuef32,
+                       matf32[j]);
+                   oddFMAs.push_back(dp);
+                 }
                }
              }
 
@@ -743,8 +777,7 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
                auto valuef32 =
                    rewriterNewKForOp.create<vector::BitCastOp>(
                        kForOp.getLoc(),
-                       VectorType::get(32,
-                                       rewriterNewKForOp.getBF16Type()),
+                       VectorType::get({sizeFactor * vnni}, elementType),
                        bcst_i32);
                matf32.push_back(valuef32);
              }
@@ -753,16 +786,30 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
                Value indexOp_j = rewriter.create<arith::ConstantIndexOp>(
                    reductionForOp.getLoc(), j);
                auto valueRow = rewriterNewKForOp.create<vector::LoadOp>(
-                   kForOp.getLoc(), VectorType::get(32, elementType),
+                   kForOp.getLoc(),
+                   VectorType::get({sizeFactor * vnni}, elementType),
                    rhsClone->getResult(0),
                    ValueRange{indexOp_c0, indexOp_c0, indexOp_j,
                               indexOp_c0});
-               for (int i = 0; i < M; i++) {
-                 auto dp = rewriter.create<mlir::x86vector::DotBF16Op>(
-                     kForOp.getLoc(), dstType, iterArgsNewKForOp[k],
-                     matf32[i], valueRow);
-                 k++;
-                 evenFMAs.push_back(dp);
+
+               if (isBF16) {
+                 for (int i = 0; i < M; i++) {
+                   auto dp = rewriter.create<mlir::x86vector::DotBF16Op>(
+                       kForOp.getLoc(), dstType, iterArgsNewKForOp[k],
+                       matf32[i], valueRow);
+                   k++;
+                   evenFMAs.push_back(dp);
+                 }
+               }
+
+               if (isI8) {
+                 for (int i = 0; i < M; i++) {
+                   auto dp = rewriter.create<mlir::x86vector::DotInt8Op>(
+                       kForOp.getLoc(), dstType, iterArgsNewKForOp[k],
+                       matf32[i], valueRow);
+                   k++;
+                   evenFMAs.push_back(dp);
+                 }
                }
              }
            }
@@ -905,7 +952,7 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
          // (b) bf16 fallback + avx2 instructions.
          // TODO: update lowering based on M & N. Now it is
          // default to M -> N
-         if (srf || (fallback && avx2 && !avx512)) {
+         if ((srf && !isI8) || (fallback && avx2 && !avx512)) {
            // Load odd elements of A Matrix and store in a DS
            for (int i = 0; i < M; i++) {
              Value oddA;
@@ -1228,7 +1275,7 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
 
     // get the 2nd input source for addOp via vector transfer read
     // ps: the 1st one is C matrix
-    if (addOp && maxOp && !isF32) {
+    if (addOp && maxOp && !isF32 && !isI8) {
      vector::TransferReadOp readOp_add;
      if (auto vectBcst = addOp.getLhs().getDefiningOp<vector::BroadcastOp>()) {
        if (auto vectorRead =
@@ -1268,7 +1315,7 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
      auto acc_value = newReductionForOp.getResult(k);
      k++;
 
-     if (addOp && maxOp && !isF32) {
+     if (addOp && maxOp && !isF32 && !isI8) {
        Value add_row;
 
        if (global_readOp) {
@@ -1360,7 +1407,7 @@ struct MicroKernelsOp : OpRewritePattern<vector::ContractionOp> {
      }
 
      // We do arith.tuncf for f32 -> bf16 in SRF/ARL/SPR kind of machines
-     if ((srf || bf16dp) && !outsElementType.isF32()) {
+     if ((srf || bf16dp) && !outsElementType.isF32() && !isI8) {
        vec_final = rewriter.create<arith::TruncFOp>(
            reductionForOp.getLoc(), VectorType::get(sizeFactor, type),
            acc_value);

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ set(TPP_OPT_TEST_DEPENDS
   tpp-run
   tpp-sched
   fpcmp
+  check-cpuid
   )
 
 add_lit_testsuite(check-tpp "Running the regression tests"

test/I8/Integration/lit.local.cfg

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+import os
+import subprocess
+
+exec = getattr(config, "cpuid_checker", None)
+
+def is_vpdpbssd_supported():
+    if not exec or not os.path.exists(exec):
+        return False
+    try:
+        result = subprocess.run([exec], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        return result.returncode == 1
+    except Exception as e:
+        return False
+
+def is_arch(target):
+    # Arch detection not working on Windows
+    if sys.platform in ['win32']:
+        return False
+
+    try:
+        cmd = subprocess.Popen(
+            ['uname', '-m'], stdout=subprocess.PIPE)
+    except OSError:
+        return False
+
+    out = cmd.stdout.read().decode('ascii')
+    cmd.wait()
+
+    return target in out
+
+
+# Should skip the machine that has no vpdpbssd instruction support
+if not is_vpdpbssd_supported():
+    config.unsupported = True
+
+# Enable only on x86
+# Other targets may use different VNNI blocking scheme that is not compatible with
+# prepacked shapes in some of the tests
+if not is_arch('x86'):
+    config.unsupported = True
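
For reference, the gate above expects the binary configured as `config.cpuid_checker` to exit with code 1 when `vpdpbssd` is available. A minimal sketch of such a checker, assuming AVX-VNNI-INT8 is reported in CPUID leaf 7, sub-leaf 1, EDX bit 4 (the bit position and the overall approach are assumptions about what `check-cpuid` does, not its actual source):

```cpp
#include <cpuid.h>

// Exit code 1 => vpdpbssd (AVX-VNNI-INT8) assumed supported, 0 => not,
// matching the returncode convention used by lit.local.cfg above.
int main() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
    return 0;            // sub-leaf unavailable: treat as unsupported
  return (edx >> 4) & 1; // assumed AVX-VNNI-INT8 feature bit
}
```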
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+// RUN: tpp-run -e gemm_i8 --entry-point-result=void -print --splat-to-random --init-type normal -seed 123 %s > %t.1
+// RUN: tpp-run -e gemm_i8 --entry-point-result=void --vector-to-kernels --registerBlocking=3,32,4 -print --splat-to-random --init-type normal -seed 123 %s > %t.2
+// RUN: fpcmp -r 0.001 %t.1 %t.2
+
+func.func @gemm_i8(%arg0: memref<2x24x8x4xi8>, %arg1: memref<2x8x128x4xi8>, %arg2: memref<24x128xi32>) -> memref<24x128xi32> {
+  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : memref<2x24x8x4xi8>, memref<2x8x128x4xi8>) outs(%arg2 : memref<24x128xi32>) {
+  ^bb0(%in: i8, %in_1: i8, %out: i32):
+    %0 = arith.extsi %in : i8 to i32
+    %1 = arith.extsi %in_1 : i8 to i32
+    %2 = arith.muli %0, %1 : i32
+    %3 = arith.addi %out, %2 : i32
+    linalg.yield %3 : i32
+  }
+  return %arg2 : memref<24x128xi32>
+}
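
As a cross-check, the `linalg.generic` above is a batch-reduce GEMM over VNNI-4 packed i8 operands with i32 accumulation. A plain C++ restatement of the same index arithmetic (dimension roles follow the affine maps: batch, M, K, N, and the vnni group; this is a reading aid, not something the test runs):

```cpp
#include <cstdint>

// Reference semantics of @gemm_i8: C[24][128] (i32) accumulates products of
// VNNI-4 packed A[2][24][8][4] and B[2][8][128][4] (i8) over the reduction
// dims (batch b, packed K k, vnni v), matching the indexing maps above.
void gemmI8Ref(const int8_t A[2][24][8][4], const int8_t B[2][8][128][4],
               int32_t C[24][128]) {
  for (int b = 0; b < 2; ++b)
    for (int m = 0; m < 24; ++m)
      for (int n = 0; n < 128; ++n)
        for (int k = 0; k < 8; ++k)
          for (int v = 0; v < 4; ++v)
            C[m][n] += static_cast<int32_t>(A[b][m][k][v]) *
                       static_cast<int32_t>(B[b][k][n][v]);
}
```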
