
Commit fc178c2

Support lowering of mixed precision vector.contract to AMX. (#1070)
This patch supports mixed-precision lowering of vector.contract with bf16/i8 inputs and fp32/i32 output types, respectively, while also refactoring the code for readability and modularity. It also prepares the ground for other possible mixed-precision operations. In addition, the 'REQUIRES' directive was observed not to work due to a lit config gap; this patch fixes that issue as well.
1 parent 68ad15b commit fc178c2
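
For context, here is a reference-only sketch (plain C++, not code from this patch) of the scalar semantics being enabled: inputs stay in the narrow type and accumulation happens in the wider type, i.e. bf16 x bf16 accumulated in f32 and i8 x i8 accumulated in i32. The helper names below are made up for illustration.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// bf16 is the upper 16 bits of an IEEE-754 f32, so widening is a bit shift.
static float bf16ToF32(uint16_t bits) {
  uint32_t wide = static_cast<uint32_t>(bits) << 16;
  float f;
  std::memcpy(&f, &wide, sizeof(f));
  return f;
}

// bf16 dot product: widen both operands to f32, multiply, accumulate in f32.
// Assumes a.size() == b.size().
static float dotBf16(const std::vector<uint16_t> &a,
                     const std::vector<uint16_t> &b) {
  float acc = 0.0f;
  for (std::size_t k = 0; k < a.size(); ++k)
    acc += bf16ToF32(a[k]) * bf16ToF32(b[k]);
  return acc;
}

// i8 dot product: sign-extend both operands to i32, multiply, accumulate in i32.
static int32_t dotI8(const std::vector<int8_t> &a,
                     const std::vector<int8_t> &b) {
  int32_t acc = 0;
  for (std::size_t k = 0; k < a.size(); ++k)
    acc += static_cast<int32_t>(a[k]) * static_cast<int32_t>(b[k]);
  return acc;
}

This widen-multiply-accumulate shape is what the AMX tile instructions (e.g. TDPBF16PS for bf16, TDPBSSD for signed i8) compute on 2D tiles, which is why contracts of this form can be lowered to AMX directly.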

File tree

6 files changed: +331 −154 lines

lib/TPP/PassBundles/VectorToKernel.cpp

Lines changed: 2 additions & 7 deletions
@@ -52,17 +52,12 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase<VectorToKernel>,
 
 private:
   void constructPipeline() override {
-    // TODO: Pass ordering based on target architecture starting from AMX ->
-    // avx512 -> avx2 to subset needs to be improved by updating the `k`
-    // tile size check for AMX lowering. With k = 1 (or vnni size) AMX fails
-    // lowering to micro-kernels on EMR. Bf16DotProduct tests with k = 1
-    // and those tests gets lowered by AMX pass on EMR machine.
     pm.addNestedPass<func::FuncOp>(createHoistVectorTransfers());
+    if (vnni::utils::hasAMX())
+      pm.addNestedPass<func::FuncOp>(createVectorContractToAMX());
     MicroKernelsOptions options;
     options.targetFeature = vecBundleCpuTargetFeature;
     pm.addNestedPass<func::FuncOp>(createMicroKernels(options));
-    if (vnni::utils::hasAMX())
-      pm.addNestedPass<func::FuncOp>(createVectorContractToAMX());
     pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   }
 };

lib/TPP/Transforms/TransformUtils.cpp

Lines changed: 7 additions & 3 deletions
@@ -268,9 +268,13 @@ isContraction(linalg::LinalgOp linalgOp)
           .operation(NumDpsInits(EqualsTo(1)))
           .operation(NumDpsInputs(EqualsTo(2)))
           .operation(NumAffineMaps(EqualsTo(3)))
-          .region(MatchOne(0),
-                  WithOpChain<arith::MulFOp,
-                              arith::AddFOp>(/*captures=*/nullptr));
+          .region(MatchOne(0), [&](Region *region, Operation *op) {
+            return WithOpChain<KindMul, KindAdd>(/*captures=*/nullptr)(region, op) ||
+                   WithOpChain<arith::ExtFOp,
+                               arith::ExtFOp, KindMul, KindAdd>(nullptr)(region, op) ||
+                   WithOpChain<arith::ExtSIOp,
+                               arith::ExtSIOp, KindMul, KindAdd>(nullptr)(region, op);
+          });
   // clang-format on
   if (!maybeContraction.match(linalgOp))
     return failure();
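
The lambda above ORs together three accepted op chains: the original same-precision multiply-add, plus two widened variants where both inputs first go through arith::ExtFOp (bf16) or arith::ExtSIOp (i8); KindMul/KindAdd presumably cover both the float and integer multiply/add ops. As a rough illustration (hypothetical names, plain C++, not code from the patch), the matched element-wise bodies correspond to:

#include <cstdint>

// 1) mul -> add on matching element types (the only chain matched before).
static float bodyF32(float a, float b, float acc) { return acc + a * b; }

// 2) extf -> extf -> mul -> add: both bf16 inputs widened to f32 first.
//    `widenBf16` stands in for arith::ExtFOp; its definition is elided here.
static float bodyBf16(uint16_t a, uint16_t b, float acc,
                      float (*widenBf16)(uint16_t)) {
  return acc + widenBf16(a) * widenBf16(b);
}

// 3) extsi -> extsi -> mul -> add: both i8 inputs sign-extended to i32 first.
static int32_t bodyI8(int8_t a, int8_t b, int32_t acc) {
  return acc + static_cast<int32_t>(a) * static_cast<int32_t>(b);
}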

lib/TPP/Transforms/Utils/VNNIUtils.cpp

Lines changed: 7 additions & 3 deletions
@@ -57,14 +57,17 @@ unsigned getVnniBlockingFactor(Type type, Operation *op) {
   unsigned blockingFactor = 0;
 
   auto elementType = getElementTypeOrSelf(type);
-  if (elementType.isBF16()) {
+  if (elementType.isBF16() || elementType.isInteger(8)) {
     // Check if a VNNI factor hint is associated to the IR via DLTI.
     auto vnniValue = dlti::utils::query(op, {"CPU", "vnni"});
     if (succeeded(vnniValue)) {
       if (auto intAttr = llvm::dyn_cast<IntegerAttr>(*vnniValue))
         blockingFactor = intAttr.getInt();
     } else {
-      blockingFactor = libxsmm_cpuid_dot_pack_factor(LIBXSMM_DATATYPE_BF16);
+      blockingFactor =
+          elementType.isBF16()
+              ? libxsmm_cpuid_dot_pack_factor(LIBXSMM_DATATYPE_BF16)
+              : libxsmm_cpuid_dot_pack_factor(LIBXSMM_DATATYPE_I8);
     }
   }
 

@@ -177,7 +180,8 @@ bool isInVnniLayout(VnniOperandRank expectedRank, ShapedType shape,
 
 bool isInVnniLayout(int64_t expectedRank, ShapedType shape,
                     std::optional<unsigned> blockingFactor) {
-  if (shape.getRank() != expectedRank || !shape.getElementType().isBF16())
+  if (shape.getRank() != expectedRank ||
+      !(shape.getElementType().isBF16() || shape.getElementType().isInteger(8)))
     return false;
 
   auto vnniDim = shape.getShape().back();
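
For context on the VNNI checks above: a VNNI-packed operand keeps the reduction dimension blocked by the dot-product pack factor, which libxsmm typically reports as 2 for bf16 and 4 for i8 on VNNI/AMX-capable CPUs. The sketch below is an illustration only (not part of the patch, and it assumes K is divisible by the factor) of the [K/vnni, N, vnni] packing whose innermost dimension the layout check inspects via shape.getShape().back().

#include <cstdint>
#include <vector>

// Pack a row-major K x N operand into [K / vnni][N][vnni] so that `vnni`
// consecutive elements of the reduction dimension become contiguous.
template <typename T>
std::vector<T> packVnni(const std::vector<T> &mat, int64_t K, int64_t N,
                        int64_t vnni) {
  std::vector<T> packed(mat.size()); // mat.size() == K * N by assumption
  for (int64_t k = 0; k < K; ++k)
    for (int64_t n = 0; n < N; ++n)
      // Element (k, n) lands in block k / vnni, column n, slot k % vnni.
      packed[((k / vnni) * N + n) * vnni + (k % vnni)] = mat[k * N + n];
  return packed;
}

Packed this way, a bf16 operand has a trailing dimension of 2 and an i8 operand a trailing dimension of 4; the updated checks now admit both element types.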
