Skip to content

Commit 92e9426

Browse files
[RELAND] Simpler codegen for linear layouts (#4554)
The upstream commit simplifies linear layout codegen for the case when the output of the linear layout is 1 dimensional (post-folding). The generic algorithm loops over the bits in the requested index for each input dimension and selects the corresponding basis value for each non-zero bit. This requires checking to see if the bit is zero (eq) and then grabbing the basis value if the bit is nonzero (select). The simplified code flattens the inputs into one dimension which allows us to evaluate the linear layout using a simple linear function `L(a) = Ba` where `a` is input index and `B` is the matrix of basis vectors from our flattened layout which is what the `matrixVectorProd` code is doing. Anyway, the end result of all this is the processing of the thread ID (the lane ID and warp ID are held constant) changes from a select to make sure the thread ID bit value is non-zero to a series of xors and shifts. However, the number of xors for the print, and even the constants used, are identical. So, the nested layout encoding propagation is still working correctly. When updating the lit test, which I believe is designed to make sure the layout nesting is followed properly, I chose to drop the pre-amble evaluation of the linear layout and keep only the last two xors. Linear layout evaluation is tested basically everywhere (including functional correctness in `test_reduce_layouts`). Here, we focus on making sure the printfs were generated properly based on the nested layouts. This should result in less maintenance burden going forward as I suspect the linear layout evaluation code will be tweaked again in the future. close #4551
2 parents 7206598 + 02ddba3 commit 92e9426

File tree

4 files changed

+85
-24
lines changed

4 files changed

+85
-24
lines changed

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,53 @@ LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
9595
StringAttr::get(op->getContext(), libpath));
9696
return ret;
9797
}
98+
99+
/// Emit IR evaluating the linear layout `A` at `x`, i.e. the GF(2)
/// matrix-vector product L(x) = B*x, where B is the matrix whose columns are
/// A's basis vectors. `A` must already be flattened to exactly one input and
/// one output dimension.
///
/// Instead of selecting a basis value per input bit, we walk the diagonals of
/// B and emit one masked shift per non-empty diagonal, generating code of the
/// form:
///   \xor_i (x & mask_i) << s_i
/// where s_i may be positive or negative (left or right shift). The hope here
/// (and we see it in codegen) is that LLVM can turn the xor into a sum and
/// then the sum + LHS/RHS can be fused into a mad.lo.
Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
  assert(A.getNumInDims() == 1);
  assert(A.getNumOutDims() == 1);

  const auto nCol = A.getTotalInDimSizeLog2();
  const auto nRow = A.getTotalOutDimSizeLog2();

  // With a single output dimension, each basis vector has exactly one
  // component; gather them as the columns of B.
  SmallVector<int32_t> cols;
  for (const auto &basis : A.getBases().begin()->second)
    cols.push_back(basis[0]);
  assert(cols.size() == nCol);

  // Mask of the input bits lying on the d-th diagonal of B (d >= 0 is on or
  // above the main diagonal, d < 0 below it). Bit `col` of the mask is set
  // iff B[col - d][col] == 1.
  auto diagonalMask = [&](int d) {
    uint32_t mask = 0;
    for (int row = d < 0 ? -d : 0, col = d < 0 ? 0 : d;
         row < nRow && col < nCol; ++row, ++col) {
      uint32_t bitValue = (cols[col] >> row) & 1u;
      mask |= bitValue << col;
    }
    return mask;
  };

  // Accumulate one masked-shift term per non-empty diagonal. A diagonal at
  // offset d moves input bit `col` to output bit `col - d`, hence the right
  // shift for d >= 0 and the left shift for d < 0.
  Value acc = b.i32_val(0);
  for (int d = -nRow + 1; d < nCol; ++d) {
    uint32_t mask = diagonalMask(d);
    if (mask == 0)
      continue;
    Value masked = b.and_(x, b.i32_val(mask));
    Value term = d >= 0 ? Value(b.lshr(masked, b.i32_val(d)))
                        : Value(b.shl(masked, b.i32_val(-d)));
    acc = b.xor_(acc, term);
  }
  return acc;
}
144+
98145
} // namespace triton::gpu
99146

100147
SmallVector<std::pair<StringAttr, Value>>
@@ -115,12 +162,14 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
115162

116163
// Manually constant-fold the layout where possible.
117164
SmallVector<std::pair<StringAttr, int32_t>> constantIns;
165+
SmallVector<std::pair<StringAttr, Value>> nonConstantIns;
118166
for (auto [inDimName, idx] : indices) {
119167
if (auto constant = idx.getDefiningOp<LLVM::ConstantOp>()) {
120168
constantIns.push_back(
121169
{inDimName, cast<IntegerAttr>(constant.getValue()).getInt()});
122170
} else {
123171
constantIns.push_back({inDimName, 0});
172+
nonConstantIns.push_back({inDimName, idx});
124173
}
125174
}
126175
SmallVector<int32_t> constantComponent =
@@ -134,6 +183,24 @@ applyLinearLayout(Location loc, RewriterBase &rewriter,
134183
else
135184
outIndices.push_back({outDimName, b.i32_val(constantComponent[i])});
136185
}
186+
// Happy path: Only one output.
187+
if (outIndices.size() == 1) {
188+
SmallVector<StringAttr> inDimNames;
189+
// Concatenate input
190+
Value x = b.i32_val(0);
191+
int shift = 0;
192+
for (auto [inDimName, idx] : nonConstantIns) {
193+
inDimNames.push_back(inDimName);
194+
x = b.or_(x, b.shl(idx, b.i32_val(shift)));
195+
shift += layout.getInDimSizeLog2(inDimName);
196+
}
197+
// Flatten ins
198+
auto matrix = layout.sublayout(inDimNames, outIndices[0].first);
199+
matrix = matrix.flattenIns();
200+
auto out = triton::gpu::matrixVectorProd(b, matrix, x);
201+
outIndices[0].second = b.xor_(outIndices[0].second, out);
202+
return outIndices;
203+
}
137204

138205
for (auto [inDimName, idx] : indices) {
139206
if (idx.getDefiningOp<LLVM::ConstantOp>()) {

test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
179179
// CHECK-DAG: %[[CST_5:.*]] = llvm.mlir.constant(5 : i32) : i32
180180
// CHECK-DAG: %[[CST_6:.*]] = llvm.mlir.constant(6 : i32) : i32
181181
// CHECK-DAG: %[[CST_7:.*]] = llvm.mlir.constant(7 : i32) : i32
182-
// CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
183182
// CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
184183
// CHECK-DAG: %[[CST_17:.*]] = llvm.mlir.constant(17 : i32) : i32
185184
// CHECK-DAG: %[[CST_18:.*]] = llvm.mlir.constant(18 : i32) : i32
@@ -188,29 +187,24 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
188187
// CHECK-DAG: %[[CST_21:.*]] = llvm.mlir.constant(21 : i32) : i32
189188
// CHECK-DAG: %[[CST_22:.*]] = llvm.mlir.constant(22 : i32) : i32
190189
// CHECK-DAG: %[[CST_23:.*]] = llvm.mlir.constant(23 : i32) : i32
191-
// CHECK: %[[THREADS_ID:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]])
192-
// CHECK: %[[THREADS_ID_32:.*]] = llvm.trunc %[[THREADS_ID]] : i64 to i32
193-
// CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREADS_ID_32]], %[[CST_16]] : i32
194-
// CHECK: %[[VAL_26:.*]] = llvm.and %[[WARP_ID]], %[[CST_2]] : i32
195-
// CHECK: %[[VAL_27:.*]] = llvm.icmp "eq" %[[VAL_26]], %[[CST_0]] : i32
196-
// CHECK: %[[VAL_28:.*]] = llvm.select %[[VAL_27]], %[[CST_0]], %[[CST_8]] : i1, i32
197-
// CHECK: %[[VAL_29:.*]] = llvm.xor %[[CST_0]], %[[VAL_28]] : i32
198-
// CHECK: %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_29]], %[[CST_0]] : i32
199-
// CHECK: %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_29]], %[[CST_1]] : i32
200-
// CHECK: %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_29]], %[[CST_2]] : i32
201-
// CHECK: %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_29]], %[[CST_3]] : i32
202-
// CHECK: %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_29]], %[[CST_4]] : i32
203-
// CHECK: %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_29]], %[[CST_5]] : i32
204-
// CHECK: %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_29]], %[[CST_6]] : i32
205-
// CHECK: %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_29]], %[[CST_7]] : i32
206-
// CHECK: %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_29]], %[[CST_16]] : i32
207-
// CHECK: %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_29]], %[[CST_17]] : i32
208-
// CHECK: %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_29]], %[[CST_18]] : i32
209-
// CHECK: %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_29]], %[[CST_19]] : i32
210-
// CHECK: %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_29]], %[[CST_20]] : i32
211-
// CHECK: %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_29]], %[[CST_21]] : i32
212-
// CHECK: %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_29]], %[[CST_22]] : i32
213-
// CHECK: %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_29]], %[[CST_23]] : i32
190+
// CHECK: %[[VAL_34:.*]] = llvm.xor {{.*}} : i32
191+
// CHECK: %[[VAL_35:.*]] = llvm.xor %[[CST_0]], %[[VAL_34]] : i32
192+
// CHECK: %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_35]], %[[CST_0]] : i32
193+
// CHECK: %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_35]], %[[CST_1]] : i32
194+
// CHECK: %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_35]], %[[CST_2]] : i32
195+
// CHECK: %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_35]], %[[CST_3]] : i32
196+
// CHECK: %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_35]], %[[CST_4]] : i32
197+
// CHECK: %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_35]], %[[CST_5]] : i32
198+
// CHECK: %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_35]], %[[CST_6]] : i32
199+
// CHECK: %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_35]], %[[CST_7]] : i32
200+
// CHECK: %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_35]], %[[CST_16]] : i32
201+
// CHECK: %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_35]], %[[CST_17]] : i32
202+
// CHECK: %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_35]], %[[CST_18]] : i32
203+
// CHECK: %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_35]], %[[CST_19]] : i32
204+
// CHECK: %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_35]], %[[CST_20]] : i32
205+
// CHECK: %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_35]], %[[CST_21]] : i32
206+
// CHECK: %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_35]], %[[CST_22]] : i32
207+
// CHECK: %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_35]], %[[CST_23]] : i32
214208
// CHECK: %[[VAL_56:.*]] = llvm.call spir_funccc @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], {{.*}}, {{.*}})
215209
// CHECK: %[[VAL_57:.*]] = llvm.call spir_funccc @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_1]], {{.*}}, {{.*}})
216210
// CHECK: %[[VAL_58:.*]] = llvm.call spir_funccc @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_2]], {{.*}}, {{.*}})

0 commit comments

Comments
 (0)