@@ -158,39 +158,54 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
158158 SmallVector<int32_t > matrix = flatten (A.getBases ().begin ()->second );
159159 assert (matrix.size () == nCol);
160160
161- // We iterate the matrix following the diagonals
162- // The idea here is that we want to generate code of the form:
163- // \xor_i (x & mask_i) << s_i
164- // where s_i may be positive or negative (left or right shift)
165- // The hope here (and we see it in codegen) is that LLVM can turn
166- // the xor into a sum and then the sum + LHS/RHS can be fused into a mad.lo
167- // Get the i-th diagonal
168- auto getMask = [&](int i) {
161+ // Row-wise popcount to detect rows that appear exactly once across columns.
162+ uint32_t rowsUnique = 0 ;
163+ {
164+ SmallVector<int > rowPopCnt (nRow, 0 );
165+ for (int c = 0 ; c < nCol; ++c) {
166+ uint32_t colBits = matrix[c];
167+ for (int r = 0 ; r < nRow; ++r) {
168+ if (colBits & (1u << r))
169+ ++rowPopCnt[r];
170+ }
171+ }
172+ for (int r = 0 ; r < nRow; ++r) {
173+ if (rowPopCnt[r] == 1 )
174+ rowsUnique |= 1u << r;
175+ }
176+ }
177+
178+ // We iterate the matrix following the diagonals and build
179+ // (x & mask_i) << s_i terms. Prefer OR for diagonals whose rows are unique,
180+ // then XOR everything else. This tends to encourage mad.lo codegen.
181+ auto getMaskAndAllRowsUnique = [&](int i) -> std::pair<uint32_t , bool > {
169182 uint32_t mask = 0 ;
170183 int row = i < 0 ? -i : 0 ;
171184 int col = i < 0 ? 0 : i;
185+ bool allRowsUnique = true ;
172186 while (row < nRow && col < nCol) {
173187 uint32_t bitValue = (matrix[col] >> row) & 1u ;
174188 mask |= bitValue << col;
189+ allRowsUnique &= ((rowsUnique >> row) & 1u ) == 1u ;
175190 ++row;
176191 ++col;
177192 }
178- return mask;
193+ return { mask, allRowsUnique} ;
179194 };
180195
181196 uint32_t explicitCols = 0 ;
182197
183198 {
184199 SmallVector<uint32_t > masks;
185200 for (int i = -nRow + 1 ; i < nCol; i++) {
186- masks.push_back (getMask (i ));
201+ masks.push_back (std::get< 0 >( getMaskAndAllRowsUnique (i) ));
187202 }
188203 bool reachedFixedPoint = false ;
189204 while (!reachedFixedPoint) {
190205 reachedFixedPoint = true ;
191206 for (uint32_t m : masks) {
192207 uint32_t c = m & ~explicitCols;
193- if ((c != 0 ) && ((c & (c - 1 )) == 0 )) {
208+ if (llvm::isPowerOf2_32 (c )) {
194209 // found a single-element diagonal
195210 explicitCols |= c;
196211 reachedFixedPoint = false ;
@@ -200,14 +215,21 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
200215 }
201216
202217 // handle any diagonals that have survived
203- Value ret = b.i32_val (0 );
218+ SmallVector<Value> ors;
219+ SmallVector<Value> xors;
204220 for (int i = -nRow + 1 ; i < nCol; i++) {
205- auto mask = getMask (i) & ~explicitCols;
221+ auto [mask, allRowsUnique] = getMaskAndAllRowsUnique (i);
222+ mask &= ~explicitCols;
206223 if (mask == 0 )
207224 continue ;
208225 auto masked = b.and_ (x, b.i32_val (mask));
209- ret = b.xor_ (ret, i >= 0 ? Value (b.lshr (masked, b.i32_val (i)))
210- : Value (b.shl (masked, b.i32_val (-i))));
226+ auto shifted = i >= 0 ? Value (b.lshr (masked, b.i32_val (i)))
227+ : Value (b.shl (masked, b.i32_val (-i)));
228+ if (allRowsUnique) {
229+ ors.push_back (shifted);
230+ } else {
231+ xors.push_back (shifted);
232+ }
211233 }
212234
213235 // handle any explicit columns:
@@ -219,10 +241,35 @@ Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x) {
219241 int32_t basis = matrix[i];
220242 if (basis == 0 )
221243 continue ;
222- ret = b.xor_ (ret, b.select (bit_is_zero, zero, b.i32_val (basis)));
244+ auto select = b.select (bit_is_zero, zero, b.i32_val (basis));
245+ if ((rowsUnique & basis) == basis) {
246+ ors.push_back (select);
247+ } else {
248+ xors.push_back (select);
249+ }
223250 }
224251 }
225- return ret;
252+
253+ auto treeReduce = [&](SmallVector<Value> &terms,
254+ std::function<Value (Value, Value)> op) -> Value {
255+ if (terms.empty ())
256+ return b.i32_val (0 );
257+ while (terms.size () > 1 ) {
258+ SmallVector<Value> next;
259+ for (size_t i = 0 ; i + 1 < terms.size (); i += 2 )
260+ next.push_back (op (terms[i], terms[i + 1 ]));
261+ if (terms.size () % 2 == 1 )
262+ next.push_back (terms.back ());
263+ terms = std::move (next);
264+ }
265+ return terms[0 ];
266+ };
267+
268+ auto orPart = treeReduce (
269+ ors, [&b](Value x, Value y) { return b.or_ (x, y, /* disjoint=*/ true ); });
270+ auto xorPart =
271+ treeReduce (xors, [&b](Value x, Value y) { return b.xor_ (x, y); });
272+ return b.or_ (orPart, xorPart, /* disjoint=*/ true );
226273}
227274
228275} // namespace triton::gpu
@@ -542,18 +589,20 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
542589 return unpackLLVector (loc, valsVec, rewriter);
543590 }
544591 };
592+ auto [laneId, warpId] = getLaneAndWarpId (rewriter, loc);
545593 return lowerLdSt (loc, ctx, cvt, valsArray, llvmElemTy, smemBase,
546- calcPaddedOffset, affineOffset, maskSpanAffineOffset,
547- rewriter, targetInfo, {}, emitLdSt);
594+ calcPaddedOffset, affineOffset, maskSpanAffineOffset, laneId,
595+ warpId, rewriter, targetInfo, {}, emitLdSt);
548596}
549597
550598SmallVector<Value> lowerLdSt (
551599 Location loc, MLIRContext *ctx, LinearLayout cvt,
552600 ArrayRef<Value> valsArray, // Input for store, output for load
553601 Type llvmElemTy, Value smemBase,
554602 std::function<Value(Value)> calcPaddedOffset, Value affineOffset,
555- uint64_t maskSpanAffineOffset, RewriterBase &rewriter,
556- const TargetInfoBase &targetInfo, std::optional<int> maybeMaxVecElems,
603+ uint64_t maskSpanAffineOffset, Value laneId, Value warpId,
604+ RewriterBase &rewriter, const TargetInfoBase &targetInfo,
605+ std::optional<int> maybeMaxVecElems,
557606 std::function<SmallVector<Value>(RewriterBase &, Location, ArrayRef<Value>,
558607 Value, int , VectorType)>
559608 lowerInst) {
@@ -599,7 +648,6 @@ SmallVector<Value> lowerLdSt(
599648 zerosLike (LinearLayout::identity1D (bitwidth / 8 , kReg , kOffset ));
600649 auto i8AddrLayout = i8Tile * addrLayout;
601650
602- auto [laneId, warpId] = getLaneAndWarpId (rewriter, loc);
603651 auto regBaseI8 =
604652 applyLinearLayout (
605653 loc, rewriter, i8AddrLayout,
@@ -2022,16 +2070,17 @@ void finalizeTensorAtomicResults(Operation *op, RankedTensorType tensorTy,
20222070 };
20232071
20242072 auto noPaddingOffset = [](Value v) { return v; };
2073+ auto [laneId, warpId] = getLaneAndWarpId (rewriter, loc);
20252074 lowerLdSt (loc, ctx, dstLayout, resultVals, valueElemTy, smemBase,
20262075 /* calcPaddedOffset=*/ noPaddingOffset, /* affineOffset=*/ b.i32_val (0 ),
2027- /* maskSpanAffineOffset=*/ 0 , rewriter, targetInfo,
2076+ /* maskSpanAffineOffset=*/ 0 , laneId, warpId, rewriter, targetInfo,
20282077 /* maybeMaxVecElems=*/ {}, emitSt);
20292078 b.barrier ();
20302079 resultVals = lowerLdSt (loc, ctx, dstLayout, resultVals, valueElemTy, smemBase,
20312080 /* calcPaddedOffset=*/ noPaddingOffset,
20322081 /* affineOffset=*/ b.i32_val (0 ),
2033- /* maskSpanAffineOffset=*/ 0 , rewriter, targetInfo ,
2034- /* maybeMaxVecElems=*/ {}, emitLd);
2082+ /* maskSpanAffineOffset=*/ 0 , laneId, warpId, rewriter ,
2083+ targetInfo, /* maybeMaxVecElems=*/ {}, emitLd);
20352084
20362085 // Create the result struct and replace the operation
20372086 Value resultStruct =
0 commit comments