Commit 6ddacab

Merge OpenAI Triton commit 33b2823 (#5091)
This PR changes the Triton base from a6d11f7 to 33b2823 (Sep 6). Pass rate: 98.11%

---------

Signed-off-by: Anatoly Myachev <[email protected]>
2 parents 719aab3 + 57a1a9b commit 6ddacab

File tree

34 files changed: +1265 −297 lines changed


include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 8 additions & 0 deletions
@@ -419,6 +419,14 @@ def TTG_Fp4ToFpOp : TTG_Op<"fp4_to_fp", [Pure]> {
   let arguments = (ins RankedTensorOf<[I8]>:$src, I32Attr:$axis);
   let results = (outs TT_FloatTensor:$result);
 
+  let extraClassDeclaration = [{
+    static LogicalResult verifyFp4ToFp(
+        mlir::Operation *op,
+        RankedTensorType srcTy,
+        RankedTensorType resTy,
+        unsigned axis);
+  }];
+
   let assemblyFormat = [{
     $src attr-dict `:` type($src) `->` type($result)
   }];

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 24 additions & 16 deletions
@@ -378,36 +378,44 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
 LogicalResult Fp4ToFpOp::verify() {
   auto srcTy = cast<RankedTensorType>(getSrc().getType());
   auto resTy = cast<RankedTensorType>(getResult().getType());
+  auto axis = getAxis();
+
+  auto elemType = resTy.getElementType();
+  if (!(elemType.isBF16() || elemType.isF16()))
+    return emitError() << "only bf16 or f16 is supported for now, got "
+                       << elemType;
+
+  return verifyFp4ToFp(*this, srcTy, resTy, axis);
+}
+
+LogicalResult Fp4ToFpOp::verifyFp4ToFp(mlir::Operation *op,
+                                       RankedTensorType srcTy,
+                                       RankedTensorType resTy, unsigned axis) {
   auto rank = srcTy.getRank();
 
   if (rank != resTy.getRank())
-    return emitError() << "source rank " << rank << " != result rank "
-                       << resTy.getRank();
+    return op->emitError() << "source rank " << rank << " != result rank "
+                           << resTy.getRank();
 
   auto srcShape = srcTy.getShape();
   auto resShape = resTy.getShape();
-  auto axis = getAxis();
 
   if (!(0 <= axis && axis < rank))
-    return emitError() << "axis " << axis << " out of range for rank " << rank;
-
-  auto elemType = resTy.getElementType();
-  if (!(elemType.isBF16() || elemType.isF16()))
-    return emitError() << "only bf16 or f16 is supported for now, got "
-                       << elemType;
+    return op->emitError() << "axis " << axis << " out of range for rank "
+                           << rank;
 
   for (int i = 0; i < rank; ++i) {
     if (i == axis) {
       if (resShape[i] != srcShape[i] * 2)
-        return emitError() << "axis " << axis
-                           << " dimension must be 2x source dimension (src="
-                           << srcShape[i] << ", dst=" << resShape[i] << ")";
+        return op->emitError()
+               << "axis " << axis
+               << " dimension must be 2x source dimension (src=" << srcShape[i]
+               << ", dst=" << resShape[i] << ")";
     } else {
       if (resShape[i] != srcShape[i])
-        return emitError() << "dimension " << i
-                           << " mismatch (src=" << srcShape[i]
-                           << ", dst=" << resShape[i] << ", axis=" << axis
-                           << ")";
+        return op->emitError()
+               << "dimension " << i << " mismatch (src=" << srcShape[i]
+               << ", dst=" << resShape[i] << ", axis=" << axis << ")";
     }
   }
   return success();
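
Note: the refactor above only moves the existing shape checks into a reusable static verifyFp4ToFp helper, leaving the bf16/f16 element-type check in Fp4ToFpOp::verify(); the constraint itself is unchanged. As a rough illustration (a hypothetical Python sketch, not part of the diff), the result shape must double the source extent along `axis` — two fp4 values unpack from each i8 byte — and match the source on every other dimension:

def fp4_to_fp_result_shape(src_shape, axis):
    # Unpacking doubles the extent along `axis`; all other dims must match.
    assert 0 <= axis < len(src_shape)
    return tuple(d * 2 if i == axis else d for i, d in enumerate(src_shape))

assert fp4_to_fp_result_shape((128, 32), axis=1) == (128, 64)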

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 10 additions & 0 deletions
@@ -257,6 +257,16 @@ Operation *mlir::triton::predicateOp(RewriterBase &rewriter, Operation *op,
     arriveBarrier.getPredMutable().assign(mask);
     return op;
   }
+  if (auto commit = dyn_cast<ttng::TCGen5CommitOp>(op)) {
+    rewriter.setInsertionPoint(commit);
+    Value mask = pred;
+    Value currentPred = commit.getPred();
+    if (currentPred) {
+      mask = getPredMask(rewriter, currentPred.getType(), currentPred, pred);
+    }
+    commit.getPredMutable().assign(mask);
+    return op;
+  }
   if (auto storeOp = dyn_cast<tt::StoreOp>(op)) {
     rewriter.setInsertionPoint(storeOp);
     Value mask = getPredMask(rewriter, storeOp.getPtr().getType(),
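
The new TCGen5CommitOp case follows the same predication pattern as the neighbouring branches: start from the incoming loop predicate and, if the commit already carries a predicate, fold the two together with getPredMask (which, by analogy with its other uses here, is assumed to combine both predicates). A hypothetical Python sketch of that rule, for illustration only:

def combine_predicates(current_pred, loop_pred):
    # No existing predicate: take the loop predicate as-is.
    if current_pred is None:
        return loop_pred
    # Otherwise both predicates must hold (what getPredMask is assumed to build).
    return current_pred and loop_pred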

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp

Lines changed: 2 additions & 2 deletions
@@ -35,15 +35,15 @@ struct AutomaticWarpSpecialization
 void AutomaticWarpSpecialization::runOnOperation() {
   OpPassManager pm;
   pm.addPass(createTritonGPUPartitionScheduling());
+  pm.addPass(createNVWSInsertAref());
   pm.addPass(createTritonGPULoadMMASpecialization({numStages}));
   pm.addPass(createTritonGPURewritePartitionDependencies());
   // `int-range-optimizations` and SCCP are good at cleaning up loop arithmetic.
   // FIXME: Re-enable integer range analysis once it is fixed.
   // pm.addPass(arith::createIntRangeOptimizationsPass());
   pm.addPass(createSCCPPass());
   pm.addPass(createCSEPass());
-  pm.addPass(createNVWSAssignStagePhase());
-  pm.addPass(createNVWSLowerAref());
+  pm.addPass(createNVWSLowerAref({numStages}));
   pm.addPass(createTritonGPUPartitionLoops());
   pm.addPass(createNVWSLowerWarpGroup());
   if (failed(runPipeline(pm, getOperation())))

python/test/gluon/test_core.py

Lines changed: 23 additions & 0 deletions
@@ -811,3 +811,26 @@ def kernel(N, out):
     out = torch.empty(1, dtype=torch.int32, device="cuda")
     compiled_kernel = kernel.warmup(N=100, out=out, grid=(1, ))
     assert compiled_kernel.asm["llir"].count("define") == 1
+
+
+@pytest.mark.skipif(not is_hip_cdna3() and not is_hip_cdna4(), reason="Requires CDNA3 or CDNA4")
+def test_inline_with_amdgpu_dialect():
+
+    @gluon.jit
+    def buffer_load(x, offsets):
+        return ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets)
+
+    @gluon.jit
+    def kernel(x, y):
+        layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[64], warps_per_cta=[4],
+                                                    order=[0])
+        offsets = ttgl.arange(0, 64, layout=layout)
+
+        a = buffer_load(x, offsets)
+        ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets)
+
+    input = torch.arange(64, device="cuda").to(torch.int32)
+    output = torch.empty_like(input)
+
+    compiled_kernel = kernel.warmup(input, output, grid=(1, ))
+    assert compiled_kernel.asm["ttgir"].count("tt.func private") == 0

python/test/unit/runtime/test_autotuner.py

Lines changed: 29 additions & 0 deletions
@@ -448,3 +448,32 @@ def grid(meta):
     warp_size = triton.runtime.driver.active.get_current_target().warp_size
     assert exception_out_of_resource is not None and f"out of resource: threads, Required: {128 * warp_size}" in str(
         exception_out_of_resource)
+
+
+def test_prune_all_configs(device):
+    N = 1024
+    src = torch.randn(N, device=device)
+    dst = torch.empty(N, device=device)
+
+    def early_config_prune(configs, named_args, **kwargs):
+        return []
+
+    configs = [triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})]
+
+    prune_configs_by = {'early_config_prune': early_config_prune}
+
+    @triton.autotune(configs=configs, key=['N'], prune_configs_by=prune_configs_by)
+    @triton.jit
+    def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
+        offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        x = tl.load(src + offsets, mask=offsets < N)
+        tl.store(dst + offsets, x, mask=offsets < N)
+
+    grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']), )
+    try:
+        _kernel[grid](dst, src, N=N)
+        pytest.fail("Expected exception was not thrown.")
+    except triton.TritonError as e:
+        assert e is not None and str(
+            e
+        ) == "Autotuner error: No valid autotuner configs after pruning. `early_config_prune` should return at least one config."

python/triton/runtime/autotuner.py

Lines changed: 10 additions & 3 deletions
@@ -10,7 +10,7 @@
 
 from .. import knobs
 from .jit import KernelInterface, JITFunction
-from .errors import OutOfResources, PTXASError
+from .errors import OutOfResources, PTXASError, AutotunerError
 from .driver import driver
 from .cache import get_cache_manager, triton_key
 from triton._C.libtriton import get_cache_invalidating_env_vars
@@ -25,7 +25,9 @@ def __init__(self, fn, arg_names, configs, key, reset_to_zero, restore_value, pr
         :param prune_configs_by: a dict of functions that are used to prune configs, fields:
             'perf_model': performance model used to predicate running time with different configs, returns running time
             'top_k': number of configs to bench
-            'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs.
+            'early_config_prune': a function used to prune configs. It should have the signature
+                `prune_configs_by( configs: List[triton.Config], named_args: Dict[str, Any], **kwargs: Dict[str, Any]) -> List[triton.Config]:`
+                and return pruned configs. It should return at least one config.
         """
         if not configs:
             self.configs = [Config({}, num_warps=4, num_stages=3, num_ctas=1)]
@@ -259,6 +261,9 @@ def prune_configs(self, kwargs: Dict) -> List[Config]:
         pruned_configs = self.configs
         if self.early_config_prune:
             pruned_configs = self.early_config_prune(self.configs, self.nargs, **kwargs)
+            if not pruned_configs:
+                raise AutotunerError(
+                    "No valid autotuner configs after pruning. `early_config_prune` should return at least one config.")
         if self.perf_model:
             top_k = self.configs_top_k
             if isinstance(top_k, float) and top_k <= 1.0:
@@ -406,7 +411,9 @@ def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
     :param prune_configs_by: a dict of functions that are used to prune configs, fields:
         'perf_model': performance model used to predicate running time with different configs, returns running time
         'top_k': number of configs to bench
-        'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs.
+        'early_config_prune': a function used to prune configs. It should have the signature
+            `prune_configs_by( configs: List[triton.Config], named_args: Dict[str, Any], **kwargs: Dict[str, Any]) -> List[triton.Config]:`
+            and return pruned configs. It should return at least one config.
     :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
     :type reset_to_zero: list[str]
     :param restore_value: a list of argument names whose value will be restored after evaluating any configs.
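
The behavioural change above is that an empty result from `early_config_prune` now raises `AutotunerError` instead of continuing with no configs (the new test_prune_all_configs exercises exactly this). A minimal usage sketch of a pruning hook that satisfies the documented signature and always returns at least one config — the kernel and pruning rule here are hypothetical, shown only for illustration:

import triton
import triton.language as tl


def keep_small_blocks(configs, named_args, **kwargs):
    # Keep only small block sizes, but never return an empty list:
    # an empty result would now raise AutotunerError.
    kept = [c for c in configs if c.kwargs['BLOCK_SIZE'] <= 64]
    return kept or configs[:1]


@triton.autotune(
    configs=[triton.Config(kwargs={'BLOCK_SIZE': 32}), triton.Config(kwargs={'BLOCK_SIZE': 128})],
    key=['N'],
    prune_configs_by={'early_config_prune': keep_small_blocks},
)
@triton.jit
def _copy(dst, src, N, BLOCK_SIZE: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < N
    tl.store(dst + offsets, tl.load(src + offsets, mask=mask), mask=mask)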

python/triton/runtime/errors.py

Lines changed: 10 additions & 0 deletions
@@ -34,3 +34,13 @@ def __init__(self, error_message: Optional[str] = None):
     def __str__(self) -> str:
         error_message = self.error_message or ""
         return f"PTXAS error: {error_message}"
+
+
+class AutotunerError(TritonError):
+
+    def __init__(self, error_message: Optional[str] = None):
+        self.error_message = error_message
+
+    def __str__(self) -> str:
+        error_message = self.error_message or ""
+        return f"Autotuner error: {error_message}"

python/triton_kernels/tests/test_matmul.py

Lines changed: 57 additions & 2 deletions
@@ -1,6 +1,7 @@
 # isort: off
 # fmt: off
 from dataclasses import dataclass, fields, replace
+import itertools
 import pytest
 import torch
 from typing import Union
@@ -20,7 +21,7 @@
 # testing utilities
 from triton_kernels.testing import assert_close, compute_actual_scale
 # target-specific utilities
-from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_hip_cdna4
+from triton_kernels.target_info import is_hip, is_xpu, is_hip_cdna3, is_cuda, is_hip_cdna4
 
 # ---------------
 # initialize data
@@ -471,14 +472,68 @@ def round_x(x, idx):
         tri_y_scale).abs() < 1e-10, f"ref_y_scale: {ref_y_scale}, tri_y_scale: {tri_y_scale.item()}"
 
 
+# Test that we don't use unsupported block sizes.
+@pytest.mark.parametrize("m", [8, 16, 32, 64, 128])
+@pytest.mark.parametrize("n", [8, 16, 32, 64, 128])
+@pytest.mark.parametrize("k", [8, 16, 32, 64, 128])
+def test_small_batch_matmul(m, n, k):
+    if is_hip():
+        pytest.skip("Not fully tested on AMD")
+    if is_xpu():
+        pytest.xfail("Enable: https://github.com/intel/intel-xpu-backend-for-triton/issues/5092")
+
+    if m * n * k > 16384:
+        pytest.skip()
+
+    BATCH_SIZE = 10000
+
+    def _make_tensor(shape, dtype, trans):
+        if trans:
+            shape = (shape[0], shape[2], shape[1])
+        t = alloc_rand(shape, "cuda", dtype)
+        return t.transpose(1, 2) if trans else t
+
+    for x_transpose, w_transpose, bias, dtype in itertools.product(
+        (False, True),
+        (False, True),
+        (False, True),
+        (torch.float16, torch.bfloat16, torch.float8_e5m2),
+    ):
+        if (
+            torch.cuda.get_device_capability()[0] < 10
+            and dtype is torch.float8_e5m2
+            and (not w_transpose)
+        ):
+            continue  # Not supported
+
+        x = _make_tensor((BATCH_SIZE, m, k), dtype, x_transpose)
+        w = _make_tensor((BATCH_SIZE, k, n), dtype, w_transpose)
+        bias = _make_tensor((BATCH_SIZE, n), torch.float32, False) if bias else None
+        tri_y = matmul_ogs(x, w, bias)
+
+        # ref_y = matmul_ogs_torch(x.float(), w.float(), bias)
+
+        # This is faster than matmul_ogs_torch.
+        ref_y = torch.bmm(x.float(), w.float())
+        if bias is not None:
+            ref_y += bias[:, None, :]
+
+        assert_close(
+            ref_y,
+            tri_y,
+            maxtol=4e-1 if dtype is torch.float8_e5m2 else None,
+            rmstol=4e-2 if dtype is torch.float8_e5m2 else None,
+        )
+
+
 def test_set_idle_sms():
     if not is_cuda():
         pytest.skip("Only supported on CUDA")
     from triton_kernels.matmul_ogs_details.opt_flags import make_opt_flags
     num_idle_sms = 24
     matmul_ogs_set_idle_sms(num_idle_sms)
     flags = make_opt_flags(torch.float32, torch.float32, torch.float32, PrecisionConfig(), \
-                           1024, 1024, 1024, None, True, False, 1)
+                           1, 1024, 1024, 1024, None, True, False, 1)
     assert flags.idle_sms == num_idle_sms
 
 
python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 1 addition & 1 deletion
@@ -368,7 +368,7 @@ def matmul_ogs(x, w, bias,
     can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
     can_use_fused_scatter = has_scatter and (fused_activation.specs.fn is None) and (epilogue.specs.fn is None) and (routing_data.n_expts_act == 1)
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
-                               M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
+                               batch_size, M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
     )
     if not can_use_fused_scatter and opt_flags.fused_scatter:
         raise InapplicableConstraint("Fused scatter is not supported")
