intel
diff --git a/‎.github/workflows/integration-tests.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/integration-tests.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 2 additions & 1 deletion b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 1 addition & 0 deletions b/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 7 additions & 3 deletions b/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp‎
Lines changed: 1 addition & 7 deletions b/‎lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp‎
Lines changed: 1 addition & 7 deletions
diff --git a/‎python/test/regression/test_cast_matmul.py‎
Lines changed: 8 additions & 5 deletions b/‎python/test/regression/test_cast_matmul.py‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎python/triton/runtime/interpreter.py‎
Lines changed: 12 additions & 13 deletions b/‎python/triton/runtime/interpreter.py‎
Lines changed: 12 additions & 13 deletions
diff --git a/‎test/Conversion/amd/invalid_extractslice_to_llvm.mlir‎
Lines changed: 111 additions & 0 deletions b/‎test/Conversion/amd/invalid_extractslice_to_llvm.mlir‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎test/Conversion/tritongpu_to_llvm_hopper.mlir‎
Lines changed: 12 additions & 0 deletions b/‎test/Conversion/tritongpu_to_llvm_hopper.mlir‎
Lines changed: 12 additions & 0 deletions
@@ -404,6 +404,7 @@ jobs:
             echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
+          pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 16 language runtime \
                  --ignore=language/test_line_info.py \
 
@@ -402,6 +402,7 @@ jobs:
             echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
+          pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 16 language runtime \
                  --ignore=language/test_line_info.py \
 
@@ -1142,7 +1142,8 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
     bool isHopper() const;
 
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> shape,
-                                          int bitwidth, int opIdx) const;
+                                          int bitwidth, int kWidth,
+                                          int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
 
     bool supportReduction() const {
 
@@ -28,6 +28,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_DISABLE_LINE_INFO",
     "TRITON_DISABLE_RESHAPE_ENCODING_INFERENCE",
     "TRITON_ENABLE_LLVM_DEBUG",
+    "TRITON_HIP_STREAM_PREFETCH",
     "TRITON_LLVM_DEBUG_ONLY",
     "USE_IR_LOC",
     "NVPTX_ENABLE_DUMP",
 
@@ -953,7 +953,7 @@ DotOperandEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape,
   } else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(parent)) {
     if (mma.isAmpere() || mma.isHopper()) {
       auto bitwidth = getPointeeType(eltTy).getIntOrFloatBitWidth();
-      auto rep = mma.getRepForOperand(shape, bitwidth, idx);
+      auto rep = mma.getRepForOperand(shape, bitwidth, kWidth, idx);
       auto sizePerThread = getSizePerThread();
       auto elemsPerKRep = mma.isHopper() ? (kWidth * 2) : (32 / bitwidth * 2);
       if (rank == 3)
@@ -2018,14 +2018,18 @@ NvidiaMmaEncodingAttr::getRepOrderForOperand(int opIdx) const {
 
 SmallVector<int64_t>
 NvidiaMmaEncodingAttr::getRepForOperand(ArrayRef<int64_t> shape, int bitwidth,
-                                        int opIdx) const {
+                                        int kWidth, int opIdx) const {
   auto rank = shape.size();
   auto warpsPerCTA = getWarpsPerCTA();
 
   // {batch, m, n, k}
   // Hopper path never uses the n value, since this method is only invoked
   // for in-RF (dotOpEnc) operands, but WGMMA only supports in A to be in RF
-  SmallVector<int> shapePerWarp = {1, 16, 8, 4 * 64 / bitwidth};
+  // TODO: rep per operand is not accurate for Hopper. It is currently done that
+  // way to allow us to get the correct total number of elements. This will be
+  // fixed when moving to linear layout.
+  SmallVector<int> shapePerWarp = {
+      1, 16, 8, isHopper() ? 4 * 2 * kWidth : 4 * 64 / bitwidth};
   int numRepBatch =
       rank == 3
           ? std::max<int64_t>(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0]))
 
@@ -38,13 +38,7 @@ void setUseAccFlag(Operation *op, Value useAcc) {
 }
 
 bool isConstantZeroTensor(Value v) {
-  auto constOp = v.getDefiningOp<arith::ConstantOp>();
-  if (!constOp)
-    return false;
-  auto splat = mlir::dyn_cast<SplatElementsAttr>(constOp.getValue());
-  if (!splat)
-    return false;
-  return splat.getSplatValue<FloatAttr>().getValue().convertToFloat() == 0.0f;
+  return (matchPattern(v, m_Zero()) || matchPattern(v, m_AnyZeroFloat()));
 }
 
 std::optional<std::pair<Operation *, int>> findZeroInitOp(Value accUse,
 
@@ -12,7 +12,7 @@
 import triton
 import triton.runtime as tr
 import triton.language as tl
-from triton._internal_testing import is_hip_mi300, is_cuda
+from triton._internal_testing import is_hip_mi300, is_cuda, is_hip
 
 input_dtypes = ["float16", "float32", "float64"]
 if is_cuda():
@@ -78,19 +78,22 @@ def matmul_kernel(A, B, C, M, N, K,  #
     tl.store(C, acc, mask=mask)
 
 
-@pytest.mark.parametrize("M, K, N, BLOCK_K, w_dtype, x_dtype, out_dtype",
-                         [(M, K, N, BLOCK_K, w, x, o)  #
+@pytest.mark.parametrize("M, K, N, BLOCK_K, BLOCK_M, w_dtype, x_dtype, out_dtype",
+                         [(M, K, N, BLOCK_K, BLOCK_M, w, x, o)  #
                           for BLOCK_K in [16, 32]  #
+                          for BLOCK_M in [16, 64]  #
                           for (M, K, N) in [(128, 128, 128), (768, 768, 1024)]  #
                           for w in input_dtypes
                           for x in input_dtypes  #
                           for o in out_dtypes])
-def test_cast_matmul(M, K, N, BLOCK_K, w_dtype, x_dtype, out_dtype, device):
+def test_cast_matmul(M, K, N, BLOCK_K, BLOCK_M, w_dtype, x_dtype, out_dtype, device):
     if x_dtype == w_dtype:
         pytest.xfail("skip the same input dtype")
     if device == "xpu" and "float64" in (w_dtype,
                                          x_dtype) and not tr.driver.active.get_current_target().arch['has_fp64']:
         pytest.xfail("float64 not supported on current xpu hardware")
+    if is_hip() and BLOCK_M == 64 and w_dtype in ["float8_e5m2", "float8_e4m3fnuz"]:
+        pytest.skip("skip due to bug on HIP path")
     x_dtype: torch.dtype = getattr(torch, x_dtype)
     w_dtype: torch.dtype = getattr(torch, w_dtype)
 
@@ -112,7 +115,7 @@ def init_tensor(dtype, shape):
     out_triton = torch.empty((M, N), device=device, dtype=torch_dtype)
 
     # launch kernel
-    block_m, block_n, block_k = 16, 16, BLOCK_K
+    block_m, block_n, block_k = BLOCK_M, 16, BLOCK_K
     grid = ((triton.cdiv(M, block_m) * triton.cdiv(N, block_n)), 1)
 
     matmul_kernel[grid](
 
@@ -21,7 +21,7 @@ def __init__(self, data, dtype):
         '''
             data: numpy array
             dtype: triton type, either pointer_type or scalar_type.
-            we don't store block_type here because the shape information is already availale in the data field
+            we don't store block_type here because the shape information is already available in the data field
             attr: a dictionary of attributes
         '''
         self.data = data
@@ -46,24 +46,23 @@ def set_attr(self, key, value):
 
 class BlockPointerHandle:
 
-    def __init__(self, base, shape, strides, offsets, tensor_shape, order):
+    def __init__(self, base, shape, strides, offsets, block_shape, order):
         self.base = base
         self.shape = shape
         self.strides = strides
         self.offsets = offsets
-        self.tensor_shape = tensor_shape
+        self.block_shape = block_shape
         self.order = order
 
     def materialize_pointers(self, boundary_check):
         dtype_tt = self.base.get_element_ty()
         n_bytes = dtype_tt.primitive_bitwidth // 8
-        tensor_shape = self.tensor_shape
-        ptrs = np.broadcast_to(self.base.data, self.tensor_shape)
-        masks = np.ones(self.tensor_shape, dtype=bool)
-        for dim in range(len(tensor_shape)):
-            bcast_dims = [1] * len(tensor_shape)
-            bcast_dims[dim] = tensor_shape[dim]
-            off = (self.offsets[dim].data + np.arange(tensor_shape[dim])).reshape(bcast_dims)
+        ptrs = np.broadcast_to(self.base.data, self.block_shape)
+        masks = np.ones(self.block_shape, dtype=bool)
+        for dim in range(len(self.block_shape)):
+            bcast_dims = [1] * len(self.block_shape)
+            bcast_dims[dim] = self.block_shape[dim]
+            off = (self.offsets[dim].data + np.arange(self.block_shape[dim])).reshape(bcast_dims)
             ptrs = ptrs + (n_bytes * off * self.strides[dim].data).astype(np.uint64)
             if dim in boundary_check:
                 masks = np.logical_and(masks, off < self.shape[dim].data)
@@ -655,17 +654,17 @@ def create_barrier(self):
         # Triton's barrier applies to each program in a grid, so it's a no-op in the interpreter
         pass
 
-    def create_make_block_ptr(self, base, shape, strides, offsets, tensor_shape, order):
+    def create_make_block_ptr(self, base, shape, strides, offsets, block_shape, order):
         # Create new offsets to avoid modifying the original
         new_offsets = [offset.clone() for offset in offsets]
-        return BlockPointerHandle(base, shape, strides, new_offsets, tensor_shape, order)
+        return BlockPointerHandle(base, shape, strides, new_offsets, block_shape, order)
 
     def create_advance(self, ptr, offsets):
         if len(ptr.offsets) != len(offsets):
             raise ValueError("len(ptr.offsets) != len(offsets)")
         # Create new offsets to avoid modifying the original
         new_offsets = [offset.clone() for offset in ptr.offsets]
-        ret = BlockPointerHandle(ptr.base, ptr.shape, ptr.strides, new_offsets, ptr.tensor_shape, ptr.order)
+        ret = BlockPointerHandle(ptr.base, ptr.shape, ptr.strides, new_offsets, ptr.block_shape, ptr.order)
         for i in range(len(offsets)):
             ret.offsets[i].data += offsets[i].data
         return ret
 
@@ -0,0 +1,111 @@
+// RUN: triton-opt -split-input-file %s --convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics
+
+// Invalid size
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{sizes [256, 2] must be a multiple of shapePerCTATile [256, 16]}}
+  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x2xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid zero source dimension
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_size_input(%arg0: tensor<256x0xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{source tensor dimension size zero at dimension 1}}
+  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x0xi32, #blocked1> to tensor<256x16xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid zero result dimension
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{result tensor dimension size zero at dimension 1}}
+  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x0xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid offset, not multiple of shapePerTile
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{offset [0, 5] must be a multiple of shapePerCTATile [256, 16]}}
+  %1 = amdgpu.extract_slice %arg0 [0,5] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid offset, out of bounds for dimension
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{invalid offset 128 at dimension 1}}
+  %1 = amdgpu.extract_slice %arg0 [0,128] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid result layout
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_result_layout(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{result layout must match source layout}}
+  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked2>
+  tt.return
+}
+
+// -----
+
+// Invalid result element type
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_result_element_type(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{result element type must match source element type}}
+  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi64, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid result rank
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_result_rank(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{result rank must be equal to source rank}}
+  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16x2xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid result shape
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_result_rank(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{result shape cannot be larger than input shape at dimension 1}}
+  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x256xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid rank
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_rank(%arg0: tensor<256x128x2xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{currently only 2D tensors are supported}}
+  %1 = amdgpu.extract_slice %arg0 [0,0,0] : tensor<256x128x2xi32, #blocked1> to tensor<256x16x2xi32, #blocked1>
+  tt.return
+}
+
+// -----
+
+// Invalid non-static offset
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
+tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}, %arg1: i32) {
+  // expected-error @+2 {{expected ']'}}
+  // expected-error @+1 {{expected integer value}}
+  %2 = amdgpu.extract_slice %arg0 [%arg1, 0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
+  tt.return
+}
@@ -293,3 +293,15 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
     tt.return
   }
 }
+
+// -----
+
+#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}>
+module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
+  // CHECK-LABEL: test_fp8_to_fp16_dot_operand
+  // CHECK-COUNT-16: cvt.rn.f16x2.e5m2x2
+  tt.func @test_fp8_to_fp16_dot_operand(%arg: tensor<128x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>) {
+    %r = tt.fp_to_fp %arg : tensor<128x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+    tt.return
+  }
+}
Original file line number	Diff line number	Diff line change
`@@ -38,13 +38,7 @@ void setUseAccFlag(Operation *op, Value useAcc) {`
`38`	`38`	`}`
`39`	`39`
`40`	`40`	`bool isConstantZeroTensor(Value v) {`
`41`		`- auto constOp = v.getDefiningOp<arith::ConstantOp>();`
`42`		`- if (!constOp)`
`43`		`- return false;`
`44`		`- auto splat = mlir::dyn_cast<SplatElementsAttr>(constOp.getValue());`
`45`		`- if (!splat)`
`46`		`- return false;`
`47`		`- return splat.getSplatValue<FloatAttr>().getValue().convertToFloat() == 0.0f;`
	`41`	`+ return (matchPattern(v, m_Zero()) \|\| matchPattern(v, m_AnyZeroFloat()));`
`48`	`42`	`}`
`49`	`43`
`50`	`44`	`std::optional<std::pair<Operation *, int>> findZeroInitOp(Value accUse,`