
Commit cd4527d

Merge commit '7d3bf12eecc81346528f0072aedf69a8a5af41e5'
2 parents: 71a87ff + 7d3bf12

37 files changed: +567 -497 lines changed

lib/Dialect/Gluon/Transforms/ResolveAutoEncodings.cpp

Lines changed: 6 additions & 3 deletions

@@ -121,9 +121,12 @@ LogicalResult inferAutoLayouts(FuncOp func) {
     } else {
       auto srcEncoding = inferSrcEncoding(definingOp, enc);
       if (srcEncoding) {
-        if (failed(updateEncoding(
-                llvm::to_vector_of<Value>(definingOp->getOperands()),
-                srcEncoding)))
+        llvm::SmallVector<Value> tensorOperands;
+        for (auto operand : definingOp->getOperands())
+          if (isa<RankedTensorType>(operand.getType()))
+            tensorOperands.push_back(operand);
+
+        if (failed(updateEncoding(tensorOperands, srcEncoding)))
           return failure();
       }
     }
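
The operand filtering above matters because source-encoding inference can reach ops whose operands are not all ranked tensors. For instance, the Gluon full builtin lowers to a splat of a scalar (see the _semantic.py context further down), so propagating an inferred encoding back to that op's operands must skip the scalar. A minimal illustrative sketch, not part of this commit, with import paths assumed from the file layout in this diff:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl

@gluon.jit
def splat_reduce_kernel(n):
    # ttgl.full of a runtime scalar becomes a splat whose only operand is an
    # i32 scalar, not a RankedTensorType; the updated pass skips it when
    # pushing the inferred encoding onto the defining op's operands.
    x = ttgl.full([16, 8], n, ttgl.int32, layout=ttgl.AutoLayout())
    ttgl.sum(x, axis=1)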

python/test/gluon/test_frontend.py

Lines changed: 25 additions & 0 deletions

@@ -1233,3 +1233,28 @@ def test_auto_layout():
     z = x + y
     # CHECK: (tensor<16x8xi32, #gluon.auto_encoding>) -> tensor<16xi32, #gluon.auto_encoding
     ttgl.sum(z, axis=1)
+
+    # CHECK: tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #gluon.auto_encoding>
+    ttgl.arange(0, 32)
+
+
+@filecheck_test
+@gluon.jit
+def test_auto_layout_broadcast():
+    # CHECK: [[BLOCKED:#.*]] = #ttg.blocked
+    # CHECK: [[X:%.*]] = arith.constant dense<1> : tensor<16x1xi32, #gluon.auto_encoding>
+    # CHECK: [[Y:%.*]] = arith.constant dense<2> : tensor<1x16xi32, [[BLOCKED]]>
+    x = ttgl.full([16, 1], 1, ttgl.int32, layout=ttgl.AutoLayout())
+    y = ttgl.full([1, 16], 2, ttgl.int32, layout=ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0]))
+
+    # CHECK: [[XCVT:%.*]] = ttg.convert_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
+    # CHECK: [[XBCAST:%.*]] = tt.broadcast [[XCVT]]
+    # CHECK: [[YBCAST:%.*]] = tt.broadcast [[Y]]
+    # CHECK: arith.addi [[XBCAST]], [[YBCAST]] : tensor<16x16xi32, [[BLOCKED]]>
+    _ = x + y
+
+    # CHECK: [[XCVT2:%.*]] = ttg.convert_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
+    # CHECK: [[YBCAST2:%.*]] = tt.broadcast [[Y]]
+    # CHECK: [[XBCAST2:%.*]] = tt.broadcast [[XCVT2]]
+    # CHECK: arith.muli [[YBCAST2]], [[XBCAST2]] : tensor<16x16xi32, [[BLOCKED]]>
+    _ = y * x
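
For contrast, a hedged sketch of the case that still fails inside a @gluon.jit kernel like the ones above: only AutoLayout is converted implicitly, so mixing two different concrete layouts keeps raising the layout-mismatch error shown in _semantic.py below (the second BlockedLayout's parameters are illustrative, not from this commit):

a = ttgl.full([16, 1], 1, ttgl.int32, layout=ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0]))
b = ttgl.full([1, 16], 2, ttgl.int32, layout=ttgl.BlockedLayout([1, 1], [32, 1], [1, 4], [0, 1]))
_ = a + b  # raises "Layout mismatch in broadcast: ...", surfaced as a compilation error at trace time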

python/test/unit/language/test_compile_errors.py

Lines changed: 9 additions & 5 deletions

@@ -7,7 +7,7 @@
 import triton.language as tl
 from triton.compiler.errors import CompilationError, CompileTimeAssertionFailure
 import traceback
-from triton._internal_testing import is_cuda, is_hip, is_hip_cdna3, is_xpu
+from triton._internal_testing import is_cuda, is_hip, is_hip_cdna4, is_xpu


 def format_exception(type, value, tb):
@@ -364,9 +364,9 @@ def test_fp8_support(fresh_triton_cache, dtype):
         if cc >= (8, 9):
             supported_dtypes.append(tl.float8e4nv)
     elif is_hip():
-        supported_dtypes.append(tl.float8e4nv)
-        if is_hip_cdna3():
-            supported_dtypes += [tl.float8e4b8, tl.float8e5b16]
+        supported_dtypes += [tl.float8e4nv, tl.float8e4b8, tl.float8e5b16]
+        if is_hip_cdna4():
+            warning_dtypes += [tl.float8e4b8, tl.float8e5b16]
     elif is_xpu():
         supported_dtypes += [tl.float8e4b15, tl.float8e4nv]

@@ -376,7 +376,11 @@ def dtype_kernel(dtype: tl.constexpr):
        tl.dot(a, a)

    if dtype in warning_dtypes:
-        ctx = pytest.warns(UserWarning, match=r"the use of fp8e4b15 is deprecated on Hopper and later architectures")
+        if is_cuda():
+            ctx = pytest.warns(UserWarning,
+                               match=r"the use of fp8e4b15 is deprecated on Hopper and later architectures")
+        elif is_hip_cdna4():
+            ctx = pytest.warns(UserWarning, match=r"AMD gfx942 specific and not supported on gfx950")
    elif dtype in supported_dtypes:
        ctx = contextlib.nullcontext()
    else:

python/test/unit/language/test_conversions.py

Lines changed: 7 additions & 5 deletions

@@ -7,7 +7,7 @@
 import triton
 import triton.language as tl

-from triton._internal_testing import is_cuda, is_hip, is_hip_cdna3, is_hip_cdna4, is_xpu
+from triton._internal_testing import is_cuda, is_hip, is_hip_cdna2, is_hip_cdna3, is_hip_cdna4, is_xpu


 def matching_int(dtype):
@@ -297,6 +297,7 @@ def upcast_test(src_dtype, dst_dtype, exponent_bits, mantissa_bits, exponent_bia
     ('float8e4nv', 'float32'),

     ('float8e4b8', 'float32'),
+    ('float8e4b8', 'bfloat16'),
     ('float8e4b8', 'float16'),

     ('float8e5b16', 'float32'),
@@ -316,12 +317,13 @@ def test_typeconvert_upcast(src_dtype, dst_dtype, device):
     elif is_hip():
         if (src_dtype == 'float8e4nv' and not (is_hip_cdna3() or is_hip_cdna4())):
             pytest.skip(f"upcasting {src_dtype} to {dst_dtype} not supported in this architecture")
-        if (src_dtype in ('float8e4b15') or
-                (src_dtype in ('float8e4b8', 'float8e5b16') and not is_hip_cdna3())):
+        if src_dtype == 'float8e4b15':
             # If the dtype should error out in the given device, we assert that and return
             with pytest.raises(triton.CompilationError, match="not supported in this architecture"):
                 launch_exhaustive_populate(getattr(tl, src_dtype), 0, 65536, False, 8, 0x7f, device=device)
             return
+        if src_dtype in ('float8e4b8', 'float8e5b16') and is_hip_cdna2():
+            pytest.skip(f"{src_dtype} is not supported on AMDGPU CDNA2")
     elif is_xpu():
         if (src_dtype in ('float8e4b8', 'float8e5b16')):
             # If the dtype should error out in the given device, we assert that and return
@@ -379,8 +381,8 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
         pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")

     if is_hip():
-        if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne' and not is_hip_cdna3():
-            pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
+        if dst_dtype in ('float8e4b8', 'float8e5b16') and is_hip_cdna2():
+            pytest.skip(f"{dst_dtype} is not supported on AMDGPU CDNA2")

     if is_xpu():
         if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne':

python/test/unit/language/test_core.py

Lines changed: 2 additions & 2 deletions

@@ -6139,9 +6139,9 @@ def kernel(Out):


 def test_globaltimer(device):
-    if is_hip_cdna2():
-        pytest.skip("test_globaltimer is flaky on gfx90a")
     check_cuda_or_hip(device)
+    if is_hip():
+        pytest.skip("test_globaltimer is flaky on AMD GPUs")

     @triton.jit
     def kernel(Out1, Out2, func: tl.constexpr):

python/triton/_filecheck.py

Lines changed: 7 additions & 4 deletions

@@ -42,8 +42,9 @@ def run_filecheck(name, module_str, check_template):
         temp.write(check_template)

     try:
-        subprocess.check_output([filecheck_path, temp_expected, "--input-file", temp_module],
-                                stderr=subprocess.STDOUT)
+        subprocess.check_output(
+            [filecheck_path, temp_expected, "--input-file", temp_module, "--dump-input-context=50"],
+            stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError as error:
         decoded = error.output.decode('unicode_escape')
         raise ValueError(decoded)
@@ -60,8 +61,10 @@ def run_parser(kernel_fn):
     ir.load_dialects(context)
     stub_backend.load_dialects(context)

-    extra_options = src.parse_options()
-    options = stub_backend.parse_options(dict(**extra_options))
+    options = dict(sanitize_overflow=False)
+    options.update(src.parse_options())
+
+    options = stub_backend.parse_options(options)
    codegen_fns = stub_backend.get_codegen_implementation(options)
    module_map = stub_backend.get_module_map()
    module = src.make_ir(options, codegen_fns, module_map, context)

python/triton/experimental/gluon/language/_core.py

Lines changed: 3 additions & 2 deletions

@@ -43,6 +43,7 @@
 )

 _IMPORT_FROM_TRITON: List[str] = [
+    "broadcast",
     "expand_dims",
     "inline_asm_elementwise",
     "join",
@@ -341,14 +342,14 @@ def _keep_alive(self, _semantic: GluonSemantic = None) -> None:


 @builtin
-def arange(start, end, layout, _semantic=None):
+def arange(start, end, layout=None, _semantic=None):
     """
     Generate a sequence tensor with values in [start, end) using a specified layout.

     Args:
         start (int): Inclusive start of the sequence.
         end (int): Exclusive end of the sequence.
-        layout (DistributedLayout): The layout of the output tensor.
+        layout (DistributedLayout): The layout of the output tensor. Defaults to AutoLayout.

     Returns:
         tensor: A 1D tensor containing sequential values.
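
A short usage sketch of the relaxed signature; the explicit 1D BlockedLayout parameters are illustrative assumptions, not taken from this commit:

xs = ttgl.arange(0, 32)  # layout omitted: defaults to AutoLayout, i.e. #gluon.auto_encoding in the IR
ys = ttgl.arange(0, 32, layout=ttgl.BlockedLayout([1], [32], [4], [0]))  # explicit layout still works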

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 11 additions & 2 deletions

@@ -112,7 +112,14 @@ def broadcast_impl_value(self, lhs: TensorTy, rhs: TensorTy) -> TensorTy:
         lhs_shape = lhs_ty.get_block_shapes()
         rhs_shape = rhs_ty.get_block_shapes()
         ret_shape = self._broadcast_shapes(lhs_shape, rhs_shape)
-        if lhs_ty.layout != rhs_ty.layout:
+
+        is_lhs_auto = isinstance(lhs_ty.layout, AutoLayout)
+        is_rhs_auto = isinstance(rhs_ty.layout, AutoLayout)
+        if is_lhs_auto and not is_rhs_auto:
+            lhs = self.convert_layout(lhs, rhs_ty.layout)
+        elif is_rhs_auto and not is_lhs_auto:
+            rhs = self.convert_layout(rhs, lhs_ty.layout)
+        elif lhs_ty.layout != rhs_ty.layout:
             raise ValueError(f"Layout mismatch in broadcast: {lhs_ty.layout} vs {rhs_ty.layout}")

         lhs = self.broadcast_impl_shape(lhs, ret_shape)
@@ -121,6 +128,8 @@ def broadcast_impl_value(self, lhs: TensorTy, rhs: TensorTy) -> TensorTy:

     def arange(self, start, end, layout):
         shape = [end - start]
+        if layout is None:
+            layout = AutoLayout()
         ret_ty = ttgl.distributed_type(ttgl.int32, shape, layout)
         return super().arange(start, end, ret_ty=ret_ty)

@@ -138,7 +147,7 @@ def full(self, shape, value, dtype, layout):
         scalar = self.make_scalar(value, dtype)
         return self.splat(scalar, shape, layout)

-    def convert_layout(self, value, layout, assert_trivial):
+    def convert_layout(self, value, layout, assert_trivial=False):
         ty = value.type
         _check(isinstance(ty, ttgl.distributed_type),
                lambda: f"expected convert_layout input to be a distributed_type but got: {ty!r}")
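
In user terms, the new broadcast path is roughly equivalent to converting the AutoLayout side explicitly before the elementwise op. A hedged sketch, assuming convert_layout is exposed on ttgl as a thin wrapper over the semantic method shown above:

blocked = ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0])
x = ttgl.full([16, 1], 1, ttgl.int32, layout=ttgl.AutoLayout())
y = ttgl.full([1, 16], 2, ttgl.int32, layout=blocked)
x_cvt = ttgl.convert_layout(x, blocked)  # what broadcast_impl_value now inserts for the auto side
z = x_cvt + y                            # both operands share the blocked layout; result is 16x16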

python/triton/language/semantic.py

Lines changed: 12 additions & 0 deletions

@@ -1489,6 +1489,18 @@ def dot(self, lhs: TensorTy, rhs: TensorTy, acc: TensorTy, input_precision: Opti
             lhs = self.cast(lhs, tl.float16)
             rhs = self.cast(rhs, tl.float16)

+        uses_fp8e4b8 = lhs.dtype.is_fp8e4b8() or rhs.dtype.is_fp8e4b8()
+        uses_fp8e5b16 = lhs.dtype.is_fp8e5b16() or rhs.dtype.is_fp8e5b16()
+        if uses_fp8e4b8 or uses_fp8e5b16:
+            type_name = "fp8e4b8" if uses_fp8e4b8 else "fp8e5b16"
+            if type_name in self.builder.options.deprecated_fp8_dot_operand_dtypes:
+                arch = self.builder.options.arch
+                warnings.warn(
+                    f"{type_name} is AMD gfx942 specific and not supported on {arch} so it's upcasted to fp16 and can cause significant slow down. "
+                    f"Please use OCP fp8 variants on {arch} for performance")
+                lhs = self.cast(lhs, tl.float16)
+                rhs = self.cast(rhs, tl.float16)
+
         if input_precision is None:
             input_precision = self.builder.options.default_dot_input_precision
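
A hedged sketch of how the new warning surfaces to users, reusing the match string from test_compile_errors.py above; the kernel body and the warmup-based compilation call are illustrative assumptions, not part of this commit:

import pytest
import triton
import triton.language as tl

@triton.jit
def fp8_dot_kernel(dtype: tl.constexpr):
    a = tl.full((64, 64), 0.0, dtype)  # hypothetical operand setup
    tl.dot(a, a)

def check_deprecated_fp8_warning():
    # On an AMD target where fp8e4b8 dot operands are listed as deprecated,
    # compiling the kernel should emit the UserWarning added in dot() above.
    with pytest.warns(UserWarning, match=r"AMD gfx942 specific and not supported"):
        fp8_dot_kernel.warmup(tl.float8e4b8, grid=(1,))  # warmup compiles without launching (assumed API)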

test/Conversion/amd/async-ops-alias-scopes.mlir

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
    // COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
    // Check that store for 'other' has alias information set
    // COMMON: llvm.store {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
-    %65 = amdgpu.buffer_load_to_local %arg1[%arg2] mask=%mask other=%other into %arg3 {OpIdx = #amdgpu.OpIdx<1>} : <f32>[tensor<8x64xi32, #blocked>] tensor<8x64xf32, #blocked> -> <8x64xf32, #shared, #smem, mutable>
+    %65 = amdgpu.buffer_load_to_local %arg1[%arg2] mask=%mask other=%other into %arg3 : <f32>[tensor<8x64xi32, #blocked>] tensor<8x64xf32, #blocked> -> <8x64xf32, #shared, #smem, mutable>

    // COMMON: llvm.return
    tt.return
