
Commit 45bf460

Merge commit '2b41842577ce7203f51d3e975c18983b5dafb5d2'
2 parents: 66391a3 + 2b41842

File tree: 13 files changed (+455, -94 lines)


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 2 additions & 0 deletions
@@ -46,6 +46,8 @@ using namespace mlir::triton;
 #define fadd(...) rewriter.create<LLVM::FAddOp>(loc, __VA_ARGS__)
 #define mul(...) rewriter.create<LLVM::MulOp>(loc, __VA_ARGS__)
 #define fmul(...) rewriter.create<LLVM::FMulOp>(loc, __VA_ARGS__)
+#define fma(...) rewriter.create<LLVM::FMAOp>(loc, __VA_ARGS__)
+#define neg(...) rewriter.create<LLVM::FNegOp>(loc, __VA_ARGS__)
 #define smax(...) rewriter.create<LLVM::SMaxOp>(loc, __VA_ARGS__)
 #define umax(...) rewriter.create<LLVM::UMaxOp>(loc, __VA_ARGS__)
 #define fmax(...) rewriter.create<LLVM::MaxNumOp>(loc, __VA_ARGS__)

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 77 additions & 75 deletions
@@ -1240,58 +1240,60 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
 
 
 def TT_ExperimentalDescriptorLoadOp : TT_Op<"experimental_descriptor_load", [MemoryEffects<[MemRead<GlobalMemory>]>]> {
-  let summary = "Load from descriptor";
-  let description = [{
-    This operation will be lowered to Nvidia TMA load operation on targets supporting it.
-    `desc` is a tensor descriptor object.
-    The destination tensor type and shape must match the descriptor otherwise the result is undefined.
+  let summary = "Load from descriptor";
+  let description = [{
+    This operation will be lowered to Nvidia TMA load operation on targets supporting it.
+    `desc` is a tensor descriptor object.
+    The destination tensor type and shape must match the descriptor otherwise the result is undefined.
 
-    This is an escape hatch and is only there for testing/experimenting.
-    This op will be removed in the future.
-  }];
-  let arguments = (
-    ins
-    TT_TensorDescType:$desc,
-    Variadic<I32>:$indices,
-    DefaultValuedAttr<TT_CacheModifierAttr, "::mlir::triton::CacheModifier::NONE">:$cache,
-    DefaultValuedAttr<TT_EvictionPolicyAttr, "::mlir::triton::EvictionPolicy::NORMAL">:$evict
-  );
+    This is an escape hatch and is only there for testing/experimenting.
+    This op will be removed in the future.
+  }];
+  let arguments = (ins
+    TT_TensorDescType:$desc,
+    Variadic<I32>:$indices,
+    DefaultValuedAttr<TT_CacheModifierAttr, "::mlir::triton::CacheModifier::NONE">:$cache,
+    DefaultValuedAttr<TT_EvictionPolicyAttr, "::mlir::triton::EvictionPolicy::NORMAL">:$evict
+  );
 
-  let results = (outs TT_Tensor:$result);
+  let results = (outs TT_Tensor:$result);
 
-  let assemblyFormat = [{
-    $desc `[` $indices `]`
-    oilist(
-      `cacheModifier` `=` $cache |
-      `evictionPolicy` `=` $evict
-    )
-    attr-dict `:` qualified(type($desc)) `->` type($result)
-  }];
+  let assemblyFormat = [{
+    $desc `[` $indices `]`
+    oilist(
+      `cacheModifier` `=` $cache |
+      `evictionPolicy` `=` $evict
+    )
+    attr-dict `:` qualified(type($desc)) `->` type($result)
+  }];
+
+  let hasVerifier = 1;
 }
 
 def TT_ExperimentalDescriptorStoreOp : TT_Op<"experimental_descriptor_store", [
   MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,
 ]> {
-  let summary = "store value based on descriptor";
-  let description = [{
-    This operation will be lowered to Nvidia TMA store operation on targets supporting it.
-    `desc` is a tensor descriptor object.
-    The shape and types of `src` must match the descriptor otherwise the result is undefined.
+  let summary = "store value based on descriptor";
+  let description = [{
+    This operation will be lowered to Nvidia TMA store operation on targets supporting it.
+    `desc` is a tensor descriptor object.
+    The shape and types of `src` must match the descriptor otherwise the result is undefined.
 
-    This is an escape hatch and is only there for testing/experimenting.
-    This op will be removed in the future.
-  }];
-  let arguments = (
-    ins
-    TT_TensorDescType:$desc,
-    TT_Tensor:$src,
-    Variadic<I32>:$indices
-  );
+    This is an escape hatch and is only there for testing/experimenting.
+    This op will be removed in the future.
+  }];
+  let arguments = (ins
+    TT_TensorDescType:$desc,
+    TT_Tensor:$src,
+    Variadic<I32>:$indices
+  );
 
-  let assemblyFormat = [{
-    $desc `[` $indices `]` `,` $src
-    attr-dict `:` qualified(type($desc)) `,` type($src)
-  }];
+  let assemblyFormat = [{
+    $desc `[` $indices `]` `,` $src
+    attr-dict `:` qualified(type($desc)) `,` type($src)
+  }];
+
+  let hasVerifier = 1;
 }
 
 def TT_ExperimentalTensormapCreateOp: TT_Op<
@@ -1301,46 +1303,46 @@ def TT_ExperimentalTensormapCreateOp: TT_Op<
   AttrSizedOperandSegments,
 ]
 > {
-  let summary = "Create a new TMA descriptor on device";
-  let arguments = (
-    ins
-    TT_PtrType:$desc_ptr,
-    TT_PtrType:$global_address,
-    Variadic<I32>:$box_dim,
-    Variadic<I32>:$global_dim,
-    Variadic<I64>:$global_stride,
-    Variadic<I32>:$element_stride,
-    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<12>]>:$elem_type,
-    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<2>]>:$interleave_layout,
-    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$swizzle_mode,
-    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<1>]>:$fill_mode
-  );
-  let extraClassDeclaration = [{
-    int32_t getRank() {
-      return getBoxDim().size();
-    }
-  }];
-  let assemblyFormat = [{
-    $desc_ptr `,` $global_address `,`
-    `[` $box_dim `]` `,`
-    `[` $global_dim `]` `,`
-    `[` $global_stride `]` `,`
-    `[` $element_stride `]`
-    attr-dict `:` functional-type(operands, results)
-  }];
+  let summary = "Create a new TMA descriptor on device";
+  let arguments = (
+    ins
+    TT_PtrType:$desc_ptr,
+    TT_PtrType:$global_address,
+    Variadic<I32>:$box_dim,
+    Variadic<I32>:$global_dim,
+    Variadic<I64>:$global_stride,
+    Variadic<I32>:$element_stride,
+    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<12>]>:$elem_type,
+    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<2>]>:$interleave_layout,
+    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$swizzle_mode,
+    ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<1>]>:$fill_mode
+  );
+  let extraClassDeclaration = [{
+    int32_t getRank() {
+      return getBoxDim().size();
+    }
+  }];
+  let assemblyFormat = [{
+    $desc_ptr `,` $global_address `,`
+    `[` $box_dim `]` `,`
+    `[` $global_dim `]` `,`
+    `[` $global_stride `]` `,`
+    `[` $element_stride `]`
+    attr-dict `:` functional-type(operands, results)
+  }];
 
-  let hasVerifier = 1;
+  let hasVerifier = 1;
 }
 
 def TT_ExperimentalTensormapFenceproxyAcquireOp: TT_Op<
   "experimental_tensormap_fenceproxy_acquire",
   [MemoryEffects<[MemWrite<GlobalMemory>]>]
 > {
-  let summary = "Acquire fence on a tensormap object";
-  let arguments = (ins TT_PtrType:$desc_ptr);
-  let assemblyFormat = [{
-    $desc_ptr attr-dict `:` qualified(type($desc_ptr))
-  }];
+  let summary = "Acquire fence on a tensormap object";
+  let arguments = (ins TT_PtrType:$desc_ptr);
+  let assemblyFormat = [{
+    $desc_ptr attr-dict `:` qualified(type($desc_ptr))
+  }];
 }
 

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_LLVM_DEBUG_ONLY",
     "TRITON_ENABLE_ASAN",
+    "TRITON_OVERRIDE_NV_CAPABILITY",
     "USE_IR_LOC",
     "NVPTX_ENABLE_DUMP",
    "TRITON_INTEL_ADVANCED_PATH",

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 21 additions & 0 deletions
@@ -1117,6 +1117,27 @@ LogicalResult GatherOp::inferReturnTypes(
   return success();
 }
 
+// -- ExperimentalDesciptorLoadOp --
+static LogicalResult verifyDesciptorLoadStoreType(Operation *op,
+                                                  TensorDescType desc,
+                                                  RankedTensorType tensor) {
+  RankedTensorType block = desc.getBlockType();
+  if (block.getShape() == tensor.getShape() &&
+      block.getElementType() == tensor.getElementType())
+    return success();
+  return op->emitOpError("tensor desciptor block and tensor types must match");
+}
+
+LogicalResult ExperimentalDescriptorLoadOp::verify() {
+  return verifyDesciptorLoadStoreType(*this, getDesc().getType(), getType());
+}
+
+// -- ExperimentalDesciptorStoreOp --
+LogicalResult ExperimentalDescriptorStoreOp::verify() {
+  return verifyDesciptorLoadStoreType(*this, getDesc().getType(),
+                                      getSrc().getType());
+}
+
 // -- ExperimentalTensormapCreateOp --
 LogicalResult ExperimentalTensormapCreateOp::verify() {
   auto rank = getBoxDim().size();

python/test/unit/language/test_conversions.py

Lines changed: 2 additions & 2 deletions
@@ -303,8 +303,8 @@ def upcast_test(src_dtype, dst_dtype, exponent_bits, mantissa_bits, exponent_bia
 ])
 def test_typeconvert_upcast(src_dtype, dst_dtype, device):
 
-    # On HIP, fp8e4nv upcasting is only supported to bf16, and it's only supported on MI300.
-    if src_dtype == 'float8e4nv' and is_hip() and (dst_dtype != 'bfloat16' or not is_hip_mi300()):
+    # On HIP, fp8e4nv upcasting is only supported to bf16 and fp16, and it's only supported on MI300.
+    if src_dtype == 'float8e4nv' and is_hip() and (dst_dtype != 'bfloat16' or dst_dtype != 'float16' or not is_hip_mi300()):
         pytest.skip(f"upcasting {src_dtype} to {dst_dtype} not supported in this architecture")
 
     if ((src_dtype == 'float8e4nv' and is_cuda() and torch.cuda.get_device_capability(0) < (8, 9))

python/test/unit/language/test_core.py

Lines changed: 31 additions & 0 deletions
@@ -6136,6 +6136,37 @@ def mul_add(data):
     assert found_fma == enable_fp_fusion
 
 
+# -----------------------
+# test override_nv_compute_capability
+# -----------------------
+
+
+@pytest.mark.parametrize("nv_compute_capability", [70, 80, 90])
+@pytest.mark.parametrize("env_var_override", [False, True])
+def test_override_nv_compute_capability(nv_compute_capability, env_var_override, device):
+    if not is_cuda():
+        pytest.xfail('test_override_nv_compute_capability only for CUDA')
+
+    @triton.jit
+    def simple(data, out):
+        in_ptrs = data + tl.arange(0, 128)
+        out_ptrs = out + tl.arange(0, 128)
+        tl.store(out_ptrs, tl.load(in_ptrs) * 1.5 + 1.0)
+
+    data = torch.randn((128, ), device=device, dtype=torch.float32)
+    out = torch.empty_like(data)
+
+    if env_var_override:
+        os.environ["TRITON_OVERRIDE_NV_CAPABILITY"] = str(nv_compute_capability)
+        h = simple[(1, )](data, out)
+        os.environ.pop("TRITON_OVERRIDE_NV_CAPABILITY")
+    else:
+        h = simple[(1, )](data, out, override_nv_compute_capability=nv_compute_capability)
+    torch.testing.assert_close(data * 1.5 + 1.0, out)
+    ttgir_cc = re.search(r'cuda:(\d+)', h.asm["ttgir"])
+    assert int(ttgir_cc.group(1)) == nv_compute_capability
+
+
 # -----------------------
 # test propagate_nan
 # -----------------------

python/triton/language/core.py

Lines changed: 2 additions & 2 deletions
@@ -1265,7 +1265,7 @@ def __str__(self) -> str:
         return f"tensor_descriptor<{self.type}>"
 
     @builtin
-    def load(self, offsets: List[tensor], _builder=None) -> tensor:
+    def load(self, offsets: List[constexpr | tensor], _builder=None) -> tensor:
         """Load a block from the descriptor starting at the given element offsets.
 
         Values outside of the tensor bounds will be filled with zeros.
@@ -1275,7 +1275,7 @@ def load(self, offsets: List[tensor], _builder=None) -> tensor:
         return semantic.descriptor_load(self, offsets, "", "", _builder)
 
     @builtin
-    def store(self, offsets: List[tensor], value: tensor, _builder=None) -> tensor:
+    def store(self, offsets: List[constexpr | tensor], value: tensor, _builder=None) -> tensor:
         """Store a block from the descriptor starting at the given element offsets.
 
         Values outside of the tensor bounds will be ignored.
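
Read together with the new ops above, the updated type hints say that descriptor offsets may now be constexpr values as well as tensors. A hypothetical kernel fragment for illustration; how `desc` is obtained is outside this diff, and the block shape is made up:

import triton
import triton.language as tl


@triton.jit
def scale_block(desc, BLOCK_M: tl.constexpr):
    # `desc` is assumed to be an experimental tensor descriptor object visible
    # inside the kernel; its construction is not part of this commit.
    tile = desc.load([0, 0])              # lowers to tt.experimental_descriptor_load
    desc.store([BLOCK_M, 0], tile * 1.5)  # lowers to tt.experimental_descriptor_store

With the verifiers added in lib/Dialect/Triton/IR/Ops.cpp, a mismatch between the descriptor's block shape or element type and the loaded or stored tensor is now reported as a verification error rather than left as undefined behavior.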

python/triton/language/semantic.py

Lines changed: 3 additions & 2 deletions
@@ -1147,7 +1147,7 @@ def reinterpret_tensor_descriptor(desc_ptr: tl.tensor, block_ty: tl.block_type,
     return tl._experimental_tensor_descriptor_base(handle, block_ty)
 
 
-def descriptor_load(desc: tl.tensor, offsets, cache_modifier: str, eviction_policy: str,
+def descriptor_load(desc: tl._experimental_tensor_desciptor_base, offsets, cache_modifier: str, eviction_policy: str,
                     builder: ir.builder) -> tl.tensor:
     assert isinstance(desc, tl._experimental_tensor_descriptor_base)
     offsets = _convert_to_ir_values(builder, offsets, require_i64=False)
@@ -1156,7 +1156,8 @@ def descriptor_load(desc: tl.tensor, offsets, cache_modifier: str, eviction_poli
     return tl.tensor(x, desc.type)
 
 
-def descriptor_store(desc: tl.tensor, value: tl.tensor, offsets, builder: ir.builder) -> tl.tensor:
+def descriptor_store(desc: tl._experimental_tensor_descriptor_base, value: tl.tensor, offsets,
+                     builder: ir.builder) -> tl.tensor:
     assert isinstance(desc, tl._experimental_tensor_descriptor_base)
     offsets = _convert_to_ir_values(builder, offsets, require_i64=False)
     return tl.tensor(builder.create_descriptor_store(desc.handle, value.handle, offsets), tl.void)

test/Conversion/amd/math-denorm-handling.mlir

Lines changed: 64 additions & 3 deletions
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=True" --convert-builtin-func-to-llvm | FileCheck %s --check-prefix=LLVM_FTZ
-// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=False" --convert-builtin-func-to-llvm | FileCheck %s --check-prefix=LLVM_NO_FTZ
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=True" | FileCheck %s --check-prefixes=COMMON,LLVM_FTZ
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=False" | FileCheck %s --check-prefixes=COMMON,LLVM_NO_FTZ
 
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
@@ -16,7 +16,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_exp2(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+  tt.func public @test_exp(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
     // LLVM_FTZ: llvm.exp2.f32
     // LLVM_NO_FTZ: llvm.exp2.f32
     %0 = math.exp %arg0 : tensor<64xf32, #blocked>
@@ -35,3 +35,64 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @test_sqrt_f32(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+    // LLVM_FTZ-LABEL: test_sqrt_f32
+    // LLVM_FTZ-NOT: llvm.fcmp "ogt"
+    // LLVM_FTZ: llvm.amdgcn.sqrt.f32
+    // LLVM_FTZ-NOT: llvm.fmul
+    // LLVM_FTZ-NOT: llvm.select
+    //
+    // LLVM_NO_FTZ-LABEL: test_sqrt_f32
+    // LLVM_NO_FTZ: llvm.fcmp "ogt"
+    // LLVM_NO_FTZ: llvm.fmul
+    // LLVM_NO_FTZ-NEXT: llvm.select
+    // LLVM_NO_FTZ-NEXT: llvm.amdgcn.sqrt.f32
+    // LLVM_NO_FTZ: llvm.fmul
+    // LLVM_NO_FTZ-NEXT: llvm.select
+    %0 = math.sqrt %arg0 : tensor<64xf32, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @test_sqrt_rn_f32(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+    // LLVM_FTZ-LABEL: test_sqrt_rn_f32
+    // LLVM_FTZ: llvm.amdgcn.rsq.f32
+    // LLVM_FTZ: llvm.fmul
+    // LLVM_FTZ: llvm.fmul
+    // LLVM_FTZ: llvm.fneg
+    // LLVM_FTZ: llvm.intr.fma
+    // LLVM_FTZ-NEXT: llvm.intr.fma
+    // LLVM_FTZ-NEXT: llvm.intr.fma
+    // LLVM_FTZ-NEXT: llvm.fneg
+    // LLVM_FTZ-NEXT: llvm.intr.fma
+    // LLVM_FTZ-NEXT: llvm.intr.fma
+    // LLVM_FTZ-NEXT: llvm.intr.is.fpclass
+    // LLVM_FTZ-NEXT: llvm.select
+    //
+    // LLVM_NO_FTZ-LABEL: test_sqrt_rn_f32
+    // LLVM_NO_FTZ: llvm.intr.sqrt
+    %0 = tt.precise_sqrt %arg0 : tensor<64xf32, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @test_sqrt_rn_f64(%arg0: tensor<64xf64, #blocked>) attributes {noinline = false} {
+    // COMMON-LABEL: test_sqrt_rn_f64
+    // COMMON: llvm.intr.sqrt
+    %0 = tt.precise_sqrt %arg0 : tensor<64xf64, #blocked>
+    tt.return
+  }
+}
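
Background only, inferred from the LLVM_FTZ check lines rather than stated anywhere in this diff: the matched sequence (rsq, two fmul, fneg, a run of fma, is.fpclass, select) has the shape of the usual Newton-Raphson refinement of a hardware reciprocal-square-root estimate, roughly

$$y_0 \approx \frac{1}{\sqrt{x}}, \qquad g_0 = x\,y_0, \qquad h_0 = \tfrac{1}{2}\,y_0,$$
$$r = \mathrm{fma}(-h_0, g_0, \tfrac{1}{2}), \qquad g_1 = \mathrm{fma}(g_0, r, g_0), \qquad h_1 = \mathrm{fma}(h_0, r, h_0),$$
$$d = \mathrm{fma}(-g_1, g_1, x), \qquad \sqrt{x} \approx \mathrm{fma}(d, h_1, g_1),$$

with the trailing is.fpclass/select patching up zero and infinite inputs. The test only pins down the instruction mix, not the exact constants.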
