
Commit 609e327

Merge commit '5d84a9122b519251d1453fc7e7f31e2e304dc1d6'
2 parents: 633d32d + 5d84a91

25 files changed: +300 −383 lines


CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -89,10 +89,6 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-if(NOT WIN32)
-  find_library(TERMINFO_LIBRARY tinfo)
-endif()
-
 if(TRITON_BUILD_UT)
   # This is an aggregate target for all unit tests.
   add_custom_target(TritonUnitTests)

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 26 deletions
@@ -528,32 +528,6 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
                   triton::gpu::PaddedSharedEncodingAttr layout,
                   unsigned bitwidth, Value smemOffset, bool offsetInBytes);
 
-// Emits IR to load data from shared memory into registers, or to store data
-// from registers into shared memory.
-//
-// You supply perVectorCallback, which is called once per group of register
-// elements to transfer. You can use this callback to emit IR to load or store
-// data from or to shared memory.
-//
-// elemLlvmTy should be dstTy's element type converted to an LLVM-dialect type.
-//
-// If maxVecElems is provided, we won't vectorize more than this many elements.
-//
-// Returns true on success.
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
-    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
-    const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    Value laneId, Value warpId,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
 // Close cousin of lowerLdStMatrix in MemoryOpToLLVM.cpp
 // We might want to merge them at some point, but having to support
 // ldmatrix.trans makes the code in lowerLdStMatrix a bit specific
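
Editorial note: per the removed doc comment, callers of this helper supplied a perVectorCallback that was invoked once per vectorized group of register elements with the computed shared-memory address. The fragment below is a minimal, hypothetical usage sketch, not code from this commit; it assumes the Triton lowering utilities referenced above (SharedMemoryObject, TargetInfoBase) and the standard MLIR LLVM-dialect builders.

// Illustrative sketch only (not part of this commit): load a tensor's elements
// from shared memory into individual register Values via the callback API.
static LogicalResult loadSharedToRegisters(
    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
    Type elemLlvmTy, const SharedMemoryObject &smemObj, Location loc,
    RewriterBase &rewriter, const TargetInfoBase &targetInfo,
    SmallVectorImpl<Value> &loadedVals) {
  bool ok = emitTransferBetweenRegistersAndShared(
      registerTy, sharedTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj,
      loc, rewriter, targetInfo,
      [&](VectorType vecTy, Value shmemAddr) {
        // One vector load per group of contiguous register elements; the
        // callback decides whether the transfer is a load or a store.
        Value vec = rewriter.create<LLVM::LoadOp>(loc, vecTy, shmemAddr);
        for (int i = 0; i < vecTy.getNumElements(); ++i) {
          Value idx = rewriter.create<LLVM::ConstantOp>(
              loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(i));
          loadedVals.push_back(
              rewriter.create<LLVM::ExtractElementOp>(loc, vec, idx));
        }
      });
  // The helper returns false for cases it cannot handle, e.g. cross-CTA access.
  return success(ok);
}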

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 104 deletions
@@ -706,110 +706,6 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
                  maybeMaxVecElems, localLoadOp);
 }
 
-bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    Value laneId, Value warpId,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  MLIRContext *ctx = rewriter.getContext();
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-
-  StringAttr kBlock = str_attr("block");
-  StringAttr kRegister = str_attr("register");
-  StringAttr kLane = str_attr("lane");
-  StringAttr kWarp = str_attr("warp");
-  StringAttr kOffset = str_attr("offset");
-
-  auto shape = sharedTy.getShape();
-  auto paddedEnc =
-      dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
-  LinearLayout regToSharedLayout = LinearLayout::empty();
-  if (paddedEnc) {
-    const auto &sharedLL = paddedEnc.getLinearComponent();
-    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
-  } else {
-    auto sharedLL = triton::gpu::toLinearLayout(sharedTy);
-    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
-  }
-
-  // TODO(jlebar): We don't currently support loading from shared memory in a
-  // different CTA. We'd need to emit `mapa.shared::cluster` instructions.
-  if (regToSharedLayout.hasInDim(kBlock) &&
-      regToSharedLayout.hasOutDim(kBlock) &&
-      !regToSharedLayout.isTrivialOver({kBlock})) {
-    return false;
-  }
-
-  // Determine how many consecutive registers map to consecutive shmem elements
-  // in out-dimension offsetN. This is our load instruction's vector width.
-  //
-  // It's OK if the vector width we choose here is wider than the hardware
-  // supports; LLVM will legalize it.
-  int vecElems =
-      std::min({regToSharedLayout.getNumConsecutiveInOut(),
-                maxVecElems.value_or(std::numeric_limits<int>::max())});
-  if (paddedEnc) {
-    vecElems = std::min(vecElems, int(paddedEnc.getMinInterval()));
-  }
-
-  auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
-  Value blockId =
-      withCTAOffset ? target.getClusterCTAId(rewriter, loc) : b.i32_val(0);
-
-  int numElems = regToSharedLayout.getInDimSize(kRegister);
-  auto vecTy = vec_ty(elemLlvmTy, vecElems);
-  SmallVector<uint32_t> regIds;
-  for (int i = 0; i < numElems / vecElems; i++) {
-    regIds.push_back(i * vecElems);
-  }
-
-  auto smemBase = smemObj.getBase();
-
-  auto indicesVec = applyLinearLayoutVec(loc, rewriter, regToSharedLayout,
-                                         {{kRegister, b.i32_val(0)},
-                                          {kLane, laneId},
-                                          {kWarp, warpId},
-                                          {kBlock, blockId}},
-                                         regIds);
-
-  // Compute affine offset given by memdesc_subslice
-  auto offset = smemObj.getShmemOffset(loc, rewriter, sharedTy);
-  SmallVector<Value> vecAddrVec;
-  for (auto &indices : indicesVec) {
-    Value smemOffset = indices[0].second;
-    smemOffset = b.xor_(smemOffset, offset);
-    if (paddedEnc) {
-      // Apply the offset needed for padding.
-      auto bitwidth = elemLlvmTy.getIntOrFloatBitWidth();
-      Value padOffset = emitPadding(loc, rewriter, paddedEnc, bitwidth,
-                                    smemOffset, /*offsetInBytes=*/false);
-      smemOffset = b.add(smemOffset, padOffset);
-    }
-    auto vecAddr = b.gep(smemBase.getType(), elemLlvmTy, smemBase, smemOffset,
-                         LLVM::GEPNoWrapFlags::inbounds);
-    vecAddrVec.push_back(vecAddr);
-  }
-
-  for (Value &vecAddr : vecAddrVec) {
-    perVectorCallback(vecTy, vecAddr);
-  }
-  return true;
-}
-
-bool emitTransferBetweenRegistersAndShared(
-    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
-    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
-    const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  auto regLayout = triton::gpu::toLinearLayout(registerTy);
-  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
-  return emitTransferBetweenRegistersAndShared(
-      regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
-      target, laneId, warpId, perVectorCallback);
-}
-
 SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
                                     RewriterBase &rewriter) {
   assert(bool(llvmStruct) && "can not unpack null values");

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 26 additions & 0 deletions
@@ -73,6 +73,31 @@ bool isConvertTrivial(ConvertLayoutOp op) {
 // Canonicalizer
 //===----------------------------------------------------------------------===//
 
+// tmem_store(cvt) -> tmem_store
+struct CanonicalizeConvertFromTMEMStore
+    : public mlir::OpRewritePattern<nvidia_gpu::TMEMStoreOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(nvidia_gpu::TMEMStoreOp op,
+                  PatternRewriter &rewriter) const override {
+    auto convert = op.getSrc().getDefiningOp<ConvertLayoutOp>();
+    if (!convert)
+      return failure();
+
+    // bail for incompatible layouts
+    auto cvtSrcType = convert.getSrc().getType();
+    if (!nvidia_gpu::isDistributedLayoutTMemCompatible(
+            op.getOperation(), cvtSrcType, op.getDst().getType())) {
+      return failure();
+    }
+
+    rewriter.modifyOpInPlace(
+        op, [&]() { op.getSrcMutable().assign(convert.getSrc()); });
+    return mlir::success();
+  }
+};
+
 // reshape(cvt) -> reshape
 struct CanonicalizeConvertFromReshape
     : public mlir::OpRewritePattern<triton::ReshapeOp> {
@@ -373,6 +398,7 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<CanonicalizeConvertFromAlloc>(context);
   patterns.add<CanonicalizeConvertFromLocalStore>(context);
   patterns.add<CanonicalizeConvertFromSplit>(context);
+  patterns.add<CanonicalizeConvertFromTMEMStore>(context);
 }
 
 LogicalResult Fp4ToFpOp::verify() {
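
Editorial note: the new pattern rewrites a tmem_store whose source comes through a convert_layout to store the convert's operand directly, whenever isDistributedLayoutTMemCompatible accepts the original layout, so the layout conversion can fold away. As a hypothetical sketch of where it takes effect, canonicalization collects these patterns via ConvertLayoutOp::getCanonicalizationPatterns and runs MLIR's greedy rewrite driver; the driver call below is illustrative only, not part of this commit.

// Illustrative sketch only: collect the ConvertLayoutOp canonicalization
// patterns (which now include CanonicalizeConvertFromTMEMStore) and apply them
// greedily, so tmem_store(convert_layout(x)) becomes tmem_store(x) when the
// source layout is TMEM-compatible.
static LogicalResult runConvertLayoutCanonicalization(ModuleOp module) {
  MLIRContext *ctx = module.getContext();
  RewritePatternSet patterns(ctx);
  triton::gpu::ConvertLayoutOp::getCanonicalizationPatterns(patterns, ctx);
  return applyPatternsAndFoldGreedily(module.getOperation(),
                                      std::move(patterns));
}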

python/src/gluon_ir.cc

Lines changed: 7 additions & 0 deletions
@@ -763,6 +763,13 @@ void init_gluon_ir(py::module &&m) {
             self.create<ttag::BufferLoadToLocalOp>(
                 dest, ptr, offsets, mask, other, stride, cacheModifier);
           })
+      .def("create_make_tensor_descriptor",
+           [](TritonOpBuilder &self, Type resultTy, Value &base,
+              std::vector<Value> &shape, std::vector<Value> &strides,
+              tt::PaddingOption paddingOption) -> Value {
+             return self.create<tt::MakeTensorDescOp>(resultTy, base, shape,
+                                                      strides, paddingOption);
+           })
       .def("create_async_tdm_copy_global_to_local",
            [](GluonOpBuilder &self, Value descPtr, std::vector<Value> &indices,
               Value result) {

python/test/gluon/test_frontend.py

Lines changed: 2 additions & 2 deletions
@@ -2763,12 +2763,12 @@ def test_amd_tdm(target):
     %c128_i32 = arith.constant 128 : i32
     %c128_i64 = arith.constant 128 : i64
     %c1_i64 = arith.constant 1 : i64
-    %0 = tt.make_tensor_descriptor %arg0, [%c32_i32, %c128_i32], [%c128_i64, %c1_i64] : <f16>, <tensor<16x64xf16>>
+    %0 = tt.make_tensor_descriptor %arg0, [%c32_i32, %c128_i32], [%c128_i64, %c1_i64] : <f16>, <tensor<16x64xf16, #shared>>
     %1 = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable>
     %c0_i32 = arith.constant 0 : i32
     %c2_i32 = arith.constant 2 : i32
     %true = arith.constant true
-    %2 = amdgpu.async_tdm_copy_global_to_local %0[%c0_i32, %c2_i32] into %1, %true : !tt.tensordesc<tensor<16x64xf16>> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable>
+    %2 = amdgpu.async_tdm_copy_global_to_local %0[%c0_i32, %c2_i32] into %1, %true : !tt.tensordesc<tensor<16x64xf16, #shared>> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable>
     %3 = amdgpu.async_tdm_wait {num = 0 : i32}
     %4 = ttg.local_load %1 : !ttg.memdesc<16x64xf16, #shared, #smem, mutable> -> tensor<16x64xf16, #blocked>
     tt.return

python/test/unit/test_debuginfo.py

Lines changed: 54 additions & 29 deletions
@@ -1,40 +1,65 @@
 import os
-import subprocess
 
-all_names = ["offsets", "pid", "block_start", "mask", "x", "y", "output"]
+import pytest
+import torch
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def add_kernel(
+    x_ptr,
+    y_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+    output = x + y
+    tl.store(output_ptr + offsets, output, mask=mask)
 
 
 def checkDbgInfo(llir, hasDbgInfo):
     assert hasDbgInfo == ('dbg_value' in llir)
-    for name in all_names:
+    for name in ["offsets", "pid", "block_start", "mask", "x", "y", "output"]:
         assert hasDbgInfo == ('!DILocalVariable(name: \"' + name + '\"' in llir)
 
 
-def test_triton_debuginfo_on():
-    lineInfoKey = "TRITON_DISABLE_LINE_INFO"
-    diLocalVarKey = "LLVM_EXTRACT_DI_LOCAL_VARIABLES"
+@pytest.mark.parametrize("lineInfoKey, diLocalVarKey, hasDbgInfo", [
+    (None, None, False),
+    # expect dbginfo based on parent proccess' TRITON_DISABLE_LINE_INFO
+    (None, "1", "infer"),
+    ("0", "1", True),
+    ("1", "1", False),
+    ("0", "0", False),
+    ("1", "0", False),
+])
+def test_triton_debuginfo_on(lineInfoKey, diLocalVarKey, hasDbgInfo, device, monkeypatch):
+    lineInfoKeyName = "TRITON_DISABLE_LINE_INFO"
+    diLocalVarKeyName = "LLVM_EXTRACT_DI_LOCAL_VARIABLES"
+    if lineInfoKey is not None:
+        monkeypatch.setenv(lineInfoKeyName, lineInfoKey)
+    if diLocalVarKey is not None:
+        monkeypatch.setenv(diLocalVarKeyName, diLocalVarKey)
 
     isEnvSet = lambda env, str: env.get(str, None) is not None
-    hasOrigLineInfo = (not isEnvSet(os.environ, lineInfoKey)
-                       or os.environ[lineInfoKey].lower() not in ["on", "true", "1"])
-    envs = [
-        # expect no dbginfo if unset
-        {lineInfoKey: None, diLocalVarKey: None, "hasDbgInfo": False},
-        # expect dbginfo based on parent proccess' TRITON_DISABLE_LINE_INFO
-        {lineInfoKey: None, diLocalVarKey: "1", "hasDbgInfo": hasOrigLineInfo},
-        {lineInfoKey: "0", diLocalVarKey: "1", "hasDbgInfo": True},
-        {lineInfoKey: "1", diLocalVarKey: "1", "hasDbgInfo": False},
-        {lineInfoKey: "0", diLocalVarKey: "0", "hasDbgInfo": False},
-        {lineInfoKey: "1", diLocalVarKey: "0", "hasDbgInfo": False},
-    ]
-
-    _run_test = lambda test_env: subprocess.run([
-        "python3", os.path.dirname(os.path.realpath(__file__)) + "/test_debuginfo_helper.py"
-    ], env=test_env, capture_output=True, text=True)
-    for env in envs:
-        test_env = os.environ.copy()
-        test_env["TRITON_ALWAYS_COMPILE"] = "1"
-        for entry in env:
-            if not isEnvSet(env, entry): continue
-            test_env[entry] = str(env[entry])
-        checkDbgInfo(str(_run_test(test_env).stdout), hasDbgInfo=env["hasDbgInfo"])
+    if hasDbgInfo == "infer":
+        hasDbgInfo = (not isEnvSet(os.environ, lineInfoKeyName)
+                      or os.environ[lineInfoKeyName].lower() not in ["on", "true", "1"])
+
+    size = 98432
+    torch.manual_seed(0)
+    x = torch.rand(size, device=device)
+    y = torch.rand(size, device=device)
+    output = torch.empty_like(x)
+    n_elements = output.numel()
+    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
+    add_kernel.device_caches.clear()
+    h = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+    checkDbgInfo(h.asm['llir'], hasDbgInfo)
python/test/unit/test_debuginfo_helper.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

python/triton/experimental/gluon/language/_core.py

Lines changed: 0 additions & 4 deletions
@@ -509,10 +509,6 @@ def warp_specialize(default_args, default_partition, worker_args, worker_partiti
     """
     worker_num_warps = [_unwrap_if_constexpr(w) for w in worker_num_warps]
    worker_num_regs = [_unwrap_if_constexpr(r) for r in worker_num_regs]
-    if not isinstance(default_args, tuple):
-        default_args = (default_args, )
-    if not isinstance(worker_args, tuple):
-        worker_args = (worker_args, )
     return _semantic.warp_specialize(default_args, default_partition, worker_args, worker_partitions, worker_num_warps,
                                      worker_num_regs, _generator)
 
python/triton/experimental/gluon/language/_semantic.py

Lines changed: 4 additions & 0 deletions
@@ -420,6 +420,10 @@ def gather(self, src: TensorTy, index: TensorTy, axis: int) -> TensorTy:
     def warp_specialize(self, default_args, default_partition, worker_args, worker_partitions,
                         worker_num_warps: Sequence[int], worker_num_regs: Sequence[int], generator):
         num_partitions = len(worker_partitions)
+        _check(isinstance(default_args, (tuple, ttgl.tuple)),
+               lambda: f"default_args must be a tuple of arguments, but got {type(default_args)}")
+        _check(isinstance(worker_args, (tuple, ttgl.tuple)),
+               lambda: f"worker_args must be a tuple of arguments, but got {type(worker_args)}")
         assert num_partitions == len(
             worker_num_warps
         ), f"warp specialize got {num_partitions} partitions but {len(worker_num_warps)} warp counts"
