From d31ccfe5846be60b262adeace8e69ee92e82e808 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 25 Oct 2024 07:04:07 +0200 Subject: [PATCH 1/5] [NFC] Use `get_config_var('EXT_SUFFIX')` instead of using `so` directly (#4958) Change to improve platform independence. How does it work? On Windows: ```python >>> import sysconfig >>> sysconfig.get_config_var("EXT_SUFFIX") '.cp310-win_amd64.pyd' >>> sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] 'pyd' ``` On Linux: ```python >>> import sysconfig >>> sysconfig.get_config_var("EXT_SUFFIX") '.cpython-310-x86_64-linux-gnu.so' >>> sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] 'so' ``` --------- Signed-off-by: Anatoly Myachev --- python/triton/compiler/compiler.py | 4 +++- third_party/nvidia/backend/driver.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py index 8ca1f8b326..304b406974 100644 --- a/python/triton/compiler/compiler.py +++ b/python/triton/compiler/compiler.py @@ -15,6 +15,7 @@ import re import functools import os +import sysconfig # - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func, # and any following whitespace @@ -151,7 +152,8 @@ def triton_key(): # backend libtriton_hash = hashlib.sha256() - with open(os.path.join(TRITON_PATH, "_C/libtriton.so"), "rb") as f: + ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] + with open(os.path.join(TRITON_PATH, f"_C/libtriton.{ext}"), "rb") as f: while True: chunk = f.read(1024**2) if not chunk: diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py index 38ce62b0c2..fa832f68ef 100644 --- a/third_party/nvidia/backend/driver.py +++ b/third_party/nvidia/backend/driver.py @@ -1,5 +1,6 @@ import functools import os +import sysconfig import hashlib import subprocess import tempfile @@ -48,7 +49,8 @@ def library_dirs(): def compile_module_from_src(src, name): key = hashlib.sha256(src.encode("utf-8")).hexdigest() cache = get_cache_manager(key) - cache_path = cache.get_file(f"{name}.so") + ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1] + cache_path = cache.get_file(f"{name}.{ext}") if cache_path is None: with tempfile.TemporaryDirectory() as tmpdir: src_path = os.path.join(tmpdir, "main.c") @@ -56,7 +58,7 @@ def compile_module_from_src(src, name): f.write(src) so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries) with open(so, "rb") as f: - cache_path = cache.put(f.read(), f"{name}.so", binary=True) + cache_path = cache.put(f.read(), f"{name}.{ext}", binary=True) import importlib.util spec = importlib.util.spec_from_file_location(name, cache_path) mod = importlib.util.module_from_spec(spec) From 15c5e5519ff129104956ef0a726656f9e5cb4359 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Fri, 25 Oct 2024 09:12:23 -0700 Subject: [PATCH 2/5] [BACKEND] Improve detection of register to register conversion (#4991) Specifically, it fixes problems when `srcLayout` and `dstLayout` have different numbers of registers but the same number of non-free registers. We solve the problem by padding free registers onto either `srcLayout` or `dstLayout`; this could be improved further by fixing the `invertAndCompose` function itself.
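As a rough illustration of the padding (a standalone Python sketch with made-up names, not the Triton API; the actual change is the C++ `LinearLayout::resize` added below), growing an input dimension just appends zero basis vectors, i.e. free variables that map to output 0:

```python
# Toy model of a linear layout: a dict mapping each input dimension to its
# list of basis vectors. "Resizing" an input dimension appends zero vectors
# (free variables), which does not change where any existing input maps.
import math

def resize(bases, in_dim, new_size):
    assert new_size & (new_size - 1) == 0, "new size must be a power of 2"
    assert new_size >= 2 ** len(bases[in_dim]), "resize must not shrink the dim"
    num_out_dims = 2  # this toy example uses two output dimensions
    resized = {dim: list(vecs) for dim, vecs in bases.items()}
    while len(resized[in_dim]) < int(math.log2(new_size)):
        resized[in_dim].append([0] * num_out_dims)  # free variable -> output 0
    return resized

# srcLayout covers 2 registers (1 basis vector), dstLayout covers 4 (2 vectors).
src = {"register": [[0, 1]], "lane": [[1, 0], [2, 0]]}
dst = {"register": [[0, 1], [0, 2]], "lane": [[1, 0], [2, 0]]}

num_regs = max(2 ** len(src["register"]), 2 ** len(dst["register"]))
print(resize(src, "register", num_regs)["register"])  # [[0, 1], [0, 0]]
print(resize(dst, "register", num_regs)["register"])  # [[0, 1], [0, 2]] (unchanged)
```

With both layouts padded to `max(numSrcRegs, numDstRegs)`, the matrices produced by `invertAndCompose` have matching row counts, which is what `minimalCvtLayout` now arranges before composing them.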
--- include/triton/Analysis/Utility.h | 2 +- include/triton/Tools/LinearLayout.h | 7 ++ lib/Analysis/Utility.cpp | 42 ++++++++++- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 53 +++++++------ lib/Tools/LinearLayout.cpp | 15 ++++ test/Conversion/tritongpu_to_llvm.mlir | 74 +++++++++++++++++++ unittest/Tools/LinearLayoutTest.cpp | 33 +++++++++ 7 files changed, 202 insertions(+), 24 deletions(-) diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h index cb3e3d292e..4f6aff739c 100644 --- a/include/triton/Analysis/Utility.h +++ b/include/triton/Analysis/Utility.h @@ -212,7 +212,7 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy); bool atomicNeedsSharedMemory(Value result); -bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstT); +bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); bool isMfmaToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index c728cfbb32..47e3fca79b 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -679,6 +679,13 @@ class LinearLayout { // (i.e. every input bit affects the output). llvm::MapVector getFreeVariableMasks() const; + // Increase an input dimension without affecting the output dimension. The + // added free variables are mapped to 0, ensuring that the new input + // dimensions correspond directly to the existing output space. The function + // errors out if `newInDimSize` is less than the current size or the new size + // is not a power of 2. + LinearLayout resize(StringAttr inDim, int32_t newInDimSize) const; + std::string toString() const; friend bool operator==(LinearLayout lhs, LinearLayout rhs); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 4915d7b1ac..9782be48d7 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -536,7 +536,7 @@ bool supportMMA(Value value, int version) { (elemTy.isInteger(8) && version >= 2); } -bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy) { +bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy) { auto blockedLayout = dyn_cast(srcTy.getEncoding()); auto dotOperandLayout = dyn_cast(dstTy.getEncoding()); if (blockedLayout == nullptr || dotOperandLayout == nullptr) @@ -655,8 +655,46 @@ std::optional minimalCvtLayout(RankedTensorType srcTy, toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); if (!(srcLayout.has_value() && dstLayout.has_value())) return std::nullopt; + StringAttr kRegister = StringAttr::get(ctx, "register"); + StringAttr kLane = StringAttr::get(ctx, "lane"); + StringAttr kWarp = StringAttr::get(ctx, "warp"); + StringAttr kBlock = StringAttr::get(ctx, "block"); + auto numSrcRegs = srcLayout->getInDimSize(kRegister); + auto numDstRegs = dstLayout->getInDimSize(kRegister); + // The `invertAndCompose` function will generate a layout that is injective + // by assigning new output dimensions to free variables. For instance, + // consider a scenario where `srcLayout` has a free variable in the lane + // dimension, while `dstLayout` has two free variables in the lane + // dimension and also a larger number of registers. + // The injective form of `srcLayout` will add only a single additional row + // to the transformation matrix, whereas the injective form of `dstLayout` + // will add two additional rows. 
This discrepancy causes misleading results + // because the matrices end up with a different number of rows. + // + // Take `dstLayout ⋅ srcLayout^-1` as an example: + // + // - `injective(dstLayout)`: [n, m] → [n + 2, m] + // - `injective(srcLayout)`: [n, m] → [n + 1, m] + // - `injective(srcLayout)^-1`: [n + 1, m] → [m, n + 1] + // - `injective(dstLayout) ⋅ injective(srcLayout)^-1`: [n + 2, m] ⋅ [m, n + + // 1] → [n + 2, n + 1] + // + // Here, the `(n + 1)`-th row added by `dstLayout` represents the free + // variable in registers, and the `(n + 2)`-th row represents the free + // variable in lanes. However, the `(n + 1)`-th row added by `srcLayout` + // represents the free variable in lanes. As a result, the `(n + 1)`-th row + // in two layouts do not correspond to the same free variable. + // + // To address this issue, we pad the free variables in `srcLayout` and + // `dstLayout` to ensure they have the same number of registers. This + // guarantees that the resulting matrices have the same number of rows, + // ensuring consistency in the composition process. + auto numRegs = std::max(numSrcRegs, numDstRegs); + auto srcLayoutWithFreeRegs = srcLayout->resize(kRegister, numRegs); + auto dstLayoutWithFreeRegs = dstLayout->resize(kRegister, numRegs); // comp describes the layout function to create dst from src. - LinearLayout comp = dstLayout->invertAndCompose(*srcLayout); + LinearLayout comp = + dstLayoutWithFreeRegs.invertAndCompose(srcLayoutWithFreeRegs); // We try to quotient by the largest subspace first auto dims = SmallVector{"block", "warp", "lane", "register"}; for (auto dim : dims) { diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index a18b2cbc30..ea9091f4e1 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -288,49 +288,54 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion return rewriter.notifyMatchFailure( op, "NYI. srcTy and/or dstTy don't implement LLs yet"); } + LinearLayout srcLayout = + *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); + LinearLayout dstLayout = + *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); + + StringAttr kBlock = str_attr("block"); + StringAttr kWarp = str_attr("warp"); + StringAttr kLane = str_attr("lane"); + StringAttr kRegister = str_attr("register"); assert(to_vector(conversion->getInDimNames()) == to_vector(conversion->getOutDimNames())); auto dims = conversion->getInDimNames(); - if (llvm::is_contained(dims, str_attr("block"))) { + if (llvm::is_contained(dims, kBlock)) { // Case 1: Transfer between values in different CTAs. // This requires moving values through distributed shared memory. return rewriter.notifyMatchFailure( op, "NYI: Transfer between different CTAs"); - } else if (llvm::is_contained(dims, str_attr("warp"))) { + } else if (llvm::is_contained(dims, kWarp)) { // Case 2: Transfer between values in the same CTA, in which case we move // values through shared memory. - LinearLayout srcLayout = - *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); - LinearLayout dstLayout = - *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, str_attr("lane"))) { + } else if (llvm::is_contained(dims, kLane)) { // Case 3. 
Transfer between values in the same warp, in which case we try // to move values using warp shuffles, though if the pattern is // complicated enough we may fall back to using shared memory // TODO(Keren): implement warp shuffle instead of using the general // approach that uses shared memory - LinearLayout srcLayout = - *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); - LinearLayout dstLayout = - *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, str_attr("register"))) { + } else if (llvm::is_contained(dims, kRegister) || + dstLayout.getInDimSize(kRegister) != + srcLayout.getInDimSize(kRegister)) { // Case 4. Transfer between values in the same thread, in which case we // simply reorder the elements of adaptor.getSrc(). - return transferWithinThread(op, *conversion, adaptor, rewriter); + return transferWithinThread( + op, dstLayout.getFreeVariableMasks()[kRegister], + dstLayout.getInDimSize(kRegister), *conversion, adaptor, rewriter); } else { - // The two layouts are equivalent. We should probably remove these in - // RemoveLayoutConversion. + // Cast 5. The two layouts are equivalent. We should probably remove + // these in RemoveLayoutConversion. rewriter.replaceOp(op, adaptor.getSrc()); return success(); } } LogicalResult - transferWithinThread(ConvertLayoutOp op, const LinearLayout &conversion, - OpAdaptor adaptor, + transferWithinThread(ConvertLayoutOp op, int32_t regMasks, int32_t numRegs, + const LinearLayout &conversion, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { MLIRContext *ctx = op.getContext(); auto loc = op.getLoc(); @@ -338,10 +343,16 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion assert(!cvtNeedsSharedMemory(op.getSrc().getType(), op.getType())); auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter); - SmallVector outVals; - outVals.resize(conversion.getInDimSize(kRegister)); - for (int i = 0; i < conversion.getInDimSize(kRegister); i++) { - auto srcIdx = conversion.apply({{kRegister, i}}).begin()->second; + SmallVector outVals(numRegs); + for (int i = 0; i < outVals.size(); i++) { + // Remove free masks from the register index + // For example, if idx = 0b00111, and masks = 0b00100, then we get + // 0b00011. It means that register 7 (0b111) has the same value as + // register 3 (0b011). + auto idx = i & (~regMasks); + auto srcIdx = conversion.hasInDim(kRegister) + ? 
conversion.apply({{kRegister, idx}}).begin()->second + : idx; outVals[i] = inVals[srcIdx]; } Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter, diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index bf017f8c64..4319d1f086 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1016,6 +1016,21 @@ bool LinearLayout::equalIgnoringOutDimSizes(const LinearLayout &other) const { return true; } +LinearLayout LinearLayout::resize(StringAttr inDim, + int32_t newInDimSize) const { + BasesT bases = getBases(); + assert(bases.contains(inDim) && "inDim not in layout"); + assert(llvm::isPowerOf2_32(newInDimSize) && + "newInDimSize must be a power of 2"); + assert(newInDimSize >= getInDimSize(inDim) && + "newInDimSize must be >= old size"); + auto numFreeVariables = llvm::Log2_32(newInDimSize) - getInDimSizeLog2(inDim); + for (int i = 0; i < numFreeVariables; i++) { + bases[inDim].push_back(std::vector(getNumOutDims(), 0)); + } + return LinearLayout(std::move(bases), llvm::to_vector(getOutDimNames())); +} + std::string LinearLayout::toString() const { // Start with a newline because we print out a bulleted list; it doesn't // make sense for the first line of this list to be on the same line as diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index e1a2ec68bd..4a61ee4bc1 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -847,6 +847,80 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [1, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}> +#dot1 = #triton_gpu.dot_op<{opIdx=0, parent=#mma, kWidth=2}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { + // CHECK-LABEL: convert_layout_mmav2_dot_reg + tt.func @convert_layout_mmav2_dot_reg(%arg0: tensor<16x16xf16, #mma>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #dot1> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_0 + tt.func @convert_layout_mmav3_mmav3_0(%arg0: tensor<64x64xf16, #mma0>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma0> -> tensor<64x64xf16, #mma1> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_1 + tt.func @convert_layout_mmav3_mmav3_1(%arg0: tensor<64x64xf16, #mma1>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma1> -> tensor<64x64xf16, #mma0> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], 
instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_2 + tt.func @convert_layout_mmav3_mmav3_2(%arg0: tensor<16x16xf16, #mma1>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma1> -> tensor<16x16xf16, #mma0> + tt.return + } +} + +// ----- + +#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: convert_layout_mmav3_mmav3_3 + tt.func @convert_layout_mmav3_mmav3_3(%arg0: tensor<1x64xf16, #mma1>) { + // CHECK-NOT: st.shared + // CHECK-NOT: llvm.load + %0 = triton_gpu.convert_layout %arg0 : tensor<1x64xf16, #mma1> -> tensor<1x64xf16, #mma0> + tt.return + } +} + +// ----- + #blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { diff --git a/unittest/Tools/LinearLayoutTest.cpp b/unittest/Tools/LinearLayoutTest.cpp index f006447002..897172fd6d 100644 --- a/unittest/Tools/LinearLayoutTest.cpp +++ b/unittest/Tools/LinearLayoutTest.cpp @@ -747,6 +747,39 @@ TEST_F(LinearLayoutTest, QuotientIdentityMultipleDimensions) { ASSERT_TRUE(quotientLayout->quotient({S("dim2")}).has_value()); } +TEST_F(LinearLayoutTest, Resize) { + auto init = LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}}}, + {S("in1"), {{1, 0}, {2, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")}); + EXPECT_EQ(init.resize(S("in0"), 8), + LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}, {0, 0}}}, + {S("in1"), {{1, 0}, {2, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")})); + EXPECT_EQ(init.resize(S("in0"), 4), LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}}}, + {S("in1"), {{1, 0}, {2, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")})); + EXPECT_EQ(init.resize(S("in1"), 8), + LinearLayout( + { + {S("in0"), {{0, 1}, {0, 2}}}, + {S("in1"), {{1, 0}, {2, 0}, {0, 0}}}, + {S("in2"), {}}, + }, + {S("dim0"), S("dim1")})); +} + } // anonymous namespace } // namespace mlir::triton From 1918084cd4ce22c21476fd0366a40370f4b757ec Mon Sep 17 00:00:00 2001 From: David Berard Date: Fri, 25 Oct 2024 13:13:24 -0700 Subject: [PATCH 3/5] [TEST] float16 test for test_tensor_atomic_rmw (#4981) This adds float16 to the list of dtypes tested in test_tensor_atomic_rmw. Note that the numerics were previously bad for this test when run in float16; this PR "fixes" the numerics by internally doing the sum in float32 (upcast, sum, downcast). Since the purpose is to test the atomic_rmw, and the numerical issues of doing sums in low-precision dtypes are generally known, I think this strategy should be fine for this test.
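To see the numerical issue outside of Triton, here is a small NumPy-only sketch (not part of the patch) comparing a float16 accumulation with the upcast-sum-downcast strategy the test now uses on both the kernel side and the reference side:

```python
# Standalone sketch of why the sum is done in float32 for float16 inputs.
import numpy as np

rs = np.random.RandomState(17)
x = rs.uniform(-1, 1, size=(64, 64)).astype(np.float16)

sum_fp16 = x.sum(axis=0, dtype=np.float16)                      # accumulate in float16
sum_fp32 = x.astype(np.float32).sum(axis=0).astype(np.float16)  # upcast, sum, downcast

# The two reductions typically disagree in the low bits; doing the sum in
# float32 in both the kernel and the NumPy reference keeps them consistent
# enough for the rtol=1e-4 comparison in the test.
print(np.max(np.abs(sum_fp16.astype(np.float32) - sum_fp32.astype(np.float32))))
```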
--- python/test/unit/language/test_core.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index e044198046..182445836e 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -1453,17 +1453,29 @@ def kernel(X): for shape in [(2, 2), (2, 8), (8, 2), (8, 8), (32, 32), (64, 64)] for axis in [0, 1] for num_ctas in num_ctas_list - for dtype_x_str in ['float32', 'uint64', 'int64', 'float64']]) + for dtype_x_str in ['float16', 'float32', 'uint64', 'int64', 'float64']]) def test_tensor_atomic_rmw(shape, axis, num_ctas, dtype_x_str, device): + if is_interpreter() and dtype_x_str == 'float16': + pytest.skip('float16 atomic_add does not work in the interpreter mode') shape0, shape1 = shape # triton kernel @triton.jit - def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr): + def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr, DTYPE: tl.constexpr): off0 = tl.arange(0, SHAPE0) off1 = tl.arange(0, SHAPE1) x = tl.load(X + off0[:, None] * SHAPE1 + off1[None, :]) + + if DTYPE == tl.float16: + # sum can have bad numerics when accumulating in float16. + # if we're dealing with float16, do the sum in float32. + x = x.to(tl.float32) + z = tl.sum(x, axis=AXIS) + + if DTYPE == tl.float16: + z = z.to(DTYPE) + if AXIS == 1: old = tl.atomic_add(Z + off0, z) tl.store(OLD + off0, old) @@ -1477,13 +1489,23 @@ def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.const z = numpy_random(z_shape, dtype_str=dtype_x_str, rs=rs) old = np.zeros(z_shape, dtype=getattr(np, dtype_x_str)) # reference results - z_ref = z + np.sum(x, axis=axis, keepdims=False) + if x.dtype == np.float16: + # do the sum in float32 to reduce numerical variation + z_ref = z + np.sum(x.astype(np.float32), axis=axis, keepdims=False).astype(x.dtype) + else: + z_ref = z + np.sum(x, axis=axis, keepdims=False) old_ref = np.copy(z) # triton result x_tri = to_triton(x, device=device) z_tri = to_triton(z, device=device) old_tri = to_triton(old, device=device) - kernel[(1, )](z_tri, x_tri, old_tri, axis, shape0, shape1, num_ctas=num_ctas) + + def torch_to_triton_dtype(t): + if t == torch.float16: + return tl.float16 + return None + + kernel[(1, )](z_tri, x_tri, old_tri, axis, shape0, shape1, torch_to_triton_dtype(x_tri.dtype), num_ctas=num_ctas) np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=1e-4) np.testing.assert_equal(old_ref, to_numpy(old_tri)) From 78c8054298a81f578dcd8c79b519981c57dfb665 Mon Sep 17 00:00:00 2001 From: Ilya V <152324710+joviliast@users.noreply.github.com> Date: Sun, 27 Oct 2024 02:57:52 +0200 Subject: [PATCH 4/5] [AMD] Emit vectorized 16-bit float LLVM atomic ops (#4925) In the case of 16-bit float operands for tt::AtomicRMWOp, construct only one LLVM::AtomicRMWOp but use a vector of elements. This approach allows generating packed intrinsics and processing two elements at once. Added a lit test for the f16 vectorized case.
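For reference, the Triton-level pattern this lowering targets looks roughly like the hypothetical kernel below (illustrative only, not taken from the patch): with a blocked layout that gives each thread two contiguous f16 elements, the masked `tl.atomic_add` is the kind of op that can now be emitted as a single `llvm.atomicrmw fadd` on a `vector<2xf16>` instead of two scalar atomics.

```python
# Hypothetical example kernel; the lit tests added below exercise the same
# lowering directly at the TritonGPU-to-LLVM level.
import triton
import triton.language as tl

@triton.jit
def atomic_add_f16_kernel(out_ptr, x_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    # With two contiguous f16 values per thread, this atomic add is a candidate
    # for the packed vector<2xf16> atomicrmw on AMD hardware.
    tl.atomic_add(out_ptr + offs, x, mask=mask)
```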
--- test/Conversion/amd/tritongpu_to_llvm.mlir | 32 +++++++++++++++ .../TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp | 41 +++++++++---------- 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index ef67338457..de0eb140e2 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -62,3 +62,35 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } + +// ----- + +#blocked1 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: atomic_add_f16 + tt.func @atomic_add_f16(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1 : tensor<256xi1, #blocked1>, %arg2 : tensor<256xf16, #blocked1>) { + %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked1> + %base_ptr = tt.splat %arg0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked1> + %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr, #blocked1>, tensor<256xi32, #blocked1> + // CHECK: llvm.cond_br + // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xf16> + %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr, #blocked1>, tensor<256xf16, #blocked1>, tensor<256xi1, #blocked1>) -> tensor<256xf16, #blocked1> + tt.return + } +} + +// ----- + +#blocked2 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: atomic_add_bf16 + tt.func @atomic_add_bf16(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1 : tensor<256xi1, #blocked2>, %arg2 : tensor<256xbf16, #blocked2>) { + %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked2> + %base_ptr = tt.splat %arg0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked2> + %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr, #blocked2>, tensor<256xi32, #blocked2> + // CHECK: llvm.cond_br + // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xbf16> + %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr, #blocked2>, tensor<256xbf16, #blocked2>, tensor<256xi1, #blocked2>) -> tensor<256xbf16, #blocked2> + tt.return + } +} diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp index a45efd4a79..5265f631ad 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -768,7 +768,11 @@ struct AtomicRMWOpConversion // tensor if (tensorTy) { auto valTy = cast(val.getType()); - vec = std::min(vec, valTy.getElementType().isF16() ? 2 : 1); + Type elTy = valTy.getElementType(); + vec = std::min(vec, llvm::isa(elTy) && + elTy.getIntOrFloatBitWidth() == 16 + ? 2 + : 1); // mask numElems = tensorTy.getNumElements(); } @@ -783,13 +787,22 @@ struct AtomicRMWOpConversion auto vecTy = vec_ty(valueElemTy, vec); auto retType = vec == 1 ? valueElemTy : vecTy; SmallVector resultVals(elemsPerThread); - const bool f16v2 = vec == 2 && valueElemTy.isF16(); for (size_t i = 0; i < elemsPerThread; i += vec) { Value rmwPtr = ptrElements[i]; // TODO: in case llMask is zero we can create only one branch for all // elemsPerThread. Value rmwMask = llMask ? 
and_(mask, maskElements[i]) : mask; + Value operand; + if (vec == 1) { + operand = valElements[i]; + } else { + operand = undef(vecTy); + for (size_t ii = 0; ii < vec; ++ii) + operand = + insert_element(vecTy, operand, valElements[i + ii], i32_val(ii)); + } + Value undefVal = undef(retType); // Build blocks to bypass the atomic instruction for ~rmwMask. auto *curBlock = rewriter.getInsertionBlock(); @@ -806,25 +819,11 @@ struct AtomicRMWOpConversion auto maybeKind = matchAtomicOp(atomicRmwAttr); // TODO: use rocdl.raw.buffer.atomic from ROCDL dialect to use efficient // atomics for MI-* series of AMD GPU. - Value atom = rewriter - .create( - loc, *maybeKind, rmwPtr, valElements[i], - atomicMemOrdering, StringRef("agent")) - .getResult(); - - // NV for the f16v2 case generates one packed instruction. We have to - // create two separate instructions since LLVM::AtomicRMWOp doesn't - // support this. Can be optimized out with rocdl.raw.buffer.atomic. - if (f16v2) { - Value atom2 = - rewriter - .create( - loc, *maybeKind, ptrElements[i + 1], valElements[i + 1], - atomicMemOrdering, StringRef("agent")) - .getResult(); - auto tmp = insert_element(vecTy, undef(vecTy), atom, i32_val(0)); - atom = insert_element(vecTy, tmp, atom2, i32_val(1)).getResult(); - } + Value atom = + rewriter + .create(loc, *maybeKind, rmwPtr, operand, + atomicMemOrdering, StringRef("agent")) + .getResult(); if (!tensorTy) { if (atomicNeedsSharedMemory(op.getResult())) { Value atomPtr = From 77f98f00658988d59626b50bf946159b3a3ad5f3 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 31 Oct 2024 14:00:25 +0000 Subject: [PATCH 5/5] Revert "[BACKEND] Improve detection of register to register conversion (#4991)" This reverts commit 15c5e5519ff129104956ef0a726656f9e5cb4359. --- include/triton/Analysis/Utility.h | 2 +- include/triton/Tools/LinearLayout.h | 7 -- lib/Analysis/Utility.cpp | 42 +---------- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 53 ++++++------- lib/Tools/LinearLayout.cpp | 15 ---- test/Conversion/tritongpu_to_llvm.mlir | 74 ------------------- unittest/Tools/LinearLayoutTest.cpp | 33 --------- 7 files changed, 24 insertions(+), 202 deletions(-) diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h index 4f6aff739c..cb3e3d292e 100644 --- a/include/triton/Analysis/Utility.h +++ b/include/triton/Analysis/Utility.h @@ -212,7 +212,7 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy); bool atomicNeedsSharedMemory(Value result); -bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); +bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstT); bool isMfmaToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy); diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index 47e3fca79b..c728cfbb32 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -679,13 +679,6 @@ class LinearLayout { // (i.e. every input bit affects the output). llvm::MapVector getFreeVariableMasks() const; - // Increase an input dimension without affecting the output dimension. The - // added free variables are mapped to 0, ensuring that the new input - // dimensions correspond directly to the existing output space. The function - // errors out if `newInDimSize` is less than the current size or the new size - // is not a power of 2. 
- LinearLayout resize(StringAttr inDim, int32_t newInDimSize) const; - std::string toString() const; friend bool operator==(LinearLayout lhs, LinearLayout rhs); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 339377f3ca..f51b3c1657 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -543,7 +543,7 @@ bool supportMMA(Value value, int version) { (elemTy.isInteger(8) && version >= 2); } -bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy) { +bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy) { auto blockedLayout = dyn_cast(srcTy.getEncoding()); auto dotOperandLayout = dyn_cast(dstTy.getEncoding()); if (blockedLayout == nullptr || dotOperandLayout == nullptr) @@ -662,46 +662,8 @@ std::optional minimalCvtLayout(RankedTensorType srcTy, toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); if (!(srcLayout.has_value() && dstLayout.has_value())) return std::nullopt; - StringAttr kRegister = StringAttr::get(ctx, "register"); - StringAttr kLane = StringAttr::get(ctx, "lane"); - StringAttr kWarp = StringAttr::get(ctx, "warp"); - StringAttr kBlock = StringAttr::get(ctx, "block"); - auto numSrcRegs = srcLayout->getInDimSize(kRegister); - auto numDstRegs = dstLayout->getInDimSize(kRegister); - // The `invertAndCompose` function will generate a layout that is injective - // by assigning new output dimensions to free variables. For instance, - // consider a scenario where `srcLayout` has a free variable in the lane - // dimension, while `dstLayout` has two free variables in the lane - // dimension and also a larger number of registers. - // The injective form of `srcLayout` will add only a single additional row - // to the transformation matrix, whereas the injective form of `dstLayout` - // will add two additional rows. This discrepancy causes misleading results - // because the matrices end up with a different number of rows. - // - // Take `dstLayout ⋅ srcLayout^-1` as an example: - // - // - `injective(dstLayout)`: [n, m] → [n + 2, m] - // - `injective(srcLayout)`: [n, m] → [n + 1, m] - // - `injective(srcLayout)^-1`: [n + 1, m] → [m, n + 1] - // - `injective(dstLayout) ⋅ injective(srcLayout)^-1`: [n + 2, m] ⋅ [m, n + - // 1] → [n + 2, n + 1] - // - // Here, the `(n + 1)`-th row added by `dstLayout` represents the free - // variable in registers, and the `(n + 2)`-th row represents the free - // variable in lanes. However, the `(n + 1)`-th row added by `srcLayout` - // represents the free variable in lanes. As a result, the `(n + 1)`-th row - // in two layouts do not correspond to the same free variable. - // - // To address this issue, we pad the free variables in `srcLayout` and - // `dstLayout` to ensure they have the same number of registers. This - // guarantees that the resulting matrices have the same number of rows, - // ensuring consistency in the composition process. - auto numRegs = std::max(numSrcRegs, numDstRegs); - auto srcLayoutWithFreeRegs = srcLayout->resize(kRegister, numRegs); - auto dstLayoutWithFreeRegs = dstLayout->resize(kRegister, numRegs); // comp describes the layout function to create dst from src. 
- LinearLayout comp = - dstLayoutWithFreeRegs.invertAndCompose(srcLayoutWithFreeRegs); + LinearLayout comp = dstLayout->invertAndCompose(*srcLayout); // We try to quotient by the largest subspace first auto dims = SmallVector{"block", "warp", "lane", "register"}; for (auto dim : dims) { diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index e19bc696d2..43c7095b21 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -288,54 +288,49 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion return rewriter.notifyMatchFailure( op, "NYI. srcTy and/or dstTy don't implement LLs yet"); } - LinearLayout srcLayout = - *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); - LinearLayout dstLayout = - *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); - - StringAttr kBlock = str_attr("block"); - StringAttr kWarp = str_attr("warp"); - StringAttr kLane = str_attr("lane"); - StringAttr kRegister = str_attr("register"); assert(to_vector(conversion->getInDimNames()) == to_vector(conversion->getOutDimNames())); auto dims = conversion->getInDimNames(); - if (llvm::is_contained(dims, kBlock)) { + if (llvm::is_contained(dims, str_attr("block"))) { // Case 1: Transfer between values in different CTAs. // This requires moving values through distributed shared memory. return rewriter.notifyMatchFailure( op, "NYI: Transfer between different CTAs"); - } else if (llvm::is_contained(dims, kWarp)) { + } else if (llvm::is_contained(dims, str_attr("warp"))) { // Case 2: Transfer between values in the same CTA, in which case we move // values through shared memory. + LinearLayout srcLayout = + *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); + LinearLayout dstLayout = + *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, kLane)) { + } else if (llvm::is_contained(dims, str_attr("lane"))) { // Case 3. Transfer between values in the same warp, in which case we try // to move values using warp shuffles, though if the pattern is // complicated enough we may fall back to using shared memory // TODO(Keren): implement warp shuffle instead of using the general // approach that uses shared memory + LinearLayout srcLayout = + *toLinearLayout(srcTy.getShape(), srcTy.getEncoding()); + LinearLayout dstLayout = + *toLinearLayout(dstTy.getShape(), dstTy.getEncoding()); return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter); - } else if (llvm::is_contained(dims, kRegister) || - dstLayout.getInDimSize(kRegister) != - srcLayout.getInDimSize(kRegister)) { + } else if (llvm::is_contained(dims, str_attr("register"))) { // Case 4. Transfer between values in the same thread, in which case we // simply reorder the elements of adaptor.getSrc(). - return transferWithinThread( - op, dstLayout.getFreeVariableMasks()[kRegister], - dstLayout.getInDimSize(kRegister), *conversion, adaptor, rewriter); + return transferWithinThread(op, *conversion, adaptor, rewriter); } else { - // Cast 5. The two layouts are equivalent. We should probably remove - // these in RemoveLayoutConversion. + // The two layouts are equivalent. We should probably remove these in + // RemoveLayoutConversion. 
rewriter.replaceOp(op, adaptor.getSrc()); return success(); } } LogicalResult - transferWithinThread(ConvertLayoutOp op, int32_t regMasks, int32_t numRegs, - const LinearLayout &conversion, OpAdaptor adaptor, + transferWithinThread(ConvertLayoutOp op, const LinearLayout &conversion, + OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { MLIRContext *ctx = op.getContext(); auto loc = op.getLoc(); @@ -343,16 +338,10 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion assert(!cvtNeedsSharedMemory(op.getSrc().getType(), op.getType())); auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter); - SmallVector outVals(numRegs); - for (int i = 0; i < outVals.size(); i++) { - // Remove free masks from the register index - // For example, if idx = 0b00111, and masks = 0b00100, then we get - // 0b00011. It means that register 7 (0b111) has the same value as - // register 3 (0b011). - auto idx = i & (~regMasks); - auto srcIdx = conversion.hasInDim(kRegister) - ? conversion.apply({{kRegister, idx}}).begin()->second - : idx; + SmallVector outVals; + outVals.resize(conversion.getInDimSize(kRegister)); + for (int i = 0; i < conversion.getInDimSize(kRegister); i++) { + auto srcIdx = conversion.apply({{kRegister, i}}).begin()->second; outVals[i] = inVals[srcIdx]; } Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter, diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 4319d1f086..bf017f8c64 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1016,21 +1016,6 @@ bool LinearLayout::equalIgnoringOutDimSizes(const LinearLayout &other) const { return true; } -LinearLayout LinearLayout::resize(StringAttr inDim, - int32_t newInDimSize) const { - BasesT bases = getBases(); - assert(bases.contains(inDim) && "inDim not in layout"); - assert(llvm::isPowerOf2_32(newInDimSize) && - "newInDimSize must be a power of 2"); - assert(newInDimSize >= getInDimSize(inDim) && - "newInDimSize must be >= old size"); - auto numFreeVariables = llvm::Log2_32(newInDimSize) - getInDimSizeLog2(inDim); - for (int i = 0; i < numFreeVariables; i++) { - bases[inDim].push_back(std::vector(getNumOutDims(), 0)); - } - return LinearLayout(std::move(bases), llvm::to_vector(getOutDimNames())); -} - std::string LinearLayout::toString() const { // Start with a newline because we print out a bulleted list; it doesn't // make sense for the first line of this list to be on the same line as diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index 4a61ee4bc1..e1a2ec68bd 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -847,80 +847,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [1, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}> -#dot1 = #triton_gpu.dot_op<{opIdx=0, parent=#mma, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { - // CHECK-LABEL: convert_layout_mmav2_dot_reg - tt.func @convert_layout_mmav2_dot_reg(%arg0: tensor<16x16xf16, #mma>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #dot1> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = 
#triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_0 - tt.func @convert_layout_mmav3_mmav3_0(%arg0: tensor<64x64xf16, #mma0>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma0> -> tensor<64x64xf16, #mma1> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_1 - tt.func @convert_layout_mmav3_mmav3_1(%arg0: tensor<64x64xf16, #mma1>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<64x64xf16, #mma1> -> tensor<64x64xf16, #mma0> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_2 - tt.func @convert_layout_mmav3_mmav3_2(%arg0: tensor<16x16xf16, #mma1>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma1> -> tensor<16x16xf16, #mma0> - tt.return - } -} - -// ----- - -#mma0 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}> - -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: convert_layout_mmav3_mmav3_3 - tt.func @convert_layout_mmav3_mmav3_3(%arg0: tensor<1x64xf16, #mma1>) { - // CHECK-NOT: st.shared - // CHECK-NOT: llvm.load - %0 = triton_gpu.convert_layout %arg0 : tensor<1x64xf16, #mma1> -> tensor<1x64xf16, #mma0> - tt.return - } -} - -// ----- - #blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { diff --git a/unittest/Tools/LinearLayoutTest.cpp b/unittest/Tools/LinearLayoutTest.cpp index 897172fd6d..f006447002 100644 --- a/unittest/Tools/LinearLayoutTest.cpp +++ b/unittest/Tools/LinearLayoutTest.cpp @@ -747,39 +747,6 @@ TEST_F(LinearLayoutTest, QuotientIdentityMultipleDimensions) { ASSERT_TRUE(quotientLayout->quotient({S("dim2")}).has_value()); } -TEST_F(LinearLayoutTest, Resize) { - auto init = LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}}}, - {S("in1"), {{1, 0}, {2, 0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")}); - EXPECT_EQ(init.resize(S("in0"), 8), - LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}, {0, 0}}}, - {S("in1"), {{1, 0}, {2, 0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")})); - EXPECT_EQ(init.resize(S("in0"), 4), LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}}}, - {S("in1"), {{1, 0}, {2, 
0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")})); - EXPECT_EQ(init.resize(S("in1"), 8), - LinearLayout( - { - {S("in0"), {{0, 1}, {0, 2}}}, - {S("in1"), {{1, 0}, {2, 0}, {0, 0}}}, - {S("in2"), {}}, - }, - {S("dim0"), S("dim1")})); -} - } // anonymous namespace } // namespace mlir::triton