Commit 682c23a

Merge commit 'e6b9efdff5c34c946990a0bd40c4b8ed02fe71fd'
2 parents: 0dce8de + e6b9efd

20 files changed (+232, -1467 lines)

Makefile

Lines changed: 5 additions & 4 deletions
@@ -30,13 +30,14 @@ test-cpp:
 
 .PHONY: test-python
 test-unit: all
-	cd python/test/unit && $(PYTEST) -s -n 8 --ignore=cuda/test_flashattention.py \
-		--ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
+	cd python/test/unit && $(PYTEST) -s -n 8 --ignore=language/test_line_info.py \
+		--ignore=language/test_subprocess.py --ignore=test_debug.py
 	$(PYTEST) -s -n 8 python/test/unit/language/test_subprocess.py
 	$(PYTEST) -s -n 8 python/test/unit/test_debug.py --forked
+	$(PYTEST) -s -n 8 python/triton_kernels/tests/
 	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
-	# Run cuda/test_flashattention.py separately to avoid out of gpu memory
-	$(PYTEST) -s python/test/unit/cuda/test_flashattention.py
+	# Run attention separately to avoid out of gpu memory
+	$(PYTEST) -s python/tutorials/06-fused-attention.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
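
For reference, the updated `test-unit` selection can also be reproduced directly with pytest. A minimal sketch, assuming pytest and pytest-xdist are installed and the commands are run from the repository root (the paths come from the Makefile hunk above):

```python
# Sketch: reproduce the new parts of the `test-unit` target outside of make.
import pytest

# New in this commit: the triton_kernels test suite is part of test-unit.
pytest.main(["-s", "-n", "8", "python/triton_kernels/tests/"])

# The flash-attention unit test is gone; the fused-attention tutorial runs
# instead, still on its own to avoid running out of GPU memory.
pytest.main(["-s", "python/tutorials/06-fused-attention.py"])
```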

README.md

Lines changed: 1 addition & 1 deletion
@@ -212,7 +212,7 @@ See [`python/triton/knobs.py`](python/triton/knobs.py) for the full list of conf
 - `TRITON_OVERRIDE_DIR` specifies the directory from which to load the IR/ptx/amdgcn files when `TRITON_KERNEL_OVERRIDE` is set to 1.
 - `TRITON_F32_DEFAULT` sets the default input precision of `tl.dot` when using 32-bit floats, which can be either `ieee`, `tf32`, or `tf32x3`.
 - `TRITON_FRONT_END_DEBUGGING=1` disables exception wrapping when an error occurs in the compiler frontend, allowing the full stack trace to be seen.
-- `TRITON_STRIP_DEBUG_INFO` removes all debug information from the module, including location information
+- `TRITON_DISABLE_LINE_INFO=1` removes all line information from the module
 
 N.B. Some of these environment variables don't have a knob in `knobs.py`-- those are only relevant to the C++ layer(s), hence they don't exist in the python layer.
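
The new bullet replaces the removed `TRITON_STRIP_DEBUG_INFO` entry with the existing `TRITON_DISABLE_LINE_INFO` knob. A minimal sketch of how the variable is typically used; only the environment variable itself comes from the README, the kernel below is a hypothetical example, and the variable must be set before the kernel is compiled:

```python
# Disable line info for everything compiled after this point.
import os
os.environ["TRITON_DISABLE_LINE_INFO"] = "1"

import triton
import triton.language as tl


@triton.jit
def copy_kernel(src_ptr, dst_ptr, n, BLOCK: tl.constexpr):
    # Hypothetical kernel used only to show where compilation happens.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)
```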

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 1 addition & 1 deletion
@@ -322,7 +322,7 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
 
       // ---- begin Ampere & Hopper ----
       if (mmaEnc.isAmpere() || mmaEnc.isHopper()) {
-        int perPhase = 128 / (shapePerCTA[order[0]] * 4 / dotOpEnc.getKWidth());
+        int perPhase = 128 / (std::max<int>(1, shapePerCTA[order[0]] * 4 / dotOpEnc.getKWidth()));
         perPhase = std::max<int>(perPhase, 1);
         std::vector<size_t> matShape = {8, 8, 4 * dotOpEnc.getKWidth()};
         int vecWidth = 32 / typeWidthInBit;
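
The only change here wraps the inner division in `std::max<int>(1, ...)`: when `shapePerCTA[order[0]] * 4` is smaller than `dotOpEnc.getKWidth()`, the integer division rounds to zero and the old expression divided 128 by zero. A sketch of the arithmetic in Python, with illustrative shapes that are assumptions rather than values taken from the commit:

```python
def per_phase(shape_per_cta_fast_dim: int, k_width: int) -> int:
    inner = shape_per_cta_fast_dim * 4 // k_width
    # Old code: 128 // inner, which divides by zero whenever
    # shape_per_cta_fast_dim * 4 < k_width.
    inner = max(1, inner)        # the new std::max<int>(1, ...) guard
    return max(128 // inner, 1)  # the pre-existing perPhase clamp still applies


assert per_phase(64, 8) == 4     # unchanged for typical shapes
assert per_phase(4, 32) == 128   # previously a division by zero
```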

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 22 additions & 27 deletions
@@ -1539,40 +1539,35 @@ std::optional<LinearLayout>
 chooseMfmaLikeStoreLayout(RankedTensorType valType) {
   auto mfmaLayout = cast<AMDMfmaEncodingAttr>(valType.getEncoding());
 
-  // Currently support transposed [B]F16 MFMA32x32 on CDNA4
+  // We currently only support transposed [B]F16 MFMA32x32 on CDNA4.
   bool isMfma32 = mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32;
   Type elemType = valType.getElementType();
   if (!(valType.getRank() == 2 && (elemType.isF16() || elemType.isBF16()) &&
        mfmaLayout.getVersionMajor() == 4 && mfmaLayout.getIsTransposed() &&
        isMfma32))
    return {};
 
-  MLIRContext *ctx = mfmaLayout.getContext();
-  StringAttr kRegister = S("register");
-  StringAttr kLane = S("lane");
-  StringAttr kWarp = S("warp");
-  StringAttr kBlock = S("block");
-
-  SmallVector<unsigned> order = getDefaultMmaOrder(mfmaLayout);
-  auto standardOutDims = standardOutDimNames(ctx, 2);
-  // We make each thread handle 8 consecutive elements to enable 128-bit
-  // global stores for [b]f16 types and keep the thread pattern in each lane
-  // similar to the canonical mfmaLayout.
-  LinearLayout mfma8Layout = LinearLayout::empty();
-  mfma8Layout =
-      LinearLayout({{kRegister, {{1, 0}, {2, 0}, {4, 0}}},
-                    {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
-                    {kWarp, {}},
-                    {kBlock, {}}},
-                   {standardOutDims[order[0]], standardOutDims[order[1]]});
-
-  LinearLayout warpLayout =
-      identityStandardND(kWarp, mfmaLayout.getWarpsPerCTA(), order);
-  LinearLayout ctaLayout = mfma8Layout.transposeOuts(standardOutDims) *
-                           warpLayout.transposeOuts(standardOutDims);
-  mfma8Layout = combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(),
-                                       valType.getShape());
-  return mfma8Layout;
+  auto valShape = valType.getShape();
+  LinearLayout mfmaLL = mfmaLayout.toLinearLayout(valShape);
+  auto mfmaOutDims = llvm::to_vector(mfmaLL.getOutDimNames());
+  StringAttr dimM = mfmaOutDims[0];
+  StringAttr dimN = mfmaOutDims[1];
+
+  auto swapLL = LinearLayout::empty();
+  // The rows are kept as is with an identity linear layout.
+  swapLL *= LinearLayout::identity1D(valShape[0], dimM, dimM);
+  // In transposed mfma32 layout, each thread holds 4 consecutive values along N
+  // dim. We want to exchange column 4-7 (owned by thread 32-63) and column 8-11
+  // (owned by thread 0-31) every 16 columns to make each thread holds 8
+  // elements. This would mean exchange the 2nd and 3rd basis vector from an
+  // identity linear layout.
+  std::vector<std::vector<int32_t>> dimNBases(mfmaLL.getOutDimSizeLog2(dimN));
+  std::generate(dimNBases.begin(), dimNBases.end(),
+                [i = 0]() mutable { return std::vector<int32_t>{1 << i++}; });
+  std::swap(dimNBases[2], dimNBases[3]);
+  swapLL *= LinearLayout({{dimN, dimNBases}}, {dimN});
+
+  return mfmaLL.compose(swapLL);
 }
 
 LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
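
The rewritten version derives the store layout from the canonical MFMA linear layout and composes it with a layout that swaps the 2nd and 3rd identity basis vectors along the N dimension. Since each basis vector corresponds to one bit of the column index, the swap exchanges columns 4-7 with columns 8-11 inside every 16-column tile, which (per the comment in the new code) is what turns each thread's two groups of 4 values into 8 contiguous elements. A small Python sketch of just that column permutation; the full `LinearLayout` machinery is not modeled:

```python
def swap_columns(col: int) -> int:
    # Bits 2 and 3 of the column index correspond to the swapped basis vectors.
    bit2 = (col >> 2) & 1          # basis 4: selects columns 4-7 in a 16-wide tile
    bit3 = (col >> 3) & 1          # basis 8: selects columns 8-11 in a 16-wide tile
    col &= ~0b1100                 # clear bits 2 and 3 ...
    return col | (bit2 << 3) | (bit3 << 2)   # ... and write them back swapped


# Columns 4-7 and 8-11 trade places inside each 16-column tile; columns 0-3 and
# 12-15 are unchanged.
assert [swap_columns(c) for c in range(16)] == [
    0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
]
```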

python/src/passes.cc

Lines changed: 0 additions & 1 deletion
@@ -25,7 +25,6 @@ void init_triton_analysis(py::module &&m) {
 
 void init_triton_passes_common(py::module &&m) {
   using namespace mlir;
-  ADD_PASS_WRAPPER_0("add_strip_debug_info", createStripDebugInfoPass);
   ADD_PASS_WRAPPER_0("add_sccp", createSCCPPass);
   ADD_PASS_WRAPPER_0("add_symbol_dce", createSymbolDCEPass);
   ADD_PASS_WRAPPER_0("add_inliner", createInlinerPass);

python/test/unit/conftest.py

Lines changed: 13 additions & 0 deletions
@@ -138,3 +138,16 @@ def fresh_knobs_except_libraries(monkeypatch):
         yield fresh_function()
     finally:
         reset_function()
+
+
+@pytest.fixture
+def with_allocator():
+    import triton
+    from triton.runtime._allocation import NullAllocator
+    from triton._internal_testing import default_alloc_fn
+
+    triton.set_allocator(default_alloc_fn)
+    try:
+        yield
+    finally:
+        triton.set_allocator(NullAllocator())
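
The new `with_allocator` fixture installs `default_alloc_fn` as Triton's global allocator for the duration of a test and restores the `NullAllocator` afterwards, whatever the test outcome. A minimal sketch of how a test would opt in; the test itself is hypothetical:

```python
def test_kernel_that_needs_scratch_memory(with_allocator):
    # While this test runs, triton.set_allocator(default_alloc_fn) is active,
    # so kernels that allocate device scratch memory can be launched here.
    ...
```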
