[release/3.4] "[BACKEND] Workaround for ptxas bug in matrix descriptor arithmetic (triton-lang#7197)" (triton-lang#7389)

davidberard98 · ThomasRaoux · web-flow · commit 6e1dafab03cf · 2025-07-11T10:14:56.000-04:00
The previous code sequence was hitting a bug in ptxas.
Emitting this new code sequence should be cheaper and saves us from
hitting the ptxas bug.

Co-authored-by: Thomas Raoux <thomas.raoux@openai.com>
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/WGMMA.cpp
@@ -151,14 +151,10 @@ Value mlir::triton::NVIDIA::DotOpMmaV3SmemLoader::smemLoad(
   } else {
     off1 = tb.mul(tb.i32_val(elemBits / 8), offset);
   }
-  Value off_ = tb.zext(i64_ty, tb.udiv(off1, tb.i32_val(16)));
-
-  Value loadDesc = tb.add(descriptor, off_);
-  // Add the base at the end to make it easier to do loop invariant code
-  // motion.
-  loadDesc = tb.add(
-      loadDesc, tb.lshr(tb.shl(tb.ptrtoint(i64_ty, base), tb.int_val(64, 46)),
-                        tb.int_val(64, 50)));
+  Value smemBase = tb.ptrtoint(i32_ty, base);
+  smemBase = tb.add(smemBase, off1);
+  smemBase = tb.lshr(tb.and_(smemBase, tb.i32_val(0x3FFFF)), tb.i32_val(4));
+  Value loadDesc = tb.add(descriptor, tb.zext(i64_ty, smemBase));
   return loadDesc;
 }