[AMD] Fix gfx12 warp size and fix wmma in maybeDeduplicate (#4912)

AlexAUT · web-flow · commit a60fa8c0bc90 · 2024-10-15T10:00:54.000-07:00
This adds a missing gfx12 exception to the warp size and fixes the dot test for m
or n > 32 when using wmma.
diff --git a/include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h b/include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h
@@ -88,10 +88,11 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
       // encoding not available
       return resultVals;
     Attribute baseEncoding = encoding;
-    if (isa<AMDMfmaEncodingAttr>(baseEncoding))
-      // TODO: this logic seems incorrect for mfma layout. Skip for now.
-      // We saw mismatches for some flash-attention tests on AMD backend.
-      // Note that this logic works for sliced layout whose parent is
+    if (isa<AMDMfmaEncodingAttr>(baseEncoding) ||
+        isa<AMDWmmaEncodingAttr>(baseEncoding))
+      // TODO: this logic seems incorrect for mfma and wmma layout. Skip for
+      // now. We saw mismatches for some flash-attention and dot tests on AMD
+      // backend. Note that this logic works for sliced layout whose parent is
       // mfma layout. Therefore, this is not combined with the following check.
       return resultVals;
     while (auto sliced = dyn_cast<SliceEncodingAttr>(baseEncoding))
diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
@@ -58,7 +58,7 @@ def __post_init__(self):
         default_libdir = Path(__file__).parent / 'lib'
         extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
         # Ignore user-defined warp size for gfx9
-        warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch else 64
+        warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch or 'gfx12' in self.arch else 64
         object.__setattr__(self, 'warp_size', warp_size)
         libs = ["ocml", "ockl"]
         for lib in libs: