
Commit 6a4be78

Pipeline scale_dot (#4950)
We allow DotOperand layouts within MemoryOpToLLVM in the buggy Ampere case via linear layouts (LLs). This lets us remove two workarounds added in a previous PR. We add tests in test_pipeliner.py. We also remove some implementation-defined behaviour (overflows / NaNs) in test_core.py, making the tests more resilient and realistic.
1 parent 50080ef commit 6a4be78
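For orientation, here is a minimal sketch of the scaled-dot pattern the new pipelining tests exercise. It is illustrative only: the kernel name, block sizes, pointer layout, and launch are hypothetical; the tl.dot_scaled call mirrors the dot_scale_kernel in the test_core.py diff below (fp8/mxfp data stored as uint8, with one scale byte per 32 elements along K).

import torch
import triton
import triton.language as tl


@triton.jit
def scale_dot_sketch(a_ptr, scale_ptr, b_ptr, out_ptr,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    # a and b hold e4m3 values as raw uint8 bytes; scale holds one byte per 32 K-elements of a
    a = tl.load(a_ptr + offs_m[:, None] * BLOCK_K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * BLOCK_N + offs_n[None, :])
    scale = tl.load(scale_ptr + offs_m[:, None] * (BLOCK_K // 32) + tl.arange(0, BLOCK_K // 32)[None, :])
    c = tl.dot_scaled(a, scale, "e4m3", b, None, "e4m3")
    tl.store(out_ptr + offs_m[:, None] * BLOCK_N + offs_n[None, :], c.to(tl.bfloat16))


if torch.cuda.is_available():
    M, N, K = 32, 32, 64
    a = torch.randint(256, (M, K), dtype=torch.uint8, device="cuda")
    b = torch.randint(256, (K, N), dtype=torch.uint8, device="cuda")
    scale = torch.randint(128, (M, K // 32), dtype=torch.uint8, device="cuda")
    out = torch.empty((M, N), dtype=torch.bfloat16, device="cuda")
    scale_dot_sketch[(1, )](a, scale, b, out, BLOCK_M=M, BLOCK_N=N, BLOCK_K=K)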

5 files changed (+269, -71 lines)

lib/Conversion/TritonGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 0 additions & 10 deletions
@@ -90,16 +90,6 @@ void decomposeBlockedToDotLayoutConversion(ModuleOp module) {
     auto dstDotOp =
         dyn_cast<triton::gpu::DotOperandEncodingAttr>(dstType.getEncoding());
     if (srcBlocked && dstDotOp) {
-      // FIXME [Dot LL]
-      // We support this one via LLs, as the LocalLoad path is buggy
-      if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dstDotOp.getParent())) {
-        bool largeKWidth =
-            dstDotOp.getKWidth() * dstType.getElementTypeBitWidth() > 64;
-        if (mma.isAmpere() && largeKWidth) {
-          return;
-        }
-      }
-
       Attribute sharedMemorySpace =
           triton::gpu::SharedMemorySpaceAttr::get(srcType.getContext());
       auto tmpType = MemDescType::get(
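The guard removed above (and its twin removed from ReduceDataDuplication.cpp further down) keyed off the dot operand's kWidth times its element bit width. A tiny illustrative check of that arithmetic, with a hypothetical helper name:

# Illustrative only (hypothetical helper): the removed guard triggered whenever a dot
# operand's kWidth times its element bit width exceeded 64 bits.
def is_large_k_width(k_width: int, elem_bits: int) -> bool:
    return k_width * elem_bits > 64

assert is_large_k_width(8, 16)       # kWidth 8 with bf16 elements: 128 bits
assert not is_large_k_width(4, 16)   # kWidth 4 with bf16 elements: exactly 64 bits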

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 44 additions & 2 deletions
@@ -116,9 +116,20 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     RankedTensorType dstTy = op.getType();
     Attribute srcLayout = srcTy.getEncoding();
     Attribute dstLayout = dstTy.getEncoding();
+    // FIXME [Dot LL]
+    // Do for all DotOperandEncodingAttr once we have LLs for all of them
+    auto isAmpereLargeKWidth = [](Attribute layout) {
+      if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
+        if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
+          return mma.isAmpere() && dot.getKWidth() == 8;
+        }
+      }
+      return false;
+    };
     if (isa<SharedEncodingAttr>(srcLayout) &&
-        isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
-            dstLayout)) {
+        (isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
+             dstLayout) ||
+         isAmpereLargeKWidth(dstLayout))) {
       return lowerSharedToDistributed(op, adaptor, getTypeConverter(),
                                       rewriter);
     }
@@ -170,6 +181,37 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     SmallVector<Value> outVals = loadSharedToDistributed(
         dstTy, srcTy, elemLlvmTy, smemObj, loc, rewriter, targetInfo);

+    // FIXME [Dot LL]
+    // Ampere case
+    // In this case, we need to pack the outputs into i32
+    if (isa<DotOperandEncodingAttr>(dstTy.getEncoding())) {
+      if (elemLlvmTy.isInteger(8)) {
+        auto concat = [&](Value a1, Value a2, Value a3, Value a4) {
+          return or_(or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))),
+                     or_(shl(zext(i32_ty, a3), i32_val(16)),
+                         shl(zext(i32_ty, a4), i32_val(24))));
+        };
+        SmallVector<Value> outVals32(outVals.size() / 4);
+        for (int i = 0; i < outVals32.size(); ++i) {
+          outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1],
+                                outVals[4 * i + 2], outVals[4 * i + 3]);
+        }
+        outVals = outVals32;
+      } else {
+        assert(elemLlvmTy.isBF16() && "Unexpected element type");
+        auto concat = [&](Value a, Value b) {
+          return or_(zext(i32_ty, bitcast(a, i16_ty)),
+                     shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16)));
+        };
+
+        SmallVector<Value> outVals32(outVals.size() / 2);
+        for (int i = 0; i < outVals32.size(); ++i) {
+          outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]);
+        }
+        outVals = outVals32;
+      }
+    }
+
     Value result = packLLElements(loc, typeConverter, outVals, rewriter, dstTy);
     rewriter.replaceOp(op, result);
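To make the new packing concrete, here is a host-side sketch in plain Python (hypothetical helper names) of the lane concatenation performed before packLLElements: four 8-bit lanes, or two bf16 lanes reinterpreted as 16-bit patterns, end up in one 32-bit value, least-significant lane first.

def pack_i8x4(a1: int, a2: int, a3: int, a4: int) -> int:
    # mirrors the zext/shl/or_ chain for 8-bit elements: a1 lands in bits 0-7, a4 in bits 24-31
    return (a1 & 0xFF) | ((a2 & 0xFF) << 8) | ((a3 & 0xFF) << 16) | ((a4 & 0xFF) << 24)


def pack_bf16x2(lo_bits: int, hi_bits: int) -> int:
    # mirrors the bitcast-to-i16 + zext/shl/or_ chain for bf16 elements
    return (lo_bits & 0xFFFF) | ((hi_bits & 0xFFFF) << 16)


assert pack_i8x4(0x11, 0x22, 0x33, 0x44) == 0x44332211
assert pack_bf16x2(0x3F80, 0x4000) == 0x40003F80  # bit patterns of bf16 1.0 and 2.0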

lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp

Lines changed: 0 additions & 7 deletions
@@ -44,13 +44,6 @@ class TritonGPUReduceDataDuplicationPass
        return;
      if (!cvtNeedsSharedMemory(srcType, dstType))
        return;
-     // FIXME [Dot LL]
-     // We support this one via LLs, as the LocalLoad path is buggy
-     bool largeKWidth =
-         dstDotOp.getKWidth() * dstType.getElementTypeBitWidth() > 64;
-     if (largeKWidth) {
-       return;
-     }
      auto srcOrder = triton::gpu::getOrder(srcEncoding);
      auto rank = srcOrder.size();
      SmallVector<unsigned> sharedOrder;

python/test/unit/language/test_core.py

Lines changed: 46 additions & 28 deletions
@@ -3315,16 +3315,12 @@ def kernel(X, stride_xm, stride_xk, Y, stride_yk, stride_yn, W, stride_wn, strid
     assert 'wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3' in ptx


-@pytest.mark.parametrize("M, N, K, col_a, col_b, type_a, type_b, num_warps", [
-    (M, N, K, col_a, col_b, type_a, type_b, 4)
-    for M, N, K in itertools.product([32, 64, 128], [32, 64, 128], [64, 128])
-    for col_a, col_b in itertools.product([True, False], repeat=2)
-    # We don't test e5m2 as its range + the uniform sampling overflows easily
-    # Tested locally and it works fine other than for ~10 entries out of 10_000
-    # which are of the size of 10**30
-    for type_a in ["e2m1", "e4m3"]
-    for type_b in ["e4m3"]
-])
+@pytest.mark.parametrize("M, N, K, col_a, col_b, type_a, type_b, num_warps",
+                         [(M, N, K, col_a, col_b, type_a, type_b, 4)
+                          for M, N, K in itertools.product([32, 64, 128], [32, 64, 128], [64, 128])
+                          for col_a, col_b in itertools.product([True, False], repeat=2)
+                          for type_a in ["e2m1", "e4m3", "e5m2"]
+                          for type_b in ["e4m3", "e5m2"]])
 def test_scaled_dot(M, N, K, col_a, col_b, type_a, type_b, num_warps, device):
     if not is_cuda():
         pytest.skip("scaled_dot only supported on CUDA")
@@ -3355,7 +3351,7 @@ def dot_scale_kernel(a_base, stride_a0, stride_a1, a_scale, b_base, stride_b0, s
         a_scale = tl.load(scale_a_ptr)
         c = tl.dot_scaled(a, a_scale, type_a, b, None, type_b)
         out_ptr = out + tl.arange(0, BLOCK_M)[:, None] * BLOCK_N + tl.arange(0, BLOCK_N)[None, :]
-        tl.store(out_ptr, c)
+        tl.store(out_ptr, c.to(tl.bfloat16))

     @triton.jit
     def mxfp_to_bf16_kernel(
@@ -3431,7 +3427,6 @@ def dot_scale_ref(x, scale, y, type_x, type_y):
         type_fp8_y = {"e4m3": torch.float8_e4m3fn, "e5m2": torch.float8_e5m2}[type_y]

         comp_dtype = torch.bfloat16
-        out_dtype = torch.float32

         x = x.contiguous()
         x_upcast = x.new_empty(scale.shape[:-1] + (32 * scale.shape[-1], ), dtype=comp_dtype)
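For readers unfamiliar with the mxfp layout that dot_scale_ref upcasts, here is a rough sketch of the decoding it amounts to, assuming the usual E8M0 convention of one shared exponent byte with bias 127 per 32 elements; the helper name and shapes are illustrative, not the library's API.

import torch

def upcast_mxfp_block_sketch(decoded_elems: torch.Tensor, scale_byte: int) -> torch.Tensor:
    # decoded_elems: 32 element values already widened to bf16/fp32;
    # scale_byte: the block's shared uint8 exponent (assumed E8M0, bias 127)
    return decoded_elems * 2.0 ** (int(scale_byte) - 127)

block = torch.full((32, ), 0.5)
print(upcast_mxfp_block_sketch(block, 130))  # 0.5 * 2**3 == 4.0 for each element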
@@ -3440,42 +3435,65 @@ def dot_scale_ref(x, scale, y, type_x, type_y):
         BLOCK_SIZE = 512
         grid = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE, )
         mxfp_to_bf16_kernel[grid](x, scale, x_upcast, scale.numel(), e_bits, m_bits, BLOCK_SIZE, num_warps=num_warps)
+        assert x_upcast.isfinite().all()

         y_upcast = y.view(type_fp8_y).to(comp_dtype)
-        return torch.matmul(x_upcast.to(out_dtype), y_upcast.to(out_dtype))
+
+        class AccumulateInFp32:
+
+            def __enter__(self):
+                self.prev_value = torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
+                torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
+
+            def __exit__(self, exc_type, exc_val, exc_tb):
+                torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = self.prev_value
+
+        with AccumulateInFp32():
+            return torch.matmul(x_upcast.to(comp_dtype), y_upcast.to(comp_dtype))

     torch.manual_seed(0)

-    def create_uint8(shape, col_major=False):
+    def create_uint8(shape, col_major=False, max_val=255):
         if col_major:
             shape = shape[:-2] + (shape[-1], shape[-2])
-        ret = torch.randint(1 << 8, shape, dtype=torch.uint8, device=device)
+        ret = torch.randint(max_val + 1, shape, dtype=torch.uint8, device=device)
         if col_major:
             ret = ret.mT
         return ret

     DIV_FACTOR = 2 if type_a == "e2m1" else 1
     x = create_uint8((M, K // DIV_FACTOR), col_major=col_a)
     y = create_uint8((K, N), col_major=col_b)
-    scale_x = create_uint8((M, K // 32))

-    z = x.new_empty((M, N), dtype=torch.float32)
+    # sample scales that don't overflow as otherwise it's implementation defined (underflowing is alright)
+    # We substract a reasonably high number (64) so that the sum of all the mxfp elements does not overflow
+    m_bytes = int(type_a[1])
+    bias_type_a = 1 << (m_bytes - 1) - 1
+    max_exponent_type_a = (1 << m_bytes) - 1 - bias_type_a
+    scale_x = create_uint8((M, K // 32), max_val=255 - max_exponent_type_a - 64)
+
+    def make_finite(x, dtype):
+        # e5m2 has too many non-finite values when sampled uniformly (1 / 32) and
+        # Fp8E5M2_to_Bf16 doesn't preserve NaNs (fixme)
+        if dtype not in ("e5m2", "e4m3"):
+            return x
+        mask = 0x7C if dtype == "e5m2" else 0x7F
+        finite = torch.arange(x.numel(), device=device, dtype=torch.uint8).reshape_as(x) % mask
+        x_finite = torch.where(x & mask == mask, finite | (0x80 & x), x)
+        x.copy_(x_finite)
+        return x
+
+    x = make_finite(x, type_a)
+    y = make_finite(y, type_b)
+
+    z = x.new_empty((M, N), dtype=torch.bfloat16)
     pgm = dot_scale_kernel[(1, )](x, *x.stride(), scale_x, y, *y.stride(), z, M, N, K, type_a, type_b,
                                   num_warps=num_warps)

     z_ref = dot_scale_ref(x, scale_x, y, type_a, type_b)

-    # dot_scale_ref computes the result in higher precision
-    # so we equalise all the non-finite values
-    # This also fixes a bug in our upcasting from e5m2 to bf16 where inf is not preserved
-    non_finite_z = ~z.isfinite()
-    z_ref[non_finite_z] = z[non_finite_z]
-    non_finite_ref = ~z_ref.isfinite()
-    z[non_finite_ref] = z_ref[non_finite_ref]
-
-    # generous rtol set because the ref is more precise than the fused
-    # (computes in higher dtype) and we are sampling the whole range of floats
-    torch.testing.assert_close(z, z_ref, equal_nan=True, atol=1e-5, rtol=1e-2)
+    # generous rtol as we are sampling the whole range of floats
+    torch.testing.assert_close(z, z_ref, atol=1e-5, rtol=1e-2)

     # make sure ld/st are vectorized
     ptx = pgm.asm['ptx']
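As a standalone illustration of the non-finite filtering idea in make_finite above: in an fp8 byte, an all-ones exponent field marks Inf/NaN (e5m2, exponent mask 0x7C) or NaN (e4m3fn, mask 0x7F), so uniformly sampled bytes hit it fairly often. The snippet below (hypothetical helper, CPU-only) simply counts such patterns for e5m2.

import torch

def count_non_finite_e5m2(raw: torch.Tensor) -> int:
    exp_mask = 0x7C  # the five e5m2 exponent bits sit in bits 2..6
    return int(((raw & exp_mask) == exp_mask).sum())

torch.manual_seed(0)
raw = torch.randint(256, (10_000, ), dtype=torch.uint8)
print(count_non_finite_e5m2(raw))  # roughly 1/32 of uniformly sampled bytes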
