11"""
2+ Mixed precision tests for matmul (tl.dot) with cast (tl.to)
3+
24issue: https://github.com/triton-lang/triton/issues/2523
3- fused type convert and matmul, base on triton matmul, the different with matmul:
4- 1. force C's dtype=dot_out_dtype to ["float16", "float32"]
5- 2. accept A and B with dtype=["float32", "float64"]
65
6+ TODO: float8 types
77"""
8+
89import pytest
910import torch
1011
12+ import triton
1113import triton .language as tl
12- from triton import cdiv , jit
1314
14- input_dtypes = ["float32" , "float64" ]
15+ input_dtypes = ["float16" , " float32" , "float64" ]
1516out_dtypes = ["float16" , "float32" ]
1617
1718
+@triton.jit
+def matmul_kernel(A, B, C, M, N, K,  #
+                  stride_am, stride_ak,  #
+                  stride_bk, stride_bn,  #
+                  stride_cm, stride_cn,  #
+                  dot_out_dtype: tl.constexpr,  #
+                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,  #
+                  BLOCK_K: tl.constexpr, GROUP_M: tl.constexpr):
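+    # A and B are the input operands (their element types may differ); C is
+    # the output buffer whose element type is the conversion target;
+    # dot_out_dtype is the accumulator precision used by tl.dot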
+    # matrix multiplication
+    pid = tl.program_id(0)
+    grid_m = tl.cdiv(M, BLOCK_M)
+    grid_n = tl.cdiv(N, BLOCK_N)
+    # re-order program ID for better L2 performance
+    width = GROUP_M * grid_n
+    group_id = pid // width
+    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
+    pid_m = group_id * GROUP_M + (pid % group_size)
+    pid_n = (pid % width) // (group_size)
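+    # consecutive program IDs now cover a GROUP_M-tall column of output tiles,
+    # so they reuse the same B tiles while those are still resident in L2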
+    # do matrix multiplication
+    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+    rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
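+    # tl.multiple_of/tl.max_contiguous are compiler hints: they promise that
+    # these index ranges are aligned and contiguous, enabling vectorized loads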
+    rk = tl.arange(0, BLOCK_K)
+    # pointers
+    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=dot_out_dtype)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        k_remaining = K - k * BLOCK_K
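+        # pad the K-tail with zeros so masked-off elements do not contribute
+        # to the dot product; the padding value is created in C's element type
+        # to match the cast applied below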
+        _0 = tl.zeros((1, 1), dtype=C.dtype.element_ty)
+        a = tl.load(A, mask=rk[None, :] < k_remaining, other=_0)
+        b = tl.load(B, mask=rk[:, None] < k_remaining, other=_0)
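+        # the cast under test: convert both operands to the output element
+        # type before tl.dot, fusing the type conversion into the matmul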
+        a = a.to(C.dtype.element_ty)
+        b = b.to(C.dtype.element_ty)
+        acc += tl.dot(a, b, out_dtype=dot_out_dtype)
+        A += BLOCK_K * stride_ak
+        B += BLOCK_K * stride_bk
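+    # accumulation ran in dot_out_dtype; convert once at the end for the store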
+    acc = acc.to(C.dtype.element_ty)
+    # rematerialize rm and rn to save registers
+    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn)
+    mask = (rm < M)[:, None] & (rn < N)[None, :]
+    tl.store(C, acc, mask=mask)
+
+
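+# The test below checks the kernel against a PyTorch reference; a sketch of
+# the expected comparison (assuming the elided setup casts first and then
+# multiplies, not the verbatim test code):
+#   out_torch = torch.matmul(a.to(out_dtype), b.to(out_dtype))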
 @pytest.mark.parametrize("M, K, N, w_dtype, x_dtype, out_dtype",
                          [(M, K, N, w, x, o)  #
                           for (M, K, N) in [(128, 128, 128), (1280, 768, 1024)]  #
                           for w in input_dtypes  #
                           for x in input_dtypes  #
                           for o in out_dtypes])
 def test_cast_matmul(M, K, N, w_dtype, x_dtype, out_dtype):
     if x_dtype == w_dtype:
-        pytest.skip("skip same dtype")
+        pytest.skip("skip the same input dtype")
     device = torch.cuda.current_device()
     x_dtype = getattr(torch, x_dtype)
     w_dtype = getattr(torch, w_dtype)
@@ -36,53 +84,7 @@ def test_cast_matmul(M, K, N, w_dtype, x_dtype, out_dtype):
 
     # launch kernel
     BLOCK_M, BLOCK_N, BLOCK_K = 16, 16, 32
-    grid = ((cdiv(M, BLOCK_M) * cdiv(N, BLOCK_N)), 1)
-
-    @jit
-    def matmul_kernel(A, B, C, M, N, K,  #
-                      stride_am, stride_ak,  #
-                      stride_bk, stride_bn,  #
-                      stride_cm, stride_cn,  #
-                      dot_out_dtype: tl.constexpr,  #
-                      BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,  #
-                      BLOCK_K: tl.constexpr, GROUP_M: tl.constexpr):
-        # matrix multiplication
-        pid = tl.program_id(0)
-        grid_m = tl.cdiv(M, BLOCK_M)
-        grid_n = tl.cdiv(N, BLOCK_N)
-        # re-order program ID for better L2 performance
-        width = GROUP_M * grid_n
-        group_id = pid // width
-        group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
-        pid_m = group_id * GROUP_M + (pid % group_size)
-        pid_n = (pid % width) // (group_size)
-        # do matrix multiplication
-        rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-        rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-        ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
-        rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
-        rk = tl.arange(0, BLOCK_K)
-        # pointers
-        A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
-        B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
-        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=dot_out_dtype)
-        for k in range(0, tl.cdiv(K, BLOCK_K)):
-            k_remaining = K - k * BLOCK_K
-            _0 = tl.zeros((1, 1), dtype=C.dtype.element_ty)
-            a = tl.load(A, mask=rk[None, :] < k_remaining, other=_0)
-            b = tl.load(B, mask=rk[:, None] < k_remaining, other=_0)
-            a = a.to(C.dtype.element_ty)
-            b = b.to(C.dtype.element_ty)
-            acc += tl.dot(a, b, out_dtype=dot_out_dtype)
-            A += BLOCK_K * stride_ak
-            B += BLOCK_K * stride_bk
-        acc = acc.to(C.dtype.element_ty)
-        # rematerialize rm and rn to save registers
-        rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-        rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-        C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn)
-        mask = (rm < M)[:, None] & (rn < N)[None, :]
-        tl.store(C, acc, mask=mask)
+    grid = ((triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N)), 1)
 
     matmul_kernel[grid](
         a, b, out_triton, M, N, K,  #