
Commit 428a24f

Merge commit 'c54f9882aff504bf2ab62d0ba037fb042204dc90'
2 parents: 14a7cf4 + c54f988

17 files changed: +250 / -271 lines

.github/workflows/integration-tests.yml

Lines changed: 7 additions & 3 deletions
@@ -226,7 +226,7 @@ jobs:
       - name: Install pip dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit
+          python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
       - name: Install Triton
         env:
           TRITON_BUILD_WITH_CCACHE: "true"
@@ -250,8 +250,9 @@ jobs:
             echo "Coult not find '${SHARED_LIB_DIR}'" ; exit -1
           fi
           cd python/test/unit
-          python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
+          python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
           python3 -m pytest -s -n 8 language/test_subprocess.py
+          python3 -m pytest -s -n 8 test_debug.py --forked
           # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
           TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
           # Run hopper/test_flashattention.py separately to avoid out of gpu memory
@@ -407,7 +408,10 @@ jobs:
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 16 language runtime \
-            --ignore=language/test_line_info.py
+            --ignore=language/test_line_info.py \
+            --ignore=test_debug.py
+          # TODO: uncomment
+          # pytest --capture=tee-sys -rfs test_debug.py
           TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
             pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

.github/workflows/integration-tests.yml.in

Lines changed: 7 additions & 3 deletions
@@ -256,7 +256,7 @@ jobs:
       - name: Install pip dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit
+          python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit

       - name: Install Triton
         env:
@@ -284,8 +284,9 @@ jobs:
             echo "Coult not find '${SHARED_LIB_DIR}'" ; exit -1
           fi
           cd python/test/unit
-          python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
+          python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
           python3 -m pytest -s -n 8 language/test_subprocess.py
+          python3 -m pytest -s -n 8 test_debug.py --forked
           # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
           TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
           # Run hopper/test_flashattention.py separately to avoid out of gpu memory
@@ -403,7 +404,10 @@ jobs:
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 16 language runtime \
-            --ignore=language/test_line_info.py
+            --ignore=language/test_line_info.py \
+            --ignore=test_debug.py
+          # TODO: uncomment
+          # pytest --capture=tee-sys -rfs test_debug.py
           TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
             pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py
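Context (not part of the diff): pytest-forked, added to the pip dependencies above, runs each test in its own forked child process. That isolation matters for test_debug.py because a failing tl.device_assert aborts the CUDA context, and forking keeps one aborted context from taking down the rest of the pytest session. A minimal hypothetical sketch of such a test follows; the kernel, tensor, and test name are illustrative, not taken from this commit.

# Hypothetical device-assert test that benefits from forked isolation.
import pytest
import torch
import triton
import triton.language as tl


@triton.jit
def _check_zero(x_ptr, N: tl.constexpr):
    idx = tl.arange(0, N)
    # Fires for every non-zero element when debug is enabled.
    tl.device_assert(tl.load(x_ptr + idx) == 0, "x != 0")


@pytest.mark.forked  # same isolation the workflow gets from `--forked`
def test_device_assert_passes():
    x = torch.zeros(128, dtype=torch.int32, device="cuda")
    _check_zero[(1, )](x, N=128, debug=True)
    torch.cuda.synchronize()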

python/test/unit/language/assert_helper.py

Lines changed: 0 additions & 154 deletions
This file was deleted.

python/test/unit/language/test_subprocess.py

Lines changed: 0 additions & 61 deletions
@@ -10,11 +10,6 @@

 dir_path = os.path.dirname(os.path.realpath(__file__))
 print_path = os.path.join(dir_path, "print_helper.py")
-assert_path = os.path.join(dir_path, "assert_helper.py")
-
-# TODO: bfloat16 after LLVM-15
-assert_types = ["device_assert", "device_assert_passes", "assert", "static_assert", "no_debug", "double_assert"]
-nested_types = [(caller, callee) for caller in ["true", "false", "none"] for callee in ["true", "false", "none"]]
 torch_types = ["int8", "uint8", "int16", "int32", "long", "float16", "float32", "float64"]


@@ -124,59 +119,3 @@ def test_print(func_type: str, data_type: str, device: str):
             continue
         print(f'Expected line "{line}" {expected_lines[line]} time(s), but saw {actual_lines[line]} time(s)')
     assert all(delta == 0 for delta in diff.values())
-
-
-@pytest.mark.parametrize("func_type", assert_types)
-def test_assert(func_type: str, device: str):
-    # The total number of elements in the 1-D tensor to assert on.
-    N = 128
-
-    proc = subprocess.run(
-        [sys.executable, assert_path, "test_assert", func_type, device],
-        capture_output=True,
-        env={**os.environ, "TRITON_DEBUG": "1"},
-    )
-    errs = proc.stderr.splitlines()
-    num_errs = 0
-    for err in errs:
-        if "x != 0" in err.decode("utf-8", errors="ignore"):
-            num_errs += 1
-
-    # Check for segfaults.
-    assert all("segmentation fault" not in line.decode("utf-8", errors="ignore").lower() for line in errs)
-
-    if func_type == "static_assert" or func_type == "device_assert_passes":
-        assert num_errs == 0
-    else:
-        assert num_errs == N - 1
-
-
-@pytest.mark.parametrize("caller_type, callee_type", nested_types)
-def test_assert_nested(caller_type, callee_type, device):
-    # The total number of elements in the 1-D tensor to assert on.
-    N = 128
-
-    proc = subprocess.run(
-        [sys.executable, assert_path, "test_assert_nested", caller_type, callee_type, device],
-        capture_output=True,
-    )
-    errs = proc.stderr.splitlines()
-    num_errs = 0
-    for err in errs:
-        if "x != 0" in err.decode("utf-8", errors="ignore"):
-            num_errs += 1
-    if caller_type == "none":
-        if callee_type == "true":
-            assert num_errs == N - 1
-        else:
-            assert num_errs == 0
-    elif caller_type == "true":
-        if callee_type == "false":
-            assert num_errs == 0
-        else:
-            assert num_errs == N - 1
-    elif caller_type == "false":
-        if callee_type == "true":
-            assert num_errs == N - 1
-        else:
-            assert num_errs == 0

python/test/unit/runtime/test_cache.py

Lines changed: 10 additions & 15 deletions
@@ -427,23 +427,18 @@ def kernel_add(a, b, o, N: tl.constexpr):
 def test_jit_debug(device) -> None:

     @triton.jit
-    def kernel_add(a, b, o, N: tl.constexpr):
-        idx = tl.arange(0, N)
-        tl.device_assert(idx < 32, "idx < 32")
-        tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx))
+    def kernel(tmp):
+        tl.device_assert(tl.load(tmp) == 1, "tmp == 1")

     device = getattr(torch, device).current_device()
-    assert len(kernel_add.cache[device]) == 0
-    kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 1
-    kernel_add.debug = False
-    kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 2
-    kernel_add.debug = True
-    kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
-    assert len(kernel_add.cache[device]) == 3
-    bins = list(kernel_add.cache[device].values())
-    assert bins[2].asm['ttir'] != bins[1].asm['ttir']
+    tmp = torch.tensor([1], dtype=torch.int32, device=device)
+    assert len(kernel.cache[device]) == 0
+    kernel[(1, )](tmp, debug=False)
+    assert len(kernel.cache[device]) == 1
+    kernel[(1, )](tmp, debug=True)
+    assert len(kernel.cache[device]) == 2
+    bins = list(kernel.cache[device].values())
+    assert bins[0].asm['ttir'] != bins[1].asm['ttir']


 @triton.jit
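Usage note (not part of the diff): the rewritten test_jit_debug toggles debug per launch instead of mutating kernel.debug between warmup calls, and checks that each setting gets its own cache entry with different TTIR. A minimal sketch of that pattern, with an illustrative kernel name:

# Per-launch debug toggle, as exercised by the new test_jit_debug.
import torch
import triton
import triton.language as tl


@triton.jit
def check_one(ptr):
    tl.device_assert(tl.load(ptr) == 1, "expected 1")


x = torch.tensor([1], dtype=torch.int32, device="cuda")
check_one[(1, )](x, debug=False)  # device assert compiled out
check_one[(1, )](x, debug=True)   # device assert kept in the generated IR
# Each debug setting now has its own entry in check_one.cache, with different 'ttir'.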
