Skip to content

Commit d5ee4fc

Browse files
Merge OpenAI Triton commit cf0db92 (#4902)
This PR changes the Triton base from b3b9931 to cf0db92 (Jul 29). Pass rate: 98.83%
2 parents 563c2c1 + 0710c39 commit d5ee4fc

File tree

20 files changed

+801
-177
lines changed

20 files changed

+801
-177
lines changed

.github/workflows/integration-tests-amd.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,9 @@ jobs:
141141
cd ../../triton_kernels/
142142
python3 -m pytest -s -n 12 tests/
143143
fi
144-
144+
- name: Run distributed tests
145+
run: |
146+
make test-distributed
145147
- name: Run asan tests on AMD
146148
if: false
147149
run: |

.github/workflows/integration-tests-nvidia.yml

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,16 @@ jobs:
7070
- name: Update PATH
7171
run: |
7272
echo "$HOME/.local/bin" >> $GITHUB_PATH
73+
- name: Setup Python environment for GB200
74+
if: ${{ matrix.runner[0] == 'nvidia-gb200' }}
75+
run: |
76+
echo "/venv/bin" >> $GITHUB_PATH
77+
echo "VIRTUAL_ENV=/venv" >> $GITHUB_ENV
78+
echo "PYTHONHOME=" >> $GITHUB_ENV
7379
- name: Install Triton
7480
env:
7581
CUDA_HOME: "/usr/local/cuda"
7682
run: |
77-
if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
78-
source /venv/bin/activate
79-
fi
8083
nproc
8184
nvidia-smi
8285
echo "PATH is '$PATH'"
@@ -87,20 +90,14 @@ jobs:
8790
- name: Run lit tests
8891
run: make test-lit
8992
- name: Run python tests on CUDA
90-
run: |
91-
if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
92-
source /venv/bin/activate
93-
fi
94-
make NUM_PROCS=24 test-unit
93+
run: make NUM_PROCS=24 test-unit
94+
- name: Run distributed tests
95+
run: make test-distributed
9596
- name: Run interpreter tests
9697
if: ${{ matrix.runner[0] == 'nvidia-h100' }}
9798
run: make test-interpret
9899
- name: Run regression tests
99-
run: |
100-
if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
101-
source /venv/bin/activate
102-
fi
103-
make test-regression
100+
run: make test-regression
104101
- name: Run C++ unittests
105102
run: make test-cpp
106103
- name: Run Proton tests

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ test-unit: all
4444
$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
4545
$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon
4646

47+
.PHONY: test-distributed
48+
test-distributed: all
49+
$(PYTHON) -m pip install --upgrade pip
50+
$(PYTHON) -m pip install python/triton_kernels -v
51+
$(PYTEST) -s python/triton_kernels/bench/distributed.py
52+
4753
.PHONY: test-gluon
4854
test-gluon: all
4955
$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon

lib/Tools/GenericSwizzling.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#include "triton/Tools/GenericSwizzling.h"
2-
#include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"
32

43
#include "third_party/f2reduce/f2reduce.h"
54
#include "triton/Tools/LayoutUtils.h"

python/test/conftest.py

Lines changed: 2 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import sys
33
import pathlib
44
import pytest
5-
from typing import Optional, Set
65
import contextlib
76

87

@@ -108,49 +107,9 @@ def fresh_cache():
108107
yield fresh_cache
109108

110109

111-
def _fresh_knobs_impl(monkeypatch, skipped_attr: Optional[Set[str]] = None):
112-
from triton import knobs
113-
114-
if skipped_attr is None:
115-
skipped_attr = set()
116-
117-
knobs_map = {
118-
name: knobset
119-
for name, knobset in knobs.__dict__.items()
120-
if isinstance(knobset, knobs.base_knobs) and knobset != knobs.base_knobs and name not in skipped_attr
121-
}
122-
123-
# We store which variables we need to unset below in finally because
124-
# monkeypatch doesn't appear to reset variables that were never set
125-
# before the monkeypatch.delenv call below.
126-
env_to_unset = []
127-
prev_propagate_env = knobs.propagate_env
128-
129-
def fresh_function():
130-
nonlocal env_to_unset
131-
for name, knobset in knobs_map.items():
132-
setattr(knobs, name, knobset.copy().reset())
133-
for knob in knobset.knob_descriptors.values():
134-
if knob.key in os.environ:
135-
monkeypatch.delenv(knob.key, raising=False)
136-
else:
137-
env_to_unset.append(knob.key)
138-
knobs.propagate_env = True
139-
return knobs
140-
141-
def reset_function():
142-
for name, knobset in knobs_map.items():
143-
setattr(knobs, name, knobset)
144-
for k in env_to_unset:
145-
if k in os.environ:
146-
del os.environ[k]
147-
knobs.propagate_env = prev_propagate_env
148-
149-
return fresh_function, reset_function
150-
151-
152110
@pytest.fixture
153111
def fresh_knobs(monkeypatch):
112+
from triton._internal_testing import _fresh_knobs_impl
154113
fresh_function, reset_function = _fresh_knobs_impl(monkeypatch)
155114
try:
156115
yield fresh_function()
@@ -165,6 +124,7 @@ def fresh_knobs_except_libraries(monkeypatch):
165124
information from the environment as these may be
166125
needed to successfully compile kernels.
167126
"""
127+
from triton._internal_testing import _fresh_knobs_impl
168128
fresh_function, reset_function = _fresh_knobs_impl(monkeypatch, skipped_attr={"build", "nvidia", "amd"})
169129
try:
170130
yield fresh_function()

python/triton/_internal_testing.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
import triton
66
import triton.language as tl
77
from triton import knobs
8+
from typing import Optional, Set, Union
89
import pytest
910

1011
from numpy.random import RandomState
11-
from typing import Optional, Union
1212
from triton.runtime.jit import TensorWrapper, reinterpret, type_canonicalisation_dict
1313

1414
int_dtypes = ['int8', 'int16', 'int32', 'int64']
@@ -202,3 +202,44 @@ def unwrap_tensor(t: Union[torch.Tensor, triton.runtime.jit.TensorWrapper]) -> t
202202
if isinstance(t, triton.runtime.jit.TensorWrapper):
203203
return t.base
204204
return t
205+
206+
207+
def _fresh_knobs_impl(monkeypatch, skipped_attr: Optional[Set[str]] = None):
208+
from triton import knobs
209+
210+
if skipped_attr is None:
211+
skipped_attr = set()
212+
213+
knobs_map = {
214+
name: knobset
215+
for name, knobset in knobs.__dict__.items()
216+
if isinstance(knobset, knobs.base_knobs) and knobset != knobs.base_knobs and name not in skipped_attr
217+
}
218+
219+
# We store which variables we need to unset below in finally because
220+
# monkeypatch doesn't appear to reset variables that were never set
221+
# before the monkeypatch.delenv call below.
222+
env_to_unset = []
223+
prev_propagate_env = knobs.propagate_env
224+
225+
def fresh_function():
226+
nonlocal env_to_unset
227+
for name, knobset in knobs_map.items():
228+
setattr(knobs, name, knobset.copy().reset())
229+
for knob in knobset.knob_descriptors.values():
230+
if knob.key in os.environ:
231+
monkeypatch.delenv(knob.key, raising=False)
232+
else:
233+
env_to_unset.append(knob.key)
234+
knobs.propagate_env = True
235+
return knobs
236+
237+
def reset_function():
238+
for name, knobset in knobs_map.items():
239+
setattr(knobs, name, knobset)
240+
for k in env_to_unset:
241+
if k in os.environ:
242+
del os.environ[k]
243+
knobs.propagate_env = prev_propagate_env
244+
245+
return fresh_function, reset_function

python/triton/knobs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ def scope(self) -> Generator[None, None, None]:
361361
class BuildImpl(Protocol):
362362

363363
def __call__(self, name: str, src: str, srcdir: str, library_dirs: list[str], include_dirs: list[str],
364-
libraries: list[str], extra_compile_args: list[str], /) -> str:
364+
libraries: list[str], ccflags: list[str], /) -> str:
365365
...
366366

367367

python/triton/language/standard.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,9 +317,9 @@ def _or_combine(x, y):
317317

318318
@core._tensor_member_fn
319319
@jit
320-
@core._add_reduction_docstr("reduce_of")
320+
@core._add_reduction_docstr("reduce_or")
321321
def reduce_or(input, axis, keep_dims=False):
322-
core.static_assert(input.type.scalar.is_int(), "reduce_of only supported for integers")
322+
core.static_assert(input.type.scalar.is_int(), "reduce_or only supported for integers")
323323
return core.reduce(input, axis, _or_combine, keep_dims=keep_dims)
324324

325325

python/triton/runtime/_allocation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,4 @@ def set_allocator(allocator: Allocator):
2929
The allocator function is called during kernel launch for kernels that
3030
require additional global memory workspace.
3131
"""
32-
global _allocator
3332
_allocator.set(allocator)

python/triton/runtime/_async_compile.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from __future__ import annotations
22
from typing import Callable, Optional
33
from concurrent.futures import Executor, as_completed, Future
4+
from contextvars import ContextVar
45

5-
active_mode: Optional[AsyncCompileMode] = None
6+
active_mode: ContextVar[Optional[AsyncCompileMode]] = ContextVar("async_compile_active_mode", default=None)
67

78

89
class FutureKernel:
@@ -42,14 +43,13 @@ def submit(self, key, compile_fn, finalize_fn):
4243
return future_kernel
4344

4445
def __enter__(self):
45-
global active_mode
46-
if active_mode is not None:
46+
if active_mode.get() is not None:
4747
raise RuntimeError("Another AsyncCompileMode is already active")
48-
active_mode = self
48+
active_mode.set(self)
49+
return self
4950

5051
def __exit__(self, exc_type, exc_value, traceback):
51-
global active_mode
5252
# Finalize any outstanding compiles
5353
for future in as_completed(self.raw_futures):
5454
self.future_kernels[future._key].result()
55-
active_mode = None
55+
active_mode.set(None)

0 commit comments

Comments (0)