diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml
new file mode 100644
index 00000000..f9b8f3ac
--- /dev/null
+++ b/.github/workflows/iris-tests-apptainer.yml
@@ -0,0 +1,91 @@
+# CI workflow: builds (or reuses) the Iris Apptainer image on a self-hosted
+# runner, then runs the example and unit tests under MPI for each rank count.
+name: Iris Tests with Apptainer
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+
+jobs:
+  # Installs Apptainer and builds the container image if it is not cached yet.
+  build-apptainer-image:
+    runs-on: [self-hosted, mi3008x]
+    timeout-minutes: 90
+    # One build per workflow + branch/PR; a newer run cancels the older one.
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-build
+      cancel-in-progress: true
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Apptainer
+        run: |
+          apt-get update && apt-get install -y software-properties-common
+          add-apt-repository -y ppa:apptainer/ppa
+          apt-get update && apt-get install -y apptainer
+
+      - name: Build Iris Apptainer container
+        run: |
+          # Create persistent Apptainer directory
+          mkdir -p ~/apptainer
+
+          # Build Apptainer image from definition file (only if it doesn't exist).
+          # NOTE(review): an existing image is reused even if apptainer/iris.def
+          # has changed; delete ~/apptainer/iris-dev.sif on the runner to rebuild.
+          if [ ! -f ~/apptainer/iris-dev.sif ]; then
+            echo "Building new Apptainer image..."
+            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
+          else
+            echo "Using existing Apptainer image"
+          fi
+
+  # Runs the test suites inside the image, once per MPI rank count.
+  run-tests:
+    name: ${{ matrix.ranks }}-rank Iris Test
+    needs: build-apptainer-image
+    runs-on: [self-hosted, mi3008x]
+    timeout-minutes: 20
+    strategy:
+      matrix:
+        ranks: [1, 2, 4, 8]
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-${{ matrix.ranks }}
+      cancel-in-progress: true
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Run Iris Tests with ${{ matrix.ranks }} MPI ranks
+        run: |
+          apptainer exec ~/apptainer/iris-dev.sif bash -c "
+            # Install iris first; abort immediately if installation fails
+            pip install -e . || exit 1
+
+            # Create function for mpirun with root permissions
+            mpirun-root() { mpirun --allow-run-as-root \"\$@\"; }
+
+            # Track failures: this inner shell does not inherit errexit from the
+            # runner shell, so otherwise only the last command status would count.
+            failed=0
+
+            # Run examples tests one at a time
+            echo 'Running examples tests one at a time...'
+            for test_file in tests/examples/test_*.py; do
+              echo \"Testing: \$test_file with ${{ matrix.ranks }} MPI ranks\"
+              mpirun-root -np ${{ matrix.ranks }} python -m pytest \"\$test_file\" -v --tb=short || failed=1
+            done
+
+            # Run unit tests one at a time
+            echo 'Running unit tests one at a time...'
+            for test_file in tests/unittests/test_*.py; do
+              echo \"Testing: \$test_file with ${{ matrix.ranks }} MPI ranks\"
+              mpirun-root -np ${{ matrix.ranks }} python -m pytest \"\$test_file\" -v --tb=short || failed=1
+            done
+
+            # Fail the step if any test file failed
+            exit \$failed
+          "
diff --git a/docker/build.sh b/docker/build.sh
index 050a1998..973c9366 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -10,4 +10,4 @@ pushd "$SCRIPT_DIR" > /dev/null
 
 docker build -t $IMAGE_NAME .
 
-popd > /dev/null
\ No newline at end of file
+popd > /dev/null
diff --git a/pyproject.toml b/pyproject.toml
index 82a5c0dc..b700c83d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ dependencies = [
     "requests",
     "mpi4py",
     "ruff",
-    "triton"
+    "triton @ git+https://github.com/triton-lang/triton.git@dd5823453bcc7973eabadb65f9d827c43281c434"
 ]
 
 [project.optional-dependencies]
@@ -54,4 +54,4 @@ select = ["E", "F", "W"]
 ignore = ["E501", "E701", "E731", "E741", "F841", "F401"]
 
 [tool.ruff.format]
-quote-style = "double"
\ No newline at end of file
+quote-style = "double"
diff --git a/tests/examples/test_load_bench.py b/tests/examples/test_load_bench.py
index ec2bb472..16d6c403 100644
--- a/tests/examples/test_load_bench.py
+++ b/tests/examples/test_load_bench.py
@@ -51,6 +51,8 @@ def test_load_bench(dtype, buffer_size, heap_size, block_size):
     source_buffer = shmem.ones(buffer_size // element_size_bytes, dtype=dtype)
     result_buffer = shmem.zeros_like(source_buffer)
 
+    shmem.barrier()
+
     for source_rank in range(num_ranks):
         for destination_rank in range(num_ranks):
             bandwidth_gbps = module.bench_load(
diff --git a/tests/unittests/test_atomic_add.py b/tests/unittests/test_atomic_add.py
index e09d078e..2bcbea02 100644
--- a/tests/unittests/test_atomic_add.py
+++ b/tests/unittests/test_atomic_add.py
@@ -75,6 +75,8 @@ def test_atomic_add_api(dtype, sem, scope, BLOCK_SIZE):
 
     results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_add_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_atomic_and.py b/tests/unittests/test_atomic_and.py
index 19c44ce0..29b2f02b 100644
--- a/tests/unittests/test_atomic_and.py
+++ b/tests/unittests/test_atomic_and.py
@@ -76,6 +76,8 @@ def test_atomic_and_api(dtype, sem, scope, BLOCK_SIZE):
 
     results = shmem.full((BLOCK_SIZE,), initial_mask, dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_and_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_atomic_cas.py b/tests/unittests/test_atomic_cas.py
index 52db3dd0..38eb5aec 100644
--- a/tests/unittests/test_atomic_cas.py
+++ b/tests/unittests/test_atomic_cas.py
@@ -59,6 +59,8 @@ def test_atomic_cas_api(dtype, sem, scope):
 
     results = shmem.zeros((1,), dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_cas_kernel[grid](results, sem, scope, cur_rank, num_ranks, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_atomic_max.py b/tests/unittests/test_atomic_max.py
index 61359afd..32f381e5 100644
--- a/tests/unittests/test_atomic_max.py
+++ b/tests/unittests/test_atomic_max.py
@@ -71,6 +71,8 @@ def test_atomic_max_api(dtype, sem, scope, BLOCK_SIZE):
     min_val = torch.iinfo(dtype).min
     results = shmem.full((BLOCK_SIZE,), min_val, dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_max_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_atomic_min.py b/tests/unittests/test_atomic_min.py
index da7ef176..15d862d9 100644
--- a/tests/unittests/test_atomic_min.py
+++ b/tests/unittests/test_atomic_min.py
@@ -71,6 +71,8 @@ def test_atomic_min_api(dtype, sem, scope, BLOCK_SIZE):
     max_val = torch.iinfo(dtype).max
     results = shmem.full((BLOCK_SIZE,), max_val, dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_min_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_atomic_or.py b/tests/unittests/test_atomic_or.py
index 8d17be65..18705763 100644
--- a/tests/unittests/test_atomic_or.py
+++ b/tests/unittests/test_atomic_or.py
@@ -71,6 +71,8 @@ def test_atomic_or_api(dtype, sem, scope, BLOCK_SIZE):
 
     results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_or_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_atomic_xchg.py b/tests/unittests/test_atomic_xchg.py
index f11ad798..f47a4dc2 100644
--- a/tests/unittests/test_atomic_xchg.py
+++ b/tests/unittests/test_atomic_xchg.py
@@ -58,6 +58,8 @@ def test_atomic_xchg_api(dtype, sem, scope):
 
     results = shmem.zeros((1,), dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_xchg_kernel[grid](results, sem, scope, cur_rank, num_ranks, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_atomic_xor.py b/tests/unittests/test_atomic_xor.py
index 702002b9..774df798 100644
--- a/tests/unittests/test_atomic_xor.py
+++ b/tests/unittests/test_atomic_xor.py
@@ -72,6 +72,8 @@ def test_atomic_xor_api(dtype, sem, scope, BLOCK_SIZE):
 
     results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     atomic_xor_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_get.py b/tests/unittests/test_get.py
index f7df2340..75cf1e26 100644
--- a/tests/unittests/test_get.py
+++ b/tests/unittests/test_get.py
@@ -66,6 +66,8 @@ def test_get_api(dtype, BLOCK_SIZE):
     data = shmem.ones(BLOCK_SIZE, dtype=dtype)
     results = shmem.zeros_like(data)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     get_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_load.py b/tests/unittests/test_load.py
index 4b0f0b36..a379a9e3 100644
--- a/tests/unittests/test_load.py
+++ b/tests/unittests/test_load.py
@@ -59,6 +59,8 @@ def test_load_api(dtype, BLOCK_SIZE):
     data = shmem.full((BLOCK_SIZE,), source_rank, dtype=dtype)
     results = shmem.zeros_like(data)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     load_kernel[grid](data, results, source_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_put.py b/tests/unittests/test_put.py
index c41f2f7e..a305af4a 100644
--- a/tests/unittests/test_put.py
+++ b/tests/unittests/test_put.py
@@ -60,6 +60,8 @@ def test_put_api(dtype, BLOCK_SIZE):
     data = shmem.ones(BLOCK_SIZE, dtype=dtype)
     results = shmem.zeros_like(data)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     put_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()
diff --git a/tests/unittests/test_store.py b/tests/unittests/test_store.py
index 361500a7..fd9762a5 100644
--- a/tests/unittests/test_store.py
+++ b/tests/unittests/test_store.py
@@ -61,6 +61,8 @@ def test_store_api(dtype, BLOCK_SIZE):
     src = shmem.ones(BLOCK_SIZE, dtype=dtype)
     results = shmem.zeros_like(src)
 
+    shmem.barrier()
+
     grid = lambda meta: (1,)
     store_kernel[grid](src, results, destination_rank, num_ranks, BLOCK_SIZE, heap_bases)
     shmem.barrier()