
Commit 3efe0b8

Merge branch 'main' into etiotto.add_is_xpu_xxx
2 parents 774810a + 8bbfd21

File tree: 34 files changed (+138, -160 lines)


.github/WINDOWS.md

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ Install Microsoft Visual Studio 2022 and make sure the following [components](ht
 
 ### Intel® Deep Learning Essentials
 
-Install [Intel® Deep Learning Essentials 2025.2.1](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?packages=dl-essentials&dl-essentials-os=windows&dl-win=offline).
+Install [Intel® Deep Learning Essentials 2025.3.1](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?packages=dl-essentials&dl-essentials-os=windows&dl-win=offline).
 By default, it is installed to `C:\Program Files (x86)\Intel\oneAPI`.
 
 ### Chocolatey

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-1e69f40b3c03492eb3dd7e03462a5566f29674d3
+549347d24e9b509b653a350053d56992fc8436ad

.github/pins/pytorch.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-01f94d4096060597e2815efe385255ac19c9c787
+365a6c84db516f244b7234b7aa3c8843af52936b

.github/workflows/build-test-reusable.yml

Lines changed: 8 additions & 9 deletions

@@ -194,6 +194,7 @@ jobs:
     matrix:
       suite:
         - minicore
+        - language
         - scaled_dot
         - gluon
         - triton-kernels
@@ -303,6 +304,11 @@ jobs:
         run: |
           ${{ env.TRITON_TEST_CMD }} --minicore
 
+      - name: Run language tests
+        if: matrix.suite == 'language'
+        run: |
+          ${{ env.TRITON_TEST_CMD }} --language
+
       - name: Run mxfp tests
         if: matrix.suite == 'rest'
         run: |
@@ -329,14 +335,6 @@
           export PYTEST_MAX_PROCESSES=4
           ${{ env.TRITON_TEST_CMD }} --triton-kernels
 
-      # FIXME: make sure new tutorials are added to one of the groups (scaled_dot, rest, tutorial-faX)
-      - name: Select tutorials to run (scaled_dot)
-        if: matrix.suite == 'scaled_dot'
-        run: |
-          cat <<EOF | tee tutorials.txt
-          09-persistent-matmul
-          EOF
-
       - name: Select tutorials to run (rest)
         if: matrix.suite == 'rest'
         run: |
@@ -348,11 +346,12 @@
           05-layer-norm
           07-extern-functions
           08-grouped-gemm
+          09-persistent-matmul
           10-experimental-block-pointer
           EOF
 
       - name: Run Tutorials
-        if: matrix.suite == 'scaled_dot' || matrix.suite == 'rest'
+        if: matrix.suite == 'rest'
         run: |
           ${{ env.TRITON_TEST_CMD }} --select-from-file tutorials.txt --tutorial

.github/workflows/pip-test-windows.yml

Lines changed: 1 addition & 1 deletion

@@ -90,7 +90,7 @@ jobs:
           .venv\Scripts\activate.ps1
           Invoke-BatchFile "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
           cd ${{ env.NEW_WORKSPACE }}
-          pip install -U pybind11 'cmake>=3.20,<4.0' intel-sycl-rt==2025.2.1 build
+          pip install -U pybind11 'cmake>=3.20,<4.0' intel-sycl-rt==2025.3.1 build
           # `build` can't determine that Ninja is already installed.
           # similar issue: https://github.com/pypa/build/issues/506
           python -m build --wheel --no-isolation --skip-dependency-check

.github/workflows/pip-test.yml

Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ jobs:
       - name: Install runtime dependencies
         run: |
           curl -sSLO --retry 10 https://raw.githubusercontent.com/pytorch/pytorch/$(<.github/pins/pytorch.txt)/.github/scripts/generate_binary_build_matrix.py
-          sed -i '/^validate_nccl_dep_consistency.*/d' generate_binary_build_matrix.py
+          sed -i 's/^\(\s*\)validate_nccl_dep_consistency(arch_version)/\1pass/' generate_binary_build_matrix.py
           python -c "from generate_binary_build_matrix import PYTORCH_EXTRA_INSTALL_REQUIREMENTS; print('\n'.join(PYTORCH_EXTRA_INSTALL_REQUIREMENTS['xpu'].split(' | ')))" | tee /tmp/requirements.txt
           pip install -r /tmp/requirements.txt
           pip install transformers==4.54.0
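
The updated `sed` expression captures the call's leading whitespace and replaces `validate_nccl_dep_consistency(arch_version)` with `pass` at the same indentation, instead of deleting any line where the call starts at column 0. A minimal Python sketch of that substitution follows; the sample `script` text is invented for illustration and is not the real `generate_binary_build_matrix.py`, but it shows why swapping in `pass` keeps an indented block syntactically valid where outright deletion could leave an empty block.

```python
import re

# Hypothetical stand-in for generate_binary_build_matrix.py content.
script = """\
def generate(arch_version):
    if arch_version:
        validate_nccl_dep_consistency(arch_version)
    return arch_version
"""

# Python equivalent of:
#   sed -i 's/^\(\s*\)validate_nccl_dep_consistency(arch_version)/\1pass/'
patched = re.sub(
    r"^(\s*)validate_nccl_dep_consistency\(arch_version\)",
    r"\1pass",
    script,
    flags=re.MULTILINE,
)

# Replacing the call with `pass` keeps the module parseable; deleting the
# line would leave the `if` block empty and raise an IndentationError.
compile(patched, "generate_binary_build_matrix.py", "exec")
print(patched)
```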

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 14 additions & 0 deletions

@@ -179,11 +179,25 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
             "ttg.total-num-warps"))
       numWarps = totalNumWarps.getInt();
 
+    int numCTAs = 1;
+    if (auto module = funcOp->getParentOfType<ModuleOp>()) {
+      if (auto moduleAttr =
+              module->getAttrOfType<IntegerAttr>(triton::gpu::AttrNumCTAsName))
+        numCTAs = moduleAttr.getInt();
+    }
+
     // Set `nvvm.maxnreg` if it was specified on the module.
     if (Attribute maxnregAttr =
             funcOp.getParentOp()->getAttr(triton::gpu::AttrMaxRegistersName))
       newFuncOp->setAttr(NVVM::NVVMDialect::getMaxnregAttrName(), maxnregAttr);
 
+    // Do we want to do this for nCTAs == 1 whenever sm >= 90?
+    if (numCTAs > 1) {
+      // Request a specific number of CTAs per cluster in the generated PTX.
+      newFuncOp->setAttr(NVVM::NVVMDialect::getClusterDimAttrName(),
+                         rewriter.getDenseI32ArrayAttr(numCTAs));
+    }
+
     // Set an attribute for reqntidx, it could be used in latter LLVM codegen
     // for `nvvm.annotation` metadata.
     newFuncOp->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),

python/triton/compiler/compiler.py

Lines changed: 0 additions & 13 deletions

@@ -297,18 +297,6 @@ def compile(src, target=None, options=None, _env_vars=None):
 
     metadata["cache_dir"] = fn_cache_manager.cache_dir
     metadata["triton_version"] = __version__
-    cluster_dims = getattr(options, "cluster_dims", None)
-    if cluster_dims is None:
-        num_ctas = getattr(options, "num_ctas", None)
-        if num_ctas is None:
-            num_ctas = 1
-        cluster_dims = (num_ctas, 1, 1)
-    if not isinstance(cluster_dims, (list, tuple)):
-        cluster_dims = (cluster_dims, )
-    cluster_dims = tuple(cluster_dims)
-    if len(cluster_dims) < 3:
-        cluster_dims = cluster_dims + (1, ) * (3 - len(cluster_dims))
-    metadata["cluster_dims"] = cluster_dims
     # run compilation pipeline and populate metadata
     stages = dict()
     backend.add_stages(stages, options, src.language)
@@ -435,7 +423,6 @@ def __init__(self, src, metadata_group, hash):
         from collections import namedtuple
         metadata_path = next((Path(p) for c, p in metadata_group.items() if c.endswith(".json")))
         metadata = json.loads(metadata_path.read_text())
-        metadata['cluster_dims'] = tuple(metadata['cluster_dims'])
         # JSON serialization dumps the target as a dict. Restore it to a GPUTarget.
         target = metadata['target']
         metadata['target'] = GPUTarget(target['backend'], target['arch'], target['warp_size'])
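
The deleted block normalized `options.cluster_dims` / `options.num_ctas` into a three-element tuple before storing it in `metadata["cluster_dims"]`; with this commit, `compile()` no longer records that field, and the cache-load path no longer converts it back to a tuple. For reference, the removed normalization is equivalent to the sketch below; `normalize_cluster_dims` is a hypothetical helper written for illustration, not part of the Triton API.

```python
from typing import Optional, Sequence, Tuple, Union


def normalize_cluster_dims(cluster_dims: Optional[Union[int, Sequence[int]]],
                           num_ctas: Optional[int]) -> Tuple[int, ...]:
    """Sketch of the normalization the removed compile() code performed."""
    if cluster_dims is None:
        # Default to a 1D cluster of `num_ctas` CTAs (1 when unspecified).
        cluster_dims = (num_ctas if num_ctas is not None else 1, 1, 1)
    if not isinstance(cluster_dims, (list, tuple)):
        cluster_dims = (cluster_dims, )
    cluster_dims = tuple(cluster_dims)
    # Pad to three dimensions, e.g. (2,) -> (2, 1, 1).
    return cluster_dims + (1, ) * (3 - len(cluster_dims))


assert normalize_cluster_dims(None, None) == (1, 1, 1)
assert normalize_cluster_dims(None, 2) == (2, 1, 1)
assert normalize_cluster_dims(4, None) == (4, 1, 1)
assert normalize_cluster_dims((2, 2), None) == (2, 2, 1)
```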

scripts/skiplist/a770/interpreter.txt

Whitespace-only changes.

scripts/skiplist/a770/tools.txt

Whitespace-only changes.

0 commit comments
