Commit 70da69c

Merge commit 'd6739d3c33dee481f2d4dee4f6ecd4123f671597'
2 parents: ecc9bd4 + d6739d3


47 files changed: 4935 additions & 776 deletions

CMakeLists.txt

Lines changed: 18 additions & 1 deletion
```diff
@@ -12,7 +12,7 @@ set(CMAKE_CXX_STANDARD 17)
 
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
-project(triton)
+project(triton CXX)
 include(CTest)
 
 if(NOT WIN32)
@@ -26,8 +26,25 @@ option(TRITON_BUILD_TUTORIALS "Build C++ Triton tutorials" ON)
 option(TRITON_BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
 option(TRITON_BUILD_PROTON "Build the Triton Proton profiler" ON)
 option(TRITON_BUILD_UT "Build C++ Triton Unit Tests" ON)
+option(TRITON_BUILD_WITH_CCACHE "Build with ccache (if available)" ON)
 set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")
 
+if(TRITON_BUILD_WITH_CCACHE)
+  find_program(CCACHE_PROGRAM ccache)
+  if(CCACHE_PROGRAM)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}"
+        CACHE STRING "C compiler launcher")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}"
+        CACHE STRING "CXX compiler launcher")
+  else()
+    message(
+      STATUS
+      "Could not find ccache. Consider installing ccache to speed up compilation."
+    )
+  endif()
+endif()
+
+
 # Ensure Python3 vars are set correctly
 # used conditionally in this file and by lit tests
 
```
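
In plain terms, the new `TRITON_BUILD_WITH_CCACHE` block probes the `PATH` for a `ccache` binary and, when found, registers it as the launcher for both the C and C++ compilers; otherwise it prints a status hint. A minimal Python sketch of that decision, using `shutil.which` as a stand-in for CMake's `find_program` (illustrative only, not part of the build):

```python
import shutil

# Stand-in for find_program(CCACHE_PROGRAM ccache): search PATH for ccache.
ccache = shutil.which("ccache")
if ccache:
    # Mirrors the two set(... CACHE STRING ...) calls above.
    cmake_cache = {
        "CMAKE_C_COMPILER_LAUNCHER": ccache,
        "CMAKE_CXX_COMPILER_LAUNCHER": ccache,
    }
else:
    print("Could not find ccache. Consider installing ccache to speed up compilation.")
```

Because the option defaults to `ON`, ccache is picked up automatically when present; passing `-DTRITON_BUILD_WITH_CCACHE=OFF` at configure time disables the probe.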

README.md

Lines changed: 60 additions & 0 deletions
````diff
@@ -6,6 +6,7 @@
 
 This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton/) backend for Intel GPUs. Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) used to provide best-in-class performance and productivity on any Intel GPUs for [PyTorch](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) and standalone usage.
 
+<<<<<<< HEAD
 # Compatibility
 
 * Operating systems:
@@ -21,11 +22,25 @@ This is the development repository of Intel® XPU Backend for Triton\*, a new [T
 * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
 
 Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*.
+=======
+| **`Documentation`** | **`Nightly Wheels`** |
+|-------------------- | -------------------- |
+| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) |
+
+# Triton
+
+This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs.
+
+The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton!
+
+The [official documentation](https://triton-lang.org) contains installation instructions and tutorials. See also these third-party [Triton puzzles](https://github.com/srush/Triton-Puzzles), which can all be run using the Triton interpreter -- no GPU required.
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 # Quick Installation
 
 ## Prerequisites
 
+<<<<<<< HEAD
 1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
 2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
 3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
@@ -40,18 +55,35 @@ Extract the archive and in the extracted directory execute:
 ```shell
 pip install torch-*.whl triton-*.whl
 ```
+=======
+```shell
+pip install triton
+```
+
+Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9.
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain.
 The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user).
 
 ```shell
+<<<<<<< HEAD
 # replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs
 source /opt/intel/oneapi/setvars.sh
+=======
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 ```
 
 # Install from source
 
+<<<<<<< HEAD
 ## Prerequisites
+=======
+```shell
+git clone https://github.com/triton-lang/triton.git;
+cd triton;
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
 2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
@@ -72,9 +104,14 @@ source /opt/intel/oneapi/setvars.sh
 Clone this repository:
 
 ```shell
+<<<<<<< HEAD
 git clone https://github.com/intel/intel-xpu-backend-for-triton.git
 cd intel-xpu-backend-for-triton
 ```
+=======
+git clone https://github.com/triton-lang/triton.git;
+cd triton;
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 To avoid potential conflicts with installed packages it is recommended to create and activate a new Python virtual environment:
 
@@ -205,6 +242,7 @@ For detailed instructions on how to debug Triton's frontend, please refer to thi
 
 # Usage Guide
 
+<<<<<<< HEAD
 ## Code Modifications
 Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels.
 
@@ -308,6 +346,14 @@ Note that the user needs to explicitly set `TRITON_XPU_PROFILE=1` when the user
 ```Bash
 export TRITON_XPU_PROFILE=1
 ```
+=======
+Version 2.0 is out! New features include:
+
+- Many, many bug fixes
+- Performance improvements
+- Backend rewritten to use MLIR
+- Support for kernels that contain back-to-back matmuls (e.g., flash attention)
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 # Contributing
 
@@ -317,10 +363,24 @@ Community contributions are more than welcome, whether it be to fix bugs or to a
 
 _MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file.
 
+<<<<<<< HEAD
 
 ## Security
 
 See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html)
 for information on how to report a potential security issue or vulnerability.
 
 See also: [Security Policy](security.md)
+=======
+# Compatibility
+
+Supported Platforms:
+
+- Linux
+
+Supported Hardware:
+
+- NVIDIA GPUs (Compute Capability 8.0+)
+- AMD GPUs (ROCm 5.2+)
+- Under development: CPUs
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
````
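
As the `+` lines above show, these hunks commit literal Git conflict markers (`<<<<<<< HEAD`, `=======`, `>>>>>>> d6739d3c…`) into README.md — the state a file is left in when a conflicted merge is committed without resolving it. A quick Python sketch for locating any such leftovers (illustrative only; not part of the commit):

```python
import pathlib
import re

# Match the three standard conflict-marker forms at the start of a line.
marker = re.compile(r'^(<{7} |={7}$|>{7} )')
for n, line in enumerate(pathlib.Path("README.md").read_text().splitlines(), 1):
    if marker.match(line):
        print(f"README.md:{n}: {line}")
```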

docs/conf.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -145,7 +145,7 @@ def documenter(app, obj, parent):
 autosummary_generate = True
 
 # versioning config
-smv_tag_whitelist = r'^(v3.1.0)$'
+smv_tag_whitelist = r'^(v3.2.0)$'
 smv_branch_whitelist = r'^main$'
 smv_remote_whitelist = None
 smv_released_pattern = r'^tags/.*$'
```
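
`smv_tag_whitelist` drives sphinx-multiversion: only tags matching the regex get documentation builds, so this bump switches the published docs from the v3.1.0 tag to v3.2.0. A quick sketch of the anchored pattern's behaviour (note the dots are unescaped, so they match any character — harmless for real tag names):

```python
import re

# After the change, only the v3.2.0 tag matches the whitelist.
pattern = re.compile(r'^(v3.2.0)$')
print(bool(pattern.match('v3.2.0')))      # True
print(bool(pattern.match('v3.1.0')))      # False
print(bool(pattern.match('v3.2.0-rc1')))  # False: the $ anchor rejects suffixes
print(bool(pattern.match('v3x2y0')))      # True: the unescaped dots match any char
```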

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 0 additions & 7 deletions
```diff
@@ -162,13 +162,6 @@ struct ReduceOpConversion
 
     auto mod = op->getParentOfType<ModuleOp>();
     unsigned iWarpSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod);
-    if (iWarpSize > numLaneToReduce) {
-      Value threadId = getThreadId(rewriter, loc);
-      Value warpSize = i32_val(iWarpSize);
-      Value laneId = urem(threadId, warpSize);
-      Value lanePred = icmp_slt(laneId, i32_val(numLaneToReduce));
-      pred = pred ? and_(pred, lanePred) : lanePred;
-    }
 
     for (unsigned N = numLaneToReduce / 2; N > 0; N >>= 1) {
       SmallVector<Value> shfl(acc.size());
```
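
The deleted lines computed a per-lane predicate (`laneId < numLaneToReduce`) and folded it into `pred` when the warp was wider than the reduction. What remains is the log2-step shuffle loop, which halves the stride each iteration. A Python sketch of that halving pattern (illustrative only — the real code emits shuffle IR, and `combine` stands in for the reduce region):

```python
def warp_reduce(lane_values, combine):
    # Log2-step tree reduction: at each step, lane i also reads the value
    # held n lanes away, mirroring `for (N = numLaneToReduce / 2; N > 0; N >>= 1)`.
    n = len(lane_values) // 2
    while n > 0:
        lane_values = [
            combine(lane_values[i], lane_values[i + n]) if i + n < len(lane_values) else lane_values[i]
            for i in range(len(lane_values))
        ]
        n >>= 1
    return lane_values[0]  # lane 0 holds the full result

print(warp_reduce(list(range(8)), lambda a, b: a + b))  # 28
```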

lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -187,7 +187,7 @@ static void AddPartialReduce(SmallVector<SmallVector<Value>> &srcValues,
     }
     Value mask = icmp_sge(warpId, i32_val(i + 1));
     accumulator.acc =
-        accumulate(helper, rewriter, accumulator.acc, partialReduce, mask);
+        accumulate(helper, rewriter, accumulator.acc, partialReduce);
     for (unsigned j = 0; j < helper.getNumOperands(); ++j) {
       accumulator.maskedAcc[j] =
           select(mask, accumulator.acc[j], accumulator.maskedAcc[j]);
```
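
After this change, `accumulate` no longer receives the warp-comparison `mask`: the accumulator is updated unconditionally, and the mask matters only in the `select` that decides what lands in `maskedAcc`. A sketch of the post-change data flow in Python (all names illustrative; `combine` stands in for the scan's combine region):

```python
def add_partial(acc, masked_acc, partial, mask, combine):
    # The combine now runs unconditionally...
    acc = combine(acc, partial)
    # ...and the mask only gates the masked copy, mirroring
    # select(mask, accumulator.acc[j], accumulator.maskedAcc[j]).
    masked_acc = acc if mask else masked_acc
    return acc, masked_acc
```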

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -235,6 +235,12 @@ class BlockedToMMA : public mlir::OpRewritePattern<DotOp> {
                   mlir::PatternRewriter &rewriter) const override {
     if (computeCapability < 70)
       return failure();
+    if (computeCapability < 80) {
+      dotOp.emitRemark()
+          << "Dot op using MMA for compute capability " << computeCapability
+          << " has been deprecated. It falls back to the FMA path.";
+      return failure();
+    }
     // TODO: Check data-types and SM compatibility
     RankedTensorType oldRetType = dotOp.getType();
     if (!oldRetType.getEncoding() ||
```
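
The effect of the new guard: MMA lowering for `tt.dot` now requires compute capability 80 or higher; sm_70–sm_75 devices get a remark and fall through to the FMA path, just like pre-Volta targets. A sketch of the gating in plain Python (illustrative; returning `False` plays the role of `return failure()`, i.e. the pattern does not fire):

```python
def mma_pattern_applies(compute_capability):
    if compute_capability < 70:
        return False
    if compute_capability < 80:
        print(f"remark: Dot op using MMA for compute capability {compute_capability} "
              "has been deprecated. It falls back to the FMA path.")
        return False
    return True

print(mma_pattern_applies(75))  # emits the remark, then False
print(mma_pattern_applies(90))  # True
```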

python/setup.py

Lines changed: 8 additions & 7 deletions
```diff
@@ -460,15 +460,16 @@ def build_extension(self, ext):
                 "-DCMAKE_CXX_FLAGS=-fsanitize=address",
             ]
 
-        if check_env_flag("TRITON_BUILD_WITH_CCACHE"):
-            cmake_args += [
-                "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
-            ]
+        # environment variables we will pass through to cmake
+        passthrough_args = [
+            "TRITON_BUILD_PROTON",
+            "TRITON_BUILD_TUTORIALS",
+            "TRITON_BUILD_WITH_CCACHE",
+        ]
+        cmake_args += [f"-D{option}={os.getenv(option)}" for option in passthrough_args if option in os.environ]
 
         if check_env_flag("TRITON_BUILD_PROTON", "ON"):  # Default ON
             cmake_args += self.get_proton_cmake_args()
-        else:
-            cmake_args += ["-DTRITON_BUILD_PROTON=OFF"]
 
         if is_offline_build():
             # unit test builds fetch googletests from GitHub
@@ -701,7 +702,7 @@ def get_install_requires():
 
 setup(
     name=os.environ.get("TRITON_WHEEL_NAME", "triton"),
-    version="3.1.0" + get_git_commit_hash() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
+    version="3.2.0" + get_git_commit_hash() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
     author="Philippe Tillet",
     author_email="[email protected]",
     description="A language and compiler for custom Deep Learning operations",
```

python/test/unit/language/test_compile_errors.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -353,7 +353,7 @@ def test_fp8_support(dtype):
         supported_dtypes.append(tl.float8e4nv)
     elif is_hip():
         if is_hip_mi300():
-            supported_dtypes += [tl.float8e4b8, tl.float8e5b16]
+            supported_dtypes += [tl.float8e4nv, tl.float8e4b8, tl.float8e5b16]
     elif is_xpu():
         supported_dtypes += [tl.float8e4b15, tl.float8e4nv]
     elif is_interpreter():
```

python/test/unit/language/test_conversions.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -302,8 +302,13 @@ def upcast_test(src_dtype, dst_dtype, exponent_bits, mantissa_bits, exponent_bia
     ('float8e5b16', 'float16'),
 ])
 def test_typeconvert_upcast(src_dtype, dst_dtype, device):
+
+    # On HIP, fp8e4nv upcasting is only supported to bf16, and it's only supported on MI300.
+    if src_dtype == 'float8e4nv' and is_hip() and (dst_dtype != 'bfloat16' or not is_hip_mi300()):
+        pytest.skip(f"upcasting {src_dtype} to {dst_dtype} not supported in this architecture")
+
     if ((src_dtype == 'float8e4nv' and is_cuda() and torch.cuda.get_device_capability(0) < (8, 9))
-            or (src_dtype in ('float8e4nv', 'float8e4b15') and is_hip())
+            or (src_dtype in ('float8e4b15') and is_hip())
             or (src_dtype in ('float8e4b8', 'float8e5b16') and (is_cuda() or not is_hip_mi300()))):
         # If the dtype should error out in the given device, we assert that and return
         with pytest.raises(triton.CompilationError, match="not supported in this architecture"):
@@ -358,6 +363,9 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
     if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne' and (is_cuda() or not is_hip_mi300()):
         pytest.xfail(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU MI300")
 
+    if dst_dtype == 'float8e4nv' and is_hip():
+        pytest.skip(f"{dst_dtype} downcast not supported in HIP")
+
     # dtype : (exponent_bits, mantissa_bits, exponent_bias)
     stuff = {
         'float16': (5, 10, 15),
```
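
One detail worth noting in the updated upcast condition: `('float8e4b15')` has no trailing comma, so it is a plain string, and `in` performs a substring test rather than tuple membership. For the dtype names used in this parametrization the result happens to be the same, but the difference is easy to demonstrate:

```python
# ('float8e4b15') is a string; ('float8e4b15',) is a one-element tuple.
print('float8e4b15' in ('float8e4b15'))   # True  — exact substring match
print('e4b15' in ('float8e4b15'))         # True  — substring match, likely surprising
print('e4b15' in ('float8e4b15',))        # False — tuple membership
```

A one-element tuple `('float8e4b15',)` would state the intent unambiguously.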
