Commit 70da69c

Merge commit 'd6739d3c33dee481f2d4dee4f6ecd4123f671597'
2 parents: ecc9bd4 + d6739d3


47 files changed: 4935 additions & 776 deletions

CMakeLists.txt

Lines changed: 18 additions & 1 deletion
```diff
@@ -12,7 +12,7 @@ set(CMAKE_CXX_STANDARD 17)
 
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
-project(triton)
+project(triton CXX)
 include(CTest)
 
 if(NOT WIN32)
@@ -26,8 +26,25 @@ option(TRITON_BUILD_TUTORIALS "Build C++ Triton tutorials" ON)
 option(TRITON_BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
 option(TRITON_BUILD_PROTON "Build the Triton Proton profiler" ON)
 option(TRITON_BUILD_UT "Build C++ Triton Unit Tests" ON)
+option(TRITON_BUILD_WITH_CCACHE "Build with ccache (if available)" ON)
 set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")
 
+if(TRITON_BUILD_WITH_CCACHE)
+  find_program(CCACHE_PROGRAM ccache)
+  if(CCACHE_PROGRAM)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}"
+        CACHE STRING "C compiler launcher")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}"
+        CACHE STRING "CXX compiler launcher")
+  else()
+    message(
+      STATUS
+      "Could not find ccache. Consider installing ccache to speed up compilation."
+    )
+  endif()
+endif()
+
+
 # Ensure Python3 vars are set correctly
 # used conditionally in this file and by lit tests
 
```
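
In plain terms, the new `TRITON_BUILD_WITH_CCACHE` block probes the `PATH` for a `ccache` binary and, when found, registers it as the launcher for both the C and C++ compilers; otherwise it prints a status hint. A minimal Python sketch of that decision, using `shutil.which` as a stand-in for CMake's `find_program` (illustrative only, not part of the build):

```python
import shutil

# Stand-in for find_program(CCACHE_PROGRAM ccache): search PATH for ccache.
ccache = shutil.which("ccache")
if ccache:
    # Mirrors the two set(... CACHE STRING ...) calls above.
    cmake_cache = {
        "CMAKE_C_COMPILER_LAUNCHER": ccache,
        "CMAKE_CXX_COMPILER_LAUNCHER": ccache,
    }
else:
    print("Could not find ccache. Consider installing ccache to speed up compilation.")
```

Because the option defaults to `ON`, ccache is picked up automatically when present; passing `-DTRITON_BUILD_WITH_CCACHE=OFF` at configure time disables the probe.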

README.md

Lines changed: 60 additions & 0 deletions
````diff
@@ -6,6 +6,7 @@
 
 This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton/) backend for Intel GPUs. Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) used to provide best-in-class performance and productivity on any Intel GPUs for [PyTorch](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) and standalone usage.
 
+<<<<<<< HEAD
 # Compatibility
 
 * Operating systems:
@@ -21,11 +22,25 @@ This is the development repository of Intel® XPU Backend for Triton\*, a new [T
 * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
 
 Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*.
+=======
+| **`Documentation`** | **`Nightly Wheels`** |
+|-------------------- | -------------------- |
+| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) |
+
+# Triton
+
+This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs.
+
+The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton!
+
+The [official documentation](https://triton-lang.org) contains installation instructions and tutorials. See also these third-party [Triton puzzles](https://github.com/srush/Triton-Puzzles), which can all be run using the Triton interpreter -- no GPU required.
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 # Quick Installation
 
 ## Prerequisites
 
+<<<<<<< HEAD
 1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
 2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
 3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
@@ -40,18 +55,35 @@ Extract the archive and in the extracted directory execute:
 ```shell
 pip install torch-*.whl triton-*.whl
 ```
+=======
+```shell
+pip install triton
+```
+
+Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9.
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain.
 The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user).
 
 ```shell
+<<<<<<< HEAD
 # replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs
 source /opt/intel/oneapi/setvars.sh
+=======
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 ```
 
 # Install from source
 
+<<<<<<< HEAD
 ## Prerequisites
+=======
+```shell
+git clone https://github.com/triton-lang/triton.git;
+cd triton;
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
 2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
@@ -72,9 +104,14 @@ source /opt/intel/oneapi/setvars.sh
 Clone this repository:
 
 ```shell
+<<<<<<< HEAD
 git clone https://github.com/intel/intel-xpu-backend-for-triton.git
 cd intel-xpu-backend-for-triton
 ```
+=======
+git clone https://github.com/triton-lang/triton.git;
+cd triton;
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 To avoid potential conflicts with installed packages it is recommended to create and activate a new Python virtual environment:
 
@@ -205,6 +242,7 @@ For detailed instructions on how to debug Triton's frontend, please refer to thi
 
 # Usage Guide
 
+<<<<<<< HEAD
 ## Code Modifications
 Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels.
 
@@ -308,6 +346,14 @@ Note that the user needs to explicitly set `TRITON_XPU_PROFILE=1` when the user
 ```Bash
 export TRITON_XPU_PROFILE=1
 ```
+=======
+Version 2.0 is out! New features include:
+
+- Many, many bug fixes
+- Performance improvements
+- Backend rewritten to use MLIR
+- Support for kernels that contain back-to-back matmuls (e.g., flash attention)
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
 
 # Contributing
 
@@ -317,10 +363,24 @@ Community contributions are more than welcome, whether it be to fix bugs or to a
 
 _MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file.
 
+<<<<<<< HEAD
 
 ## Security
 
 See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html)
 for information on how to report a potential security issue or vulnerability.
 
 See also: [Security Policy](security.md)
+=======
+# Compatibility
+
+Supported Platforms:
+
+- Linux
+
+Supported Hardware:
+
+- NVIDIA GPUs (Compute Capability 8.0+)
+- AMD GPUs (ROCm 5.2+)
+- Under development: CPUs
+>>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597
````
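
As the `+` lines above show, these hunks commit literal Git conflict markers (`<<<<<<< HEAD`, `=======`, `>>>>>>> d6739d3c…`) into README.md — the state a file is left in when a conflicted merge is committed without resolving it. A quick Python sketch for locating any such leftovers (illustrative only; not part of the commit):

```python
import pathlib
import re

# Match the three standard conflict-marker forms at the start of a line.
marker = re.compile(r'^(<{7} |={7}$|>{7} )')
for n, line in enumerate(pathlib.Path("README.md").read_text().splitlines(), 1):
    if marker.match(line):
        print(f"README.md:{n}: {line}")
```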

docs/conf.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -145,7 +145,7 @@ def documenter(app, obj, parent):
 autosummary_generate = True
 
 # versioning config
-smv_tag_whitelist = r'^(v3.1.0)$'
+smv_tag_whitelist = r'^(v3.2.0)$'
 smv_branch_whitelist = r'^main$'
 smv_remote_whitelist = None
 smv_released_pattern = r'^tags/.*$'
```
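
`smv_tag_whitelist` drives sphinx-multiversion: only tags matching the regex get documentation builds, so this bump switches the published docs from the v3.1.0 tag to v3.2.0. A quick sketch of the anchored pattern's behaviour (note the dots are unescaped, so they match any character — harmless for real tag names):

```python
import re

# After the change, only the v3.2.0 tag matches the whitelist.
pattern = re.compile(r'^(v3.2.0)$')
print(bool(pattern.match('v3.2.0')))      # True
print(bool(pattern.match('v3.1.0')))      # False
print(bool(pattern.match('v3.2.0-rc1')))  # False: the $ anchor rejects suffixes
print(bool(pattern.match('v3x2y0')))      # True: the unescaped dots match any char
```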

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 0 additions & 7 deletions
```diff
@@ -162,13 +162,6 @@ struct ReduceOpConversion
 
     auto mod = op->getParentOfType<ModuleOp>();
     unsigned iWarpSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod);
-    if (iWarpSize > numLaneToReduce) {
-      Value threadId = getThreadId(rewriter, loc);
-      Value warpSize = i32_val(iWarpSize);
-      Value laneId = urem(threadId, warpSize);
-      Value lanePred = icmp_slt(laneId, i32_val(numLaneToReduce));
-      pred = pred ? and_(pred, lanePred) : lanePred;
-    }
 
     for (unsigned N = numLaneToReduce / 2; N > 0; N >>= 1) {
       SmallVector<Value> shfl(acc.size());
```
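
The deleted lines computed a per-lane predicate (`laneId < numLaneToReduce`) and folded it into `pred` when the warp was wider than the reduction. What remains is the log2-step shuffle loop, which halves the stride each iteration. A Python sketch of that halving pattern (illustrative only — the real code emits shuffle IR, and `combine` stands in for the reduce region):

```python
def warp_reduce(lane_values, combine):
    # Log2-step tree reduction: at each step, lane i also reads the value
    # held n lanes away, mirroring `for (N = numLaneToReduce / 2; N > 0; N >>= 1)`.
    n = len(lane_values) // 2
    while n > 0:
        lane_values = [
            combine(lane_values[i], lane_values[i + n]) if i + n < len(lane_values) else lane_values[i]
            for i in range(len(lane_values))
        ]
        n >>= 1
    return lane_values[0]  # lane 0 holds the full result

print(warp_reduce(list(range(8)), lambda a, b: a + b))  # 28
```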

lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -187,7 +187,7 @@ static void AddPartialReduce(SmallVector<SmallVector<Value>> &srcValues,
     }
     Value mask = icmp_sge(warpId, i32_val(i + 1));
     accumulator.acc =
-        accumulate(helper, rewriter, accumulator.acc, partialReduce, mask);
+        accumulate(helper, rewriter, accumulator.acc, partialReduce);
     for (unsigned j = 0; j < helper.getNumOperands(); ++j) {
       accumulator.maskedAcc[j] =
           select(mask, accumulator.acc[j], accumulator.maskedAcc[j]);
```
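
After this change, `accumulate` no longer receives the warp-comparison `mask`: the accumulator is updated unconditionally, and the mask matters only in the `select` that decides what lands in `maskedAcc`. A sketch of the post-change data flow in Python (all names illustrative; `combine` stands in for the scan's combine region):

```python
def add_partial(acc, masked_acc, partial, mask, combine):
    # The combine now runs unconditionally...
    acc = combine(acc, partial)
    # ...and the mask only gates the masked copy, mirroring
    # select(mask, accumulator.acc[j], accumulator.maskedAcc[j]).
    masked_acc = acc if mask else masked_acc
    return acc, masked_acc
```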

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -235,6 +235,12 @@ class BlockedToMMA : public mlir::OpRewritePattern<DotOp> {
                   mlir::PatternRewriter &rewriter) const override {
     if (computeCapability < 70)
       return failure();
+    if (computeCapability < 80) {
+      dotOp.emitRemark()
+          << "Dot op using MMA for compute capability " << computeCapability
+          << " has been deprecated. It falls back to the FMA path.";
+      return failure();
+    }
     // TODO: Check data-types and SM compatibility
     RankedTensorType oldRetType = dotOp.getType();
     if (!oldRetType.getEncoding() ||
```
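
The effect of the new guard: MMA lowering for `tt.dot` now requires compute capability 80 or higher; sm_70–sm_75 devices get a remark and fall through to the FMA path, just like pre-Volta targets. A sketch of the gating in plain Python (illustrative; returning `False` plays the role of `return failure()`, i.e. the pattern does not fire):

```python
def mma_pattern_applies(compute_capability):
    if compute_capability < 70:
        return False
    if compute_capability < 80:
        print(f"remark: Dot op using MMA for compute capability {compute_capability} "
              "has been deprecated. It falls back to the FMA path.")
        return False
    return True

print(mma_pattern_applies(75))  # emits the remark, then False
print(mma_pattern_applies(90))  # True
```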

python/setup.py

Lines changed: 8 additions & 7 deletions
```diff
@@ -460,15 +460,16 @@ def build_extension(self, ext):
                 "-DCMAKE_CXX_FLAGS=-fsanitize=address",
             ]
 
-        if check_env_flag("TRITON_BUILD_WITH_CCACHE"):
-            cmake_args += [
-                "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
-            ]
+        # environment variables we will pass through to cmake
+        passthrough_args = [
+            "TRITON_BUILD_PROTON",
+            "TRITON_BUILD_TUTORIALS",
+            "TRITON_BUILD_WITH_CCACHE",
+        ]
+        cmake_args += [f"-D{option}={os.getenv(option)}" for option in passthrough_args if option in os.environ]
 
         if check_env_flag("TRITON_BUILD_PROTON", "ON"):  # Default ON
             cmake_args += self.get_proton_cmake_args()
-        else:
-            cmake_args += ["-DTRITON_BUILD_PROTON=OFF"]
 
         if is_offline_build():
             # unit test builds fetch googletests from GitHub
@@ -701,7 +702,7 @@ def get_install_requires():
 
 setup(
     name=os.environ.get("TRITON_WHEEL_NAME", "triton"),
-    version="3.1.0" + get_git_commit_hash() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
+    version="3.2.0" + get_git_commit_hash() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
     author="Philippe Tillet",
     author_email="[email protected]",
     description="A language and compiler for custom Deep Learning operations",
```

python/test/unit/language/test_compile_errors.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -353,7 +353,7 @@ def test_fp8_support(dtype):
         supported_dtypes.append(tl.float8e4nv)
     elif is_hip():
         if is_hip_mi300():
-            supported_dtypes += [tl.float8e4b8, tl.float8e5b16]
+            supported_dtypes += [tl.float8e4nv, tl.float8e4b8, tl.float8e5b16]
     elif is_xpu():
         supported_dtypes += [tl.float8e4b15, tl.float8e4nv]
     elif is_interpreter():
```

python/test/unit/language/test_conversions.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -302,8 +302,13 @@ def upcast_test(src_dtype, dst_dtype, exponent_bits, mantissa_bits, exponent_bia
     ('float8e5b16', 'float16'),
 ])
 def test_typeconvert_upcast(src_dtype, dst_dtype, device):
+
+    # On HIP, fp8e4nv upcasting is only supported to bf16, and it's only supported on MI300.
+    if src_dtype == 'float8e4nv' and is_hip() and (dst_dtype != 'bfloat16' or not is_hip_mi300()):
+        pytest.skip(f"upcasting {src_dtype} to {dst_dtype} not supported in this architecture")
+
     if ((src_dtype == 'float8e4nv' and is_cuda() and torch.cuda.get_device_capability(0) < (8, 9))
-            or (src_dtype in ('float8e4nv', 'float8e4b15') and is_hip())
+            or (src_dtype in ('float8e4b15') and is_hip())
             or (src_dtype in ('float8e4b8', 'float8e5b16') and (is_cuda() or not is_hip_mi300()))):
         # If the dtype should error out in the given device, we assert that and return
         with pytest.raises(triton.CompilationError, match="not supported in this architecture"):
@@ -358,6 +363,9 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
     if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne' and (is_cuda() or not is_hip_mi300()):
         pytest.xfail(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU MI300")
 
+    if dst_dtype == 'float8e4nv' and is_hip():
+        pytest.skip(f"{dst_dtype} downcast not supported in HIP")
+
     # dtype : (exponent_bits, mantissa_bits, exponent_bias)
     stuff = {
         'float16': (5, 10, 15),
```
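
One detail worth noting in the updated upcast condition: `('float8e4b15')` has no trailing comma, so it is a plain string, and `in` performs a substring test rather than tuple membership. For the dtype names used in this parametrization the result happens to be the same, but the difference is easy to demonstrate:

```python
# ('float8e4b15') is a string; ('float8e4b15',) is a one-element tuple.
print('float8e4b15' in ('float8e4b15'))   # True  — exact substring match
print('e4b15' in ('float8e4b15'))         # True  — substring match, likely surprising
print('e4b15' in ('float8e4b15',))        # False — tuple membership
```

A one-element tuple `('float8e4b15',)` would state the intent unambiguously.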
