Skip to content

Commit 0c81300

Browse files
authored
[BENCHMARK][GEMM] Fix CUTLASS benchmark when running on BMG (#4274)
This PR resolves issue [4254](#4254) by passing multiple targets to `-fsycl-targets`, so that the CUTLASS kernel is built as a fat binary that supports both PVC and BMG. BMG benchmark CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/15200477831 --------- Signed-off-by: Jefferson Le Quellec <[email protected]>
1 parent b9da70d commit 0c81300

File tree

3 files changed

+8
-18
lines changed

3 files changed

+8
-18
lines changed

.github/workflows/triton-benchmarks.yml

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,7 @@ jobs:
150150
source ../../scripts/capture-hw-details.sh
151151
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
152152
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
153-
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
154-
# FIXME: enable cuttlass on bmg
155-
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
156-
fi
153+
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
157154
158155
- name: Run Triton GEMM kernel benchmark - with tensor of pointer
159156
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
@@ -164,10 +161,7 @@ jobs:
164161
source ../../scripts/capture-hw-details.sh
165162
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
166163
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
167-
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
168-
# FIXME: enable cuttlass on bmg
169-
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
170-
fi
164+
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
171165
172166
- name: Run Triton GEMM kernel benchmark - with tensor descriptor
173167
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
@@ -178,10 +172,7 @@ jobs:
178172
source ../../scripts/capture-hw-details.sh
179173
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
180174
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
181-
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
182-
# FIXME: enable cuttlass on bmg
183-
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
184-
fi
175+
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
185176
186177
- name: Run Triton GEMM (A@B^t) kernel benchmark
187178
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}

benchmarks/cutlass_kernel/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
set(CUTLASS_KERNEL_FLAGS ${CUTLASS_KERNEL_FLAGS}
22
-fsycl
3-
-fsycl-targets=intel_gpu_pvc
43
-fsycl-device-code-split=per_kernel
5-
-Xspirv-translator -spirv-ext=+SPV_INTEL_split_barrier
4+
-fsycl-targets=intel_gpu_pvc,intel_gpu_bmg_g21
5+
"SHELL:-Xspirv-translator=intel_gpu_pvc --spirv-ext=+SPV_INTEL_split_barrier"
6+
"SHELL:-Xspirv-translator=intel_gpu_bmg_g21 --spirv-ext=+SPV_INTEL_split_barrier"
67
)
78

89
Python3_add_library(cutlass_kernel MODULE WITH_SOABI python_main.cpp)
910

10-
target_compile_options(cutlass_kernel PRIVATE "-fsycl" "-fsycl-targets=intel_gpu_pvc" "-fpreview-breaking-changes")
11+
target_compile_options(cutlass_kernel PRIVATE "-fsycl" "-fsycl-targets=intel_gpu_pvc,intel_gpu_bmg_g21" "-fpreview-breaking-changes")
1112
target_compile_options(cutlass_kernel PRIVATE "-DCUTLASS_ENABLE_SYCL")
1213
target_compile_options(cutlass_kernel PRIVATE "-DSYCL_INTEL_TARGET")
1314

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -301,9 +301,7 @@ def get_benchmark(
301301
}
302302
# use_cutlass
303303
if not (transpose_a or transpose_b):
304-
if '580' not in torch.xpu.get_device_name():
305-
# FIXME: enable cutlass on bmg
306-
supported_providers['cutlass'] = 'CUTLASS'
304+
supported_providers['cutlass'] = 'CUTLASS'
307305
providers = benchmark_suite.filter_providers(supported_providers, providers_filter)
308306

309307
# Benchmark Performance

0 commit comments

Comments
 (0)