@@ -60,6 +60,8 @@ permissions: read-all
6060
6161env :
6262 PYTHON_VERSION : " 3.10"
63+ # FIXME: this should no longer be needed in the next version of pti (most likely 0.12.3)
64+ PTI_DEVICE_SYNC_DELTA : " 1"
6365 BENCHMARKING_METHOD : ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }}
6466 VERIFY : ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
6567 TAG : ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
@@ -122,10 +124,16 @@ jobs:
122124 run : |
123125 cd benchmarks
124126 pip install .
127+ pip install intel-pti==0.12.2
128+ PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')")
129+ # the output should contain: `libpti.so`, `libpti_metrics.so.0.12.2` and `libpti_view.so.0.12.2`
130+ ls $PTI_LIBS_DIR
131+ echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
125132
126133 - name : Run Triton Softmax kernel benchmark
127134 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
128135 run : |
136+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
129137 cd benchmarks/triton_kernels_benchmark
130138 python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
131139 source ../../scripts/capture-hw-details.sh
@@ -135,37 +143,50 @@ jobs:
135143 - name : Run Triton GEMM kernel benchmark
136144 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }}
137145 run : |
146+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
138147 cd benchmarks/triton_kernels_benchmark
139148 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
140149 mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
141150 source ../../scripts/capture-hw-details.sh
142151 python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
143152 python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
144- python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
153+ if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
154+ # FIXME: enable CUTLASS on bmg; until then the CUTLASS report is built only when the runner label is max1550 (the default)
155+ python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
156+ fi
145157
146158 - name : Run Triton GEMM kernel benchmark - with tensor of pointer
147159 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
148160 run : |
161+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
149162 cd benchmarks/triton_kernels_benchmark
150163 python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
151164 source ../../scripts/capture-hw-details.sh
152165 python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
153166 python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
154- python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
167+ if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
168+ # FIXME: enable CUTLASS on bmg; until then the CUTLASS report is built only when the runner label is max1550 (the default)
169+ python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
170+ fi
155171
156172 - name : Run Triton GEMM kernel benchmark - with tensor descriptor
157173 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
158174 run : |
175+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
159176 cd benchmarks/triton_kernels_benchmark
160177 python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
161178 source ../../scripts/capture-hw-details.sh
162179 python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
163180 python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
164- python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
181+ if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
182+ # FIXME: enable CUTLASS on bmg; until then the CUTLASS report is built only when the runner label is max1550 (the default)
183+ python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
184+ fi
165185
166186 - name : Run Triton GEMM (A@B^t) kernel benchmark
167187 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
168188 run : |
189+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
169190 cd benchmarks/triton_kernels_benchmark
170191 TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
171192 mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
@@ -177,6 +198,7 @@ jobs:
177198 - name : Run Triton GEMM (A^t@B) kernel benchmark
178199 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
179200 run : |
201+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
180202 cd benchmarks/triton_kernels_benchmark
181203 TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
182204 mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
@@ -188,6 +210,7 @@ jobs:
188210 - name : Run Triton GEMM (stream-k) kernel benchmark
189211 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
190212 run : |
213+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
191214 cd benchmarks/triton_kernels_benchmark
192215 python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
193216 source ../../scripts/capture-hw-details.sh
@@ -197,6 +220,7 @@ jobs:
197220 - name : Run Triton GEMM (split-k) kernel benchmark
198221 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
199222 run : |
223+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
200224 cd benchmarks/triton_kernels_benchmark
201225 python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
202226 source ../../scripts/capture-hw-details.sh
@@ -206,6 +230,7 @@ jobs:
206230 - name : Run Triton GEMM + PreOp (exp) kernel benchmark
207231 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
208232 run : |
233+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
209234 cd benchmarks/triton_kernels_benchmark
210235 python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
211236 source ../../scripts/capture-hw-details.sh
@@ -214,6 +239,7 @@ jobs:
214239 - name : Run Triton GEMM + PostOp (Gelu) kernel benchmark
215240 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
216241 run : |
242+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
217243 cd benchmarks/triton_kernels_benchmark
218244 python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
219245 source ../../scripts/capture-hw-details.sh
@@ -222,6 +248,7 @@ jobs:
222248 - name : Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
223249 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
224250 run : |
251+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
225252 cd benchmarks/triton_kernels_benchmark
226253 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
227254 source ../../scripts/capture-hw-details.sh
@@ -231,6 +258,7 @@ jobs:
231258 - name : Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
232259 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
233260 run : |
261+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
234262 cd benchmarks/triton_kernels_benchmark
235263 INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
236264 source ../../scripts/capture-hw-details.sh
@@ -240,6 +268,7 @@ jobs:
240268 - name : Run Triton FA fwd kernel benchmark
241269 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
242270 run : |
271+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
243272 cd benchmarks/triton_kernels_benchmark
244273 python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
245274
@@ -250,6 +279,7 @@ jobs:
250279 - name : Run Triton FA bwd kernel benchmark
251280 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
252281 run : |
282+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
253283 cd benchmarks/triton_kernels_benchmark
254284 FA_KERNEL_MODE="bwd" \
255285 python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
@@ -262,6 +292,7 @@ jobs:
262292 - name : Run Triton FA fwd kernel benchmark - with tensor descriptors
263293 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
264294 run : |
295+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
265296 cd benchmarks/triton_kernels_benchmark
266297 python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
267298 mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
@@ -273,6 +304,7 @@ jobs:
273304 - name : Run Triton FlexAttention Causal Mask fwd kernel benchmark
274305 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
275306 run : |
307+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
276308 cd benchmarks/triton_kernels_benchmark
277309 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
278310
@@ -283,6 +315,7 @@ jobs:
283315 - name : Run Triton FlexAttention Custom Masks fwd kernel benchmark
284316 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
285317 run : |
318+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
286319 cd benchmarks/triton_kernels_benchmark
287320 python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
288321
@@ -293,6 +326,7 @@ jobs:
293326 - name : Run Prefix Sums kernel benchmark
294327 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
295328 run : |
329+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
296330 cd benchmarks/triton_kernels_benchmark
297331 python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
298332 source ../../scripts/capture-hw-details.sh
@@ -301,6 +335,7 @@ jobs:
301335 - name : Run micro benchmark
302336 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
303337 run : |
338+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
304339 cd benchmarks/micro_benchmarks
305340 python run_benchmarks.py --reports $REPORTS
306341
0 commit comments