@@ -124,39 +124,6 @@ jobs:
124124 ls $PTI_LIBS_DIR
125125 echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
126126
127- - name : Run Triton Softmax kernel benchmark
128- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
129- run : |
130- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
131- cd benchmarks/triton_kernels_benchmark
132- python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
133- source ../../scripts/capture-hw-details.sh
134- python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
135- python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
136- python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-onednn-report.csv --benchmark softmax --compiler onednn --param_cols "N" --tflops_col oneDNN-TFlops --hbm_col "oneDNN-GB/s" --tag $TAG
137-
138- - name : Run Triton Softmax kernel benchmark with Proton
139- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
140- run : |
141- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
142- cd benchmarks/triton_kernels_benchmark
143- BENCHMARKING_METHOD=PROTON_PROFILER python fused_softmax.py
144- source ../../scripts/capture-hw-details.sh
145-
146- - name : Run Triton GEMM kernel benchmark
147- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
148- run : |
149- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
150- cd benchmarks/triton_kernels_benchmark
151- python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
152- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
153- source ../../scripts/capture-hw-details.sh
154- python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
155- python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
156- if [[ "${{ inputs.runner_label || 'max1550' }}" != "lnl" ]]; then
157- python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
158- fi
159-
160127 - name : Run Triton GEMM kernel benchmark - with tensor of pointer
161128 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
162129 run : |
@@ -183,124 +150,6 @@ jobs:
183150 python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
184151 fi
185152
186- - name : Run Triton GEMM (A@B^t) kernel benchmark
187- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
188- run : |
189- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
190- cd benchmarks/triton_kernels_benchmark
191- TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
192- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
193- source ../../scripts/capture-hw-details.sh
194-
195- python build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
196- python build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
197-
198- - name : Run Triton GEMM (A^t@B) kernel benchmark
199- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
200- run : |
201- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
202- cd benchmarks/triton_kernels_benchmark
203- TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
204- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
205- source ../../scripts/capture-hw-details.sh
206-
207- python build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
208- python build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
209-
210- - name : Run Triton GEMM (stream-k) kernel benchmark
211- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
212- run : |
213- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
214- cd benchmarks/triton_kernels_benchmark
215- python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
216- source ../../scripts/capture-hw-details.sh
217- python build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
218- python build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-xetla-report.csv --benchmark gemm-streamk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
219-
220- - name : Run Triton GEMM (split-k) kernel benchmark
221- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
222- run : |
223- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
224- cd benchmarks/triton_kernels_benchmark
225- python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
226- source ../../scripts/capture-hw-details.sh
227- python build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
228- python build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-xetla-report.csv --benchmark gemm-splitk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
229-
230- - name : Run Triton GEMM + PreOp (exp) kernel benchmark
231- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
232- run : |
233- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
234- cd benchmarks/triton_kernels_benchmark
235- python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
236- source ../../scripts/capture-hw-details.sh
237- python build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
238-
239- - name : Run Triton GEMM + PostOp (Gelu) kernel benchmark
240- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
241- run : |
242- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
243- cd benchmarks/triton_kernels_benchmark
244- python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
245- source ../../scripts/capture-hw-details.sh
246- python build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
247-
248- - name : Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
249- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
250- run : |
251- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
252- cd benchmarks/triton_kernels_benchmark
253- python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
254- source ../../scripts/capture-hw-details.sh
255- python build_report.py $REPORTS/matmul-performance-postop-addmatrix-bfloat16.csv $REPORTS/gemm-postop-addmatrix-bfloat16-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
256- python build_report.py $REPORTS/matmul-performance-postop-addmatrix-bfloat16.csv $REPORTS/gemm-postop-addmatrix-bfloat16-onednn-report.csv --benchmark gemm-postop-addmatrix --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
257-
258- - name : Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
259- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
260- run : |
261- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
262- cd benchmarks/triton_kernels_benchmark
263- INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
264- source ../../scripts/capture-hw-details.sh
265- python build_report.py $REPORTS/matmul-performance-postop-addmatrix-int8.csv $REPORTS/gemm-postop-addmatrix-int8-triton-report.csv --benchmark gemm-postop-addmatrix-int8 --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
266- python build_report.py $REPORTS/matmul-performance-postop-addmatrix-int8.csv $REPORTS/gemm-postop-addmatrix-int8-onednn-report.csv --benchmark gemm-postop-addmatrix-int8 --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
267-
268- - name : Run Triton FA fwd kernel benchmark
269- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
270- run : |
271- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
272- cd benchmarks/triton_kernels_benchmark
273- python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
274-
275- source ../../scripts/capture-hw-details.sh
276- python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark flash-attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
277- python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-cutlass-report.csv --benchmark flash-attn --compiler cutlass --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
278-
279- - name : Run Triton FA bwd kernel benchmark
280- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
281- run : |
282- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
283- cd benchmarks/triton_kernels_benchmark
284- FA_KERNEL_MODE="bwd" \
285- python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
286- mv $REPORTS/attn-performance.csv $REPORTS/attn-bwd-performance.csv
287-
288- source ../../scripts/capture-hw-details.sh
289- python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-triton-report.csv --benchmark flash-attn-bwd --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
290- python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-xetla-report.csv --benchmark flash-attn-bwd --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
291-
292- - name : Run Triton FA fwd kernel benchmark - with tensor descriptors
293- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
294- run : |
295- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
296- cd benchmarks/triton_kernels_benchmark
297- python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
298- mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
299-
300- source ../../scripts/capture-hw-details.sh
301- python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-triton-report.csv --benchmark flash-attn-tensor-desc --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
302- python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-cutlass-report.csv --benchmark flash-attn-tensor-desc --compiler cutlass --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
303-
304153 - name : Run Triton FlexAttention Causal Mask fwd kernel benchmark
305154 if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
306155 run : |
@@ -360,22 +209,6 @@ jobs:
360209 python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-onednn-report.csv --benchmark flex-attn-masks --compiler onednn --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG --mask
361210 fi
362211
363- - name : Run Prefix Sums kernel benchmark
364- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
365- run : |
366- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
367- cd benchmarks/triton_kernels_benchmark
368- python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
369- source ../../scripts/capture-hw-details.sh
370- python build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "M,N,AXIS" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
371-
372- - name : Run micro benchmark
373- if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
374- run : |
375- export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
376- cd benchmarks/micro_benchmarks
377- python run_benchmarks.py --reports $REPORTS
378-
379212 - name : Upload benchmark reports
380213 if : ${{ steps.install.outcome == 'success' && !cancelled() }}
381214 uses : actions/upload-artifact@v5
0 commit comments