@@ -111,27 +111,149 @@ jobs:
111111 cd benchmarks
112112 python setup.py install
113113
# Softmax benchmark step. In this diff view the '-' lines (old FA step header and
# the `rm -rf ~/.triton/cache` wipe) are replaced by the '+' lines.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# The script runs fused_softmax.py, sources capture-hw-details.sh, then calls
# build_report.py twice on softmax-performance.csv: once selecting the Triton-*
# columns and once selecting the XeTLA-* columns.
114- - name : Run Triton FA kernel benchmark - advanced path (w/ TRITON_INTEL_ENABLE_INSTR_SCHED)
114+ - name : Run Triton Softmax kernel benchmark
115115 if : ${{ steps.install.outcome == 'success' && !cancelled() }}
116116 run : |
117117 cd benchmarks/triton_kernels_benchmark
118- rm -rf ~/.triton/cache
118+ python fused_softmax.py --reports $REPORTS
119+ source ../../scripts/capture-hw-details.sh
120+ python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
121+ python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
122+
# Baseline GEMM benchmark step (no extra environment overrides on the python call).
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# matmul-performance.csv is renamed to matmul-performance-base.csv before reporting;
# the other gemm_benchmark.py steps in this file rename the same file to their own
# variant-specific names.
# Two reports are built from the renamed CSV: Triton-* columns and XeTLA-* columns.
123+ - name : Run Triton GEMM kernel benchmark
124+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
125+ run : |
126+ cd benchmarks/triton_kernels_benchmark
127+ python gemm_benchmark.py --reports $REPORTS
128+ mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
129+
130+ source ../../scripts/capture-hw-details.sh
131+ python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
132+ python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
133+
# GEMM benchmark with TRITON_INTEL_ADVANCED_PATH=0.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# The four VAR=value lines ending in '\' are command-prefix assignments: they apply
# only to the single gemm_benchmark.py invocation they precede.
# The result CSV is renamed to matmul-performance-default-path.csv, and TAG gets a
# "-dflt" suffix (after capture-hw-details.sh is sourced) before the Triton-only report.
134+ - name : Run Triton GEMM kernel benchmark - default path
135+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
136+ run : |
137+ cd benchmarks/triton_kernels_benchmark
138+ # Default path:
139+ TRITON_INTEL_ADVANCED_PATH=0 \
140+ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
141+ IGC_VISAOptions=" -enableBCR -nolocalra" \
142+ IGC_DisableLoopUnroll=1 \
143+ python gemm_benchmark.py --reports $REPORTS
144+ mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
145+
146+ source ../../scripts/capture-hw-details.sh
147+ TAG="${TAG}-dflt"
148+ python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
149+
# GEMM benchmark with TRITON_INTEL_ADVANCED_PATH=1.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# In this diff view TRITON_INTEL_ENABLE_INSTR_SCHED=1 ('-' line) is dropped from the
# environment prefix; the remaining VAR=value lines apply only to the
# gemm_benchmark.py invocation they precede.
# The result CSV is renamed to matmul-performance-adv-path.csv, and TAG gets an
# "-adv" suffix (after capture-hw-details.sh is sourced) before the Triton-only report.
150+ - name : Run Triton GEMM kernel benchmark - advanced path
151+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
152+ run : |
153+ cd benchmarks/triton_kernels_benchmark
154+ # Advanced path:
119155 TRITON_INTEL_ADVANCED_PATH=1 \
120- TRITON_INTEL_ENABLE_INSTR_SCHED=1 \
156+ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
157+ IGC_VISAOptions=" -enableBCR -nolocalra" \
158+ IGC_DisableLoopUnroll=1 \
159+ python gemm_benchmark.py --reports $REPORTS
160+ mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
161+
162+ source ../../scripts/capture-hw-details.sh
163+ TAG="${TAG}-adv"
164+ python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
165+
# GEMM benchmark run with TRANSPOSE_B=1 set for the gemm_benchmark.py invocation only.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# The result CSV is renamed to matmul-performance-bt.csv; two "gemm-bt" reports are
# built from it, one from the Triton-* columns and one from the onednn-* columns.
166+ - name : Run Triton GEMM (A@B^t) kernel benchmark
167+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
168+ run : |
169+ cd benchmarks/triton_kernels_benchmark
170+ TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS
171+ mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
172+ source ../../scripts/capture-hw-details.sh
173+
174+ python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
175+ python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
176+
# GEMM benchmark run with TRANSPOSE_A=1 set for the gemm_benchmark.py invocation only.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# The result CSV is renamed to matmul-performance-at.csv; two "gemm-at" reports are
# built from it, one from the Triton-* columns and one from the onednn-* columns.
177+ - name : Run Triton GEMM (A^t@B) kernel benchmark
178+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
179+ run : |
180+ cd benchmarks/triton_kernels_benchmark
181+ TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS
182+ mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
183+ source ../../scripts/capture-hw-details.sh
184+
185+ python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
186+ python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
187+
# Stream-k GEMM benchmark step.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Runs gemm_streamk_benchmark.py, sources capture-hw-details.sh, then builds a single
# Triton report from matmul-streamk-performance.csv. Parameter columns are "M,K,N"
# here (no "B" column, unlike the gemm_benchmark.py steps).
188+ - name : Run Triton GEMM (stream-k) kernel benchmark
189+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
190+ run : |
191+ cd benchmarks/triton_kernels_benchmark
192+ python gemm_streamk_benchmark.py --reports $REPORTS
193+ source ../../scripts/capture-hw-details.sh
194+ python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
195+
# Split-k GEMM benchmark step.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Runs gemm_splitk_benchmark.py, sources capture-hw-details.sh, then builds a single
# Triton report from matmul-splitk-performance.csv with parameter columns "M,K,N".
196+ - name : Run Triton GEMM (split-k) kernel benchmark
197+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
198+ run : |
199+ cd benchmarks/triton_kernels_benchmark
200+ python gemm_splitk_benchmark.py --reports $REPORTS
201+ source ../../scripts/capture-hw-details.sh
202+ python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
203+
# GEMM with an exp pre-op benchmark step.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Runs gemm_preop_exp_benchmark.py, sources capture-hw-details.sh, then builds a single
# Triton report from matmul-performance-preop-exp.csv (read directly, no rename here).
204+ - name : Run Triton GEMM + PreOp (exp) kernel benchmark
205+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
206+ run : |
207+ cd benchmarks/triton_kernels_benchmark
208+ python gemm_preop_exp_benchmark.py --reports $REPORTS
209+ source ../../scripts/capture-hw-details.sh
210+ python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
211+
# GEMM with a Gelu post-op benchmark step.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Runs gemm_postop_gelu_benchmark.py, sources capture-hw-details.sh, then builds a single
# Triton report from matmul-performance-postop-gelu.csv (read directly, no rename here).
212+ - name : Run Triton GEMM + PostOp (Gelu) kernel benchmark
213+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
214+ run : |
215+ cd benchmarks/triton_kernels_benchmark
216+ python gemm_postop_gelu_benchmark.py --reports $REPORTS
217+ source ../../scripts/capture-hw-details.sh
218+ python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
219+
# GEMM with an add-matrix post-op benchmark step.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Runs gemm_postop_addmatrix_benchmark.py, sources capture-hw-details.sh, then builds a
# single Triton report from matmul-performance-postop-addmatrix.csv (no rename here).
220+ - name : Run Triton GEMM + PostOp (add matrix) kernel benchmark
221+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
222+ run : |
223+ cd benchmarks/triton_kernels_benchmark
224+ python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
225+ source ../../scripts/capture-hw-details.sh
226+ python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
227+
# Baseline flash-attention forward benchmark step (no environment overrides).
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Two "attn" reports are built from attn-performance.csv: Triton-* columns and
# XeTLA-* columns.
# NOTE(review): unlike the GEMM steps, attn-performance.csv is not renamed after the
# run, and the FA default/advanced path steps read the same file name — presumably
# each run overwrites it; confirm that is intended.
228+ - name : Run Triton FA kernel benchmark
229+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
230+ run : |
231+ cd benchmarks/triton_kernels_benchmark
232+ python flash_attention_fwd_benchmark.py --reports $REPORTS
233+
234+ source ../../scripts/capture-hw-details.sh
235+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
236+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
237+
# Flash-attention forward benchmark with TRITON_INTEL_ADVANCED_PATH=0.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# The VAR=value lines ending in '\' are command-prefix assignments that apply only to
# the flash_attention_fwd_benchmark.py invocation they precede.
# TAG gets a "-dflt" suffix and a single Triton report is written to
# attn-triton-default-report.csv. The report path and the TAG suffix must not contain
# a space: an embedded space makes the shell pass "-report.csv" to build_report.py as
# a separate argument.
# NOTE(review): TAG is suffixed before capture-hw-details.sh is sourced here, whereas
# the GEMM default/advanced path steps do it after — confirm the script does not
# reset TAG.
238+ - name : Run Triton FA kernel benchmark - default path
239+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
240+ run : |
241+ cd benchmarks/triton_kernels_benchmark
242+ TRITON_INTEL_ADVANCED_PATH=0 \
121243 TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
122244 IGC_VISAOptions=" -enableBCR" \
123245 python flash_attention_fwd_benchmark.py --reports $REPORTS
124246
125- TAG="${TAG}-adv"
247+ TAG="${TAG}-dflt"
126248 source ../../scripts/capture-hw-details.sh
127- python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
249+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
128250
# Flash-attention forward benchmark with TRITON_INTEL_ADVANCED_PATH=1.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# In this diff view the step is renamed, the `rm -rf ~/.triton/cache` wipe is removed
# ('-' line) and TRITON_INTEL_ENABLE_INSTR_SCHED=1 is added ('+' line) to the
# environment prefix of the flash_attention_fwd_benchmark.py invocation.
# A single Triton report is written to attn-triton-advanced-report.csv.
129- - name : Run Triton FA kernel benchmark - advanced path (w/o TRITON_INTEL_ENABLE_INSTR_SCHED)
251+ - name : Run Triton FA kernel benchmark - advanced path
130252 if : ${{ steps.install.outcome == 'success' && !cancelled() }}
131253 run : |
132254 cd benchmarks/triton_kernels_benchmark
133- rm -rf ~/.triton/cache
134255 TRITON_INTEL_ADVANCED_PATH=1 \
256+ TRITON_INTEL_ENABLE_INSTR_SCHED=1 \
135257 TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
136258 IGC_VISAOptions=" -enableBCR" \
137259 python flash_attention_fwd_benchmark.py --reports $REPORTS
# NOTE(review): the hunk header below means some lines of this step are not shown in
# this view (presumably including where TAG is adjusted) — check the full file.
@@ -140,6 +262,20 @@ jobs:
140262 source ../../scripts/capture-hw-details.sh
141263 python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
142264
# Prefix-sums benchmark step.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Runs prefix_sums.py, sources capture-hw-details.sh, then builds a single Triton
# report from prefix-sums.csv with parameter column "N".
# NOTE(review): the input is named prefix-sums.csv (hyphen, no "-performance" suffix)
# while the report and --benchmark value use prefix_sums (underscore) — confirm the
# input name matches what prefix_sums.py writes.
265+ - name : Run Prefix Sums kernel benchmark
266+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
267+ run : |
268+ cd benchmarks/triton_kernels_benchmark
269+ python prefix_sums.py --reports $REPORTS
270+ source ../../scripts/capture-hw-details.sh
271+ python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
272+
# Micro-benchmark step.
# Gated on the `install` step's outcome being 'success' and the run not being cancelled.
# Runs run_benchmarks.py from benchmarks/micro_benchmarks with --reports $REPORTS;
# this step does not call build_report.py.
273+ - name : Run micro benchmark
274+ if : ${{ steps.install.outcome == 'success' && !cancelled() }}
275+ run : |
276+ cd benchmarks/micro_benchmarks
277+ python run_benchmarks.py --reports $REPORTS
278+
143279 - name : Save pip cache
144280 if : ${{ steps.pip-cache.outputs.status == 'miss' }}
145281 uses : ./.github/actions/save
0 commit comments