@@ -111,120 +111,6 @@ jobs:
111111 cd benchmarks
112112 python setup.py install
113113
114- - name : Run Triton Softmax kernel benchmark
115- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
116- run : |
117- cd benchmarks/triton_kernels_benchmark
118- python fused_softmax.py --reports $REPORTS
119- source ../../scripts/capture-hw-details.sh
120- python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
121- python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
122-
123- - name : Run Triton GEMM kernel benchmark
124- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
125- run : |
126- cd benchmarks/triton_kernels_benchmark
127- python gemm_benchmark.py --reports $REPORTS
128- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
129-
130- source ../../scripts/capture-hw-details.sh
131- python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
132- python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
133-
134- - name : Run Triton GEMM kernel benchmark - default path
135- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
136- run : |
137- cd benchmarks/triton_kernels_benchmark
138- # Default path:
139- TRITON_INTEL_ADVANCED_PATH=0 \
140- TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
141- IGC_VISAOptions=" -enableBCR -nolocalra" \
142- IGC_DisableLoopUnroll=1 \
143- python gemm_benchmark.py --reports $REPORTS
144- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
145-
146- source ../../scripts/capture-hw-details.sh
147- TAG="${TAG}-dflt"
148- python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
149-
150- - name : Run Triton GEMM kernel benchmark - advanced path
151- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
152- run : |
153- cd benchmarks/triton_kernels_benchmark
154- # Advanced path:
155- TRITON_INTEL_ADVANCED_PATH=1 \
156- TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
157- IGC_VISAOptions=" -enableBCR -nolocalra" \
158- IGC_DisableLoopUnroll=1 \
159- python gemm_benchmark.py --reports $REPORTS
160- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
161-
162- source ../../scripts/capture-hw-details.sh
163- TAG="${TAG}-adv"
164- python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
165-
166- - name : Run Triton GEMM (A@B^t) kernel benchmark
167- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
168- run : |
169- cd benchmarks/triton_kernels_benchmark
170- TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS
171- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
172- source ../../scripts/capture-hw-details.sh
173-
174- python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
175- python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
176-
177- - name : Run Triton GEMM (A^t@B) kernel benchmark
178- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
179- run : |
180- cd benchmarks/triton_kernels_benchmark
181- TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS
182- mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
183- source ../../scripts/capture-hw-details.sh
184-
185- python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
186- python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
187-
188- - name : Run Triton GEMM (stream-k) kernel benchmark
189- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
190- run : |
191- cd benchmarks/triton_kernels_benchmark
192- python gemm_streamk_benchmark.py --reports $REPORTS
193- source ../../scripts/capture-hw-details.sh
194- python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
195-
196- - name : Run Triton GEMM (split-k) kernel benchmark
197- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
198- run : |
199- cd benchmarks/triton_kernels_benchmark
200- python gemm_splitk_benchmark.py --reports $REPORTS
201- source ../../scripts/capture-hw-details.sh
202- python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
203-
204- - name : Run Triton GEMM + PreOp (exp) kernel benchmark
205- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
206- run : |
207- cd benchmarks/triton_kernels_benchmark
208- python gemm_preop_exp_benchmark.py --reports $REPORTS
209- source ../../scripts/capture-hw-details.sh
210- python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
211-
212- - name : Run Triton GEMM + PostOp (Gelu) kernel benchmark
213- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
214- run : |
215- cd benchmarks/triton_kernels_benchmark
216- python gemm_postop_gelu_benchmark.py --reports $REPORTS
217- source ../../scripts/capture-hw-details.sh
218- python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
219-
220- - name : Run Triton GEMM + PostOp (add matrix) kernel benchmark
221- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
222- run : |
223- cd benchmarks/triton_kernels_benchmark
224- python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
225- source ../../scripts/capture-hw-details.sh
226- python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
227-
228114 - name : Run Triton FA kernel benchmark
229115 if : ${{ steps.install.outcome == 'success' && !cancelled() }}
230116 run : |
@@ -235,45 +121,28 @@ jobs:
235121 python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
236122 python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
237123
238- - name : Run Triton FA kernel benchmark - default path
124+ - name : Run Triton FA kernel benchmark - with opt
239125 if : ${{ steps.install.outcome == 'success' && !cancelled() }}
240126 run : |
241127 cd benchmarks/triton_kernels_benchmark
242- TRITON_INTEL_ADVANCED_PATH=0 \
243- TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
244- IGC_VISAOptions=" -enableBCR" \
128+ TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 \
245129 python flash_attention_fwd_benchmark.py --reports $REPORTS
246130
247- TAG="${TAG}-dflt"
248131 source ../../scripts/capture-hw-details.sh
249- python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
132+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-opt-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "${TAG}-opt"  # own file + "-opt" tag: the baseline FA step already writes attn-triton-report.csv with $TAG
133+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-opt-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "${TAG}-opt"  # own file + "-opt" tag: keeps the baseline attn-xetla-report.csv intact
250134
251- - name : Run Triton FA kernel benchmark - advanced path
135+ - name : Run Triton FA kernel benchmark - with opt and BCR
252136 if : ${{ steps.install.outcome == 'success' && !cancelled() }}
253137 run : |
254138 cd benchmarks/triton_kernels_benchmark
255- TRITON_INTEL_ADVANCED_PATH=1 \
256- TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
139+ TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 \
257140 IGC_VISAOptions=" -enableBCR" \
258141 python flash_attention_fwd_benchmark.py --reports $REPORTS
259142
260- TAG="${TAG}-adv"
261- source ../../scripts/capture-hw-details.sh
262- python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
263-
264- - name : Run Prefix Sums kernel benchmark
265- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
266- run : |
267- cd benchmarks/triton_kernels_benchmark
268- python prefix_sums.py --reports $REPORTS
269143 source ../../scripts/capture-hw-details.sh
270- python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
271-
272- - name : Run micro benchmark
273- if : ${{ steps.install.outcome == 'success' && !cancelled() }}
274- run : |
275- cd benchmarks/micro_benchmarks
276- python run_benchmarks.py --reports $REPORTS
144+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-opt-bcr-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "${TAG}-opt-bcr"  # own file + "-opt-bcr" tag: attn-triton-report.csv with $TAG is already written by the baseline FA step
145+ python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-opt-bcr-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "${TAG}-opt-bcr"  # own file + "-opt-bcr" tag: keeps the baseline attn-xetla-report.csv intact
277146
278147 - name : Save pip cache
279148 if : ${{ steps.pip-cache.outputs.status == 'miss' }}
0 commit comments