Skip to content

Commit f9edefa

Browse files
committed
Test
1 parent 0fe7349 commit f9edefa

File tree

2 files changed

+10
-140
lines changed

2 files changed

+10
-140
lines changed

.github/workflows/triton-benchmarks.yml

Lines changed: 8 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -111,120 +111,6 @@ jobs:
111111
cd benchmarks
112112
python setup.py install
113113
114-
- name: Run Triton Softmax kernel benchmark
115-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
116-
run: |
117-
cd benchmarks/triton_kernels_benchmark
118-
python fused_softmax.py --reports $REPORTS
119-
source ../../scripts/capture-hw-details.sh
120-
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
121-
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
122-
123-
- name: Run Triton GEMM kernel benchmark
124-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
125-
run: |
126-
cd benchmarks/triton_kernels_benchmark
127-
python gemm_benchmark.py --reports $REPORTS
128-
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
129-
130-
source ../../scripts/capture-hw-details.sh
131-
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
132-
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
133-
134-
- name: Run Triton GEMM kernel benchmark - default path
135-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
136-
run: |
137-
cd benchmarks/triton_kernels_benchmark
138-
# Default path:
139-
TRITON_INTEL_ADVANCED_PATH=0 \
140-
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
141-
IGC_VISAOptions=" -enableBCR -nolocalra" \
142-
IGC_DisableLoopUnroll=1 \
143-
python gemm_benchmark.py --reports $REPORTS
144-
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
145-
146-
source ../../scripts/capture-hw-details.sh
147-
TAG="${TAG}-dflt"
148-
python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
149-
150-
- name: Run Triton GEMM kernel benchmark - advanced path
151-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
152-
run: |
153-
cd benchmarks/triton_kernels_benchmark
154-
# Advanced path:
155-
TRITON_INTEL_ADVANCED_PATH=1 \
156-
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
157-
IGC_VISAOptions=" -enableBCR -nolocalra" \
158-
IGC_DisableLoopUnroll=1 \
159-
python gemm_benchmark.py --reports $REPORTS
160-
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
161-
162-
source ../../scripts/capture-hw-details.sh
163-
TAG="${TAG}-adv"
164-
python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
165-
166-
- name: Run Triton GEMM (A@B^t) kernel benchmark
167-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
168-
run: |
169-
cd benchmarks/triton_kernels_benchmark
170-
TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS
171-
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
172-
source ../../scripts/capture-hw-details.sh
173-
174-
python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
175-
python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
176-
177-
- name: Run Triton GEMM (A^t@B) kernel benchmark
178-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
179-
run: |
180-
cd benchmarks/triton_kernels_benchmark
181-
TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS
182-
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
183-
source ../../scripts/capture-hw-details.sh
184-
185-
python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
186-
python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
187-
188-
- name: Run Triton GEMM (stream-k) kernel benchmark
189-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
190-
run: |
191-
cd benchmarks/triton_kernels_benchmark
192-
python gemm_streamk_benchmark.py --reports $REPORTS
193-
source ../../scripts/capture-hw-details.sh
194-
python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
195-
196-
- name: Run Triton GEMM (split-k) kernel benchmark
197-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
198-
run: |
199-
cd benchmarks/triton_kernels_benchmark
200-
python gemm_splitk_benchmark.py --reports $REPORTS
201-
source ../../scripts/capture-hw-details.sh
202-
python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
203-
204-
- name: Run Triton GEMM + PreOp (exp) kernel benchmark
205-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
206-
run: |
207-
cd benchmarks/triton_kernels_benchmark
208-
python gemm_preop_exp_benchmark.py --reports $REPORTS
209-
source ../../scripts/capture-hw-details.sh
210-
python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
211-
212-
- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
213-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
214-
run: |
215-
cd benchmarks/triton_kernels_benchmark
216-
python gemm_postop_gelu_benchmark.py --reports $REPORTS
217-
source ../../scripts/capture-hw-details.sh
218-
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
219-
220-
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
221-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
222-
run: |
223-
cd benchmarks/triton_kernels_benchmark
224-
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
225-
source ../../scripts/capture-hw-details.sh
226-
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
227-
228114
- name: Run Triton FA kernel benchmark
229115
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
230116
run: |
@@ -235,45 +121,28 @@ jobs:
235121
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
236122
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
237123
238-
- name: Run Triton FA kernel benchmark - default path
124+
- name: Run Triton FA kernel benchmark - with opt
239125
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
240126
run: |
241127
cd benchmarks/triton_kernels_benchmark
242-
TRITON_INTEL_ADVANCED_PATH=0 \
243-
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
244-
IGC_VISAOptions=" -enableBCR" \
128+
TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 \
245129
python flash_attention_fwd_benchmark.py --reports $REPORTS
246130
247-
TAG="${TAG}-dflt"
248131
source ../../scripts/capture-hw-details.sh
249-
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
132+
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
133+
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
250134
251-
- name: Run Triton FA kernel benchmark - advanced path
135+
- name: Run Triton FA kernel benchmark - with opt and BCR
252136
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
253137
run: |
254138
cd benchmarks/triton_kernels_benchmark
255-
TRITON_INTEL_ADVANCED_PATH=1 \
256-
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
139+
TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 \
257140
IGC_VISAOptions=" -enableBCR" \
258141
python flash_attention_fwd_benchmark.py --reports $REPORTS
259142
260-
TAG="${TAG}-adv"
261-
source ../../scripts/capture-hw-details.sh
262-
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
263-
264-
- name: Run Prefix Sums kernel benchmark
265-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
266-
run: |
267-
cd benchmarks/triton_kernels_benchmark
268-
python prefix_sums.py --reports $REPORTS
269143
source ../../scripts/capture-hw-details.sh
270-
python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
271-
272-
- name: Run micro benchmark
273-
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
274-
run: |
275-
cd benchmarks/micro_benchmarks
276-
python run_benchmarks.py --reports $REPORTS
144+
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
145+
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
277146
278147
- name: Save pip cache
279148
if: ${{ steps.pip-cache.outputs.status == 'miss' }}

third_party/intel/backend/compiler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,8 @@ def make_ttgir(mod, metadata, opt, properties):
251251
passes.common.add_cse(pm)
252252
passes.ttgpuir.add_prefetch(pm)
253253
passes.ttgpuir.add_optimize_dot_operands(pm, True)
254-
intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
254+
if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == 1:
255+
intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
255256
intel.passes.ttgpuir.add_remove_layout_conversions(pm)
256257
intel.passes.ttgpuir.add_reduce_data_duplication(pm)
257258
passes.ttgpuir.add_reorder_instructions(pm)

0 commit comments

Comments
 (0)