From 268ef042c747026247d572e4a08267025b02c609 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Sun, 27 Oct 2024 13:16:42 +0000 Subject: [PATCH 1/3] Test removal of TRITON_INTEL_ENABLE_INSTR_SCHED Signed-off-by: Whitney Tsang --- .github/workflows/triton-benchmarks.yml | 150 ++---------------------- 1 file changed, 7 insertions(+), 143 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index d68ed5df74..0210e95355 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -111,149 +111,27 @@ jobs: cd benchmarks python setup.py install - - name: Run Triton Softmax kernel benchmark + - name: Run Triton FA kernel benchmark - advanced path (w/ TRITON_INTEL_ENABLE_INSTR_SCHED) if: ${{ steps.install.outcome == 'success' && !cancelled() }} run: | cd benchmarks/triton_kernels_benchmark - python fused_softmax.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - - - name: Run Triton GEMM kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv - - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - - - name: Run Triton GEMM kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - # Default path: - TRITON_INTEL_ADVANCED_PATH=0 \ - TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ - IGC_VISAOptions=" -enableBCR -nolocalra" \ - IGC_DisableLoopUnroll=1 \ - python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv - - source ../../scripts/capture-hw-details.sh - TAG="${TAG}-dflt" - python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - # Advanced path: + rm -rf ~/.triton/cache TRITON_INTEL_ADVANCED_PATH=1 \ - TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ - IGC_VISAOptions=" -enableBCR -nolocalra" \ - IGC_DisableLoopUnroll=1 \ - python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv - - source ../../scripts/capture-hw-details.sh - TAG="${TAG}-adv" - python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM (A@B^t) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv - source ../../scripts/capture-hw-details.sh - - python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - - - name: Run Triton GEMM (A^t@B) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv - source ../../scripts/capture-hw-details.sh - - python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - - - name: Run Triton GEMM (stream-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_streamk_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM (split-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_splitk_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM + PreOp (exp) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_preop_exp_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_postop_gelu_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_postop_addmatrix_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton FA kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python flash_attention_fwd_benchmark.py --reports $REPORTS - - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - - - name: Run Triton FA kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - TRITON_INTEL_ADVANCED_PATH=0 \ + TRITON_INTEL_ENABLE_INSTR_SCHED=1 \ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ IGC_VISAOptions=" -enableBCR" \ python flash_attention_fwd_benchmark.py --reports $REPORTS - TAG="${TAG}-dflt" + TAG="${TAG}-adv" source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - name: Run Triton FA kernel benchmark - advanced path + - name: Run Triton FA kernel benchmark - advanced path (w/o TRITON_INTEL_ENABLE_INSTR_SCHED) if: ${{ steps.install.outcome == 'success' && !cancelled() }} run: | cd benchmarks/triton_kernels_benchmark + rm -rf ~/.triton/cache TRITON_INTEL_ADVANCED_PATH=1 \ - TRITON_INTEL_ENABLE_INSTR_SCHED=1 \ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ IGC_VISAOptions=" -enableBCR" \ python flash_attention_fwd_benchmark.py --reports $REPORTS @@ -262,20 +140,6 @@ jobs: source ../../scripts/capture-hw-details.sh python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - name: Run Prefix Sums kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/triton_kernels_benchmark - python prefix_sums.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run micro benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - cd benchmarks/micro_benchmarks - python run_benchmarks.py --reports $REPORTS - - name: Save pip cache if: ${{ steps.pip-cache.outputs.status == 'miss' }} uses: ./.github/actions/save From 55095c8cac51dce23762fa84431b9ecf0c9238bf Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Sun, 27 Oct 2024 14:14:28 +0000 Subject: [PATCH 2/3] Revert "Test removal of TRITON_INTEL_ENABLE_INSTR_SCHED" This reverts commit 35c3e09dbfc263d436b6230ad29595a089bcadff. --- .github/workflows/triton-benchmarks.yml | 150 ++++++++++++++++++++++-- 1 file changed, 143 insertions(+), 7 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 0210e95355..d68ed5df74 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -111,27 +111,149 @@ jobs: cd benchmarks python setup.py install - - name: Run Triton FA kernel benchmark - advanced path (w/ TRITON_INTEL_ENABLE_INSTR_SCHED) + - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() }} run: | cd benchmarks/triton_kernels_benchmark - rm -rf ~/.triton/cache + python fused_softmax.py --reports $REPORTS + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG + + - name: Run Triton GEMM kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python gemm_benchmark.py --reports $REPORTS + mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv + + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG + + - name: Run Triton GEMM kernel benchmark - default path + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + # Default path: + TRITON_INTEL_ADVANCED_PATH=0 \ + TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ + IGC_VISAOptions=" -enableBCR -nolocalra" \ + IGC_DisableLoopUnroll=1 \ + python gemm_benchmark.py --reports $REPORTS + mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv + + source ../../scripts/capture-hw-details.sh + TAG="${TAG}-dflt" + python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run Triton GEMM kernel benchmark - advanced path + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + # Advanced path: TRITON_INTEL_ADVANCED_PATH=1 \ - TRITON_INTEL_ENABLE_INSTR_SCHED=1 \ + TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ + IGC_VISAOptions=" -enableBCR -nolocalra" \ + IGC_DisableLoopUnroll=1 \ + python gemm_benchmark.py --reports $REPORTS + mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv + + source ../../scripts/capture-hw-details.sh + TAG="${TAG}-adv" + python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run Triton GEMM (A@B^t) kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS + mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv + source ../../scripts/capture-hw-details.sh + + python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG + + - name: Run Triton GEMM (A^t@B) kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS + mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv + source ../../scripts/capture-hw-details.sh + + python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG + + - name: Run Triton GEMM (stream-k) kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python gemm_streamk_benchmark.py --reports $REPORTS + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run Triton GEMM (split-k) kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python gemm_splitk_benchmark.py --reports $REPORTS + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run Triton GEMM + PreOp (exp) kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python gemm_preop_exp_benchmark.py --reports $REPORTS + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python gemm_postop_gelu_benchmark.py --reports $REPORTS + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python gemm_postop_addmatrix_benchmark.py --reports $REPORTS + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run Triton FA kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python flash_attention_fwd_benchmark.py --reports $REPORTS + + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG + + - name: Run Triton FA kernel benchmark - default path + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + TRITON_INTEL_ADVANCED_PATH=0 \ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ IGC_VISAOptions=" -enableBCR" \ python flash_attention_fwd_benchmark.py --reports $REPORTS - TAG="${TAG}-adv" + TAG="${TAG}-dflt" source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - name: Run Triton FA kernel benchmark - advanced path (w/o TRITON_INTEL_ENABLE_INSTR_SCHED) + - name: Run Triton FA kernel benchmark - advanced path if: ${{ steps.install.outcome == 'success' && !cancelled() }} run: | cd benchmarks/triton_kernels_benchmark - rm -rf ~/.triton/cache TRITON_INTEL_ADVANCED_PATH=1 \ + TRITON_INTEL_ENABLE_INSTR_SCHED=1 \ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ IGC_VISAOptions=" -enableBCR" \ python flash_attention_fwd_benchmark.py --reports $REPORTS @@ -140,6 +262,20 @@ jobs: source ../../scripts/capture-hw-details.sh python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + - name: Run Prefix Sums kernel benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/triton_kernels_benchmark + python prefix_sums.py --reports $REPORTS + source ../../scripts/capture-hw-details.sh + python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG + + - name: Run micro benchmark + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + cd benchmarks/micro_benchmarks + python run_benchmarks.py --reports $REPORTS + - name: Save pip cache if: ${{ steps.pip-cache.outputs.status == 'miss' }} uses: ./.github/actions/save From cb128498e370a2adbc4005e60467344604e2a8fc Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Sun, 27 Oct 2024 14:16:21 +0000 Subject: [PATCH 3/3] [FA] Remove TRITON_INTEL_ENABLE_INSTR_SCHED Signed-off-by: Whitney Tsang --- .github/workflows/triton-benchmarks.yml | 1 - scripts/test-triton.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index d68ed5df74..94e419646b 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -253,7 +253,6 @@ jobs: run: | cd benchmarks/triton_kernels_benchmark TRITON_INTEL_ADVANCED_PATH=1 \ - TRITON_INTEL_ENABLE_INSTR_SCHED=1 \ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ IGC_VISAOptions=" -enableBCR" \ python flash_attention_fwd_benchmark.py --reports $REPORTS diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 3a62421b7f..1f8681e701 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -285,7 +285,6 @@ run_benchmark_attention() { echo "Advanced path:" TRITON_INTEL_ADVANCED_PATH=1 \ TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \ - TRITON_INTEL_ENABLE_INSTR_SCHED=1 \ IGC_VISAOptions=" -enableBCR" \ python $TRITON_PROJ/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py }