Update triton-benchmarks.yml

whitneywhtsang · whitneywhtsang · commit 7422d023c1db · 2024-11-28T02:33:51.000Z
Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
@@ -250,6 +250,19 @@ jobs:
         if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
         run: |
           cd benchmarks/triton_kernels_benchmark
+          TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
+          python flash_attention_fwd_benchmark.py --reports $REPORTS
+
+          source ../../scripts/capture-hw-details.sh
+          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
+
+      - name: Run Triton FA kernel benchmark (+ reduction)
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
+        run: |
+          cd benchmarks/triton_kernels_benchmark
+          TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
+          TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY=1 \
           python flash_attention_fwd_benchmark.py --reports $REPORTS
 
           source ../../scripts/capture-hw-details.sh
@@ -261,6 +274,7 @@ jobs:
         run: |
           cd benchmarks/triton_kernels_benchmark
           TRITON_INTEL_ADVANCED_PATH=0 \
+          TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 \
           TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
           IGC_VISAOptions=" -enableBCR" \
           python flash_attention_fwd_benchmark.py --reports $REPORTS
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
@@ -186,7 +186,8 @@ class DotOpDPASConversionHelper {
       auto RC = IntegerAttr::get(rewriter.getIntegerType(32),
                                  dpasEncoding.getRepeatCount());
       fc.at({b, m, n}) = rewriter.create<TritonGEN::MatrixDPASOp>(
-          loc, dTy, valc, valA, valB, pA, pB, RC);
+          loc, dTy, bitcast(valc, cTy), bitcast(valA, aTy), bitcast(valB, bTy),
+          pA, pB, RC);
     };
 
     ArrayRef<unsigned> repCluster = dpasEncoding.getRepCluster();
@@ -345,8 +346,7 @@ class DotOpDPASConversionHelper {
                                         i32_val(k));
               }
               vals[{b, i * repClusterOuter + repOuter,
-                    j * repClusterInner + repInner}] =
-                  bitcast(matVal, dotOperandType);
+                    j * repClusterInner + repInner}] = matVal;
             }
           }
         }

Original file line number	Diff line number	Diff line change
`@@ -186,7 +186,8 @@ class DotOpDPASConversionHelper {`
`186`	`186`	`auto RC = IntegerAttr::get(rewriter.getIntegerType(32),`
`187`	`187`	`dpasEncoding.getRepeatCount());`
`188`	`188`	`fc.at({b, m, n}) = rewriter.create<TritonGEN::MatrixDPASOp>(`
`189`		`- loc, dTy, valc, valA, valB, pA, pB, RC);`
	`189`	`+ loc, dTy, bitcast(valc, cTy), bitcast(valA, aTy), bitcast(valB, bTy),`
	`190`	`+ pA, pB, RC);`
`190`	`191`	`};`
`191`	`192`
`192`	`193`	`ArrayRef<unsigned> repCluster = dpasEncoding.getRepCluster();`
`@@ -345,8 +346,7 @@ class DotOpDPASConversionHelper {`
`345`	`346`	`i32_val(k));`
`346`	`347`	`}`
`347`	`348`	`vals[{b, i * repClusterOuter + repOuter,`
`348`		`- j * repClusterInner + repInner}] =`
`349`		`- bitcast(matVal, dotOperandType);`
	`349`	`+ j * repClusterInner + repInner}] = matVal;`
`350`	`350`	`}`
`351`	`351`	`}`
`352`	`352`	`}`