Skip to content

Commit f916e47

Browse files
[GEMM] Stop running XeTLA (#4248)
As continued development of `XeTLA` has been stopped, the team decided to use other implementations as reference, e.g., `oneDNN` and `CUTLASS`. Signed-off-by: Whitney Tsang <[email protected]>
1 parent dca7748 commit f916e47

File tree

4 files changed

+5
-29
lines changed

4 files changed

+5
-29
lines changed

.github/workflows/triton-benchmarks.yml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -133,17 +133,6 @@ jobs:
133133
python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
134134
135135
- name: Run Triton GEMM kernel benchmark
136-
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
137-
run: |
138-
cd benchmarks/triton_kernels_benchmark
139-
NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
140-
source ../../scripts/capture-hw-details.sh
141-
python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-triton-report.csv --benchmark gemm-legacy --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
142-
python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm-legacy --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
143-
python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-onednn-report.csv --benchmark gemm-legacy --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
144-
python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-cutlass-report.csv --benchmark gemm-legacy --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
145-
146-
- name: Run Triton GEMM kernel benchmark - new shapes
147136
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }}
148137
run: |
149138
cd benchmarks/triton_kernels_benchmark

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -232,15 +232,13 @@ def get_shapes(B, M, N, K, transpose_a, transpose_b):
232232
return a_shape, b_shape
233233

234234

235-
NEW_X_VALS = [ #
235+
X_VALS = [ #
236+
[1, 1024 * i, 1024 * i, 1024 * i] for i in [1, 2, 4, 8]
237+
] + [ #
236238
[1, m, n, 4096] for m in [1, 8] for n in [1024, 4096, 6144, 14336, 28672, 128256]
237239
] + [ #
238240
[1, m, 4096, 14336] for m in [1, 8]
239241
] + [ #
240-
[1, 8192, 4096, 4096] #
241-
]
242-
243-
X_VALS = [[1, 1024 * i, 1024 * i, 1024 * i] for i in [1, 2, 4, 8]] + [
244242
[1, 1, 13824, 5120],
245243
[1, 4, 12288, 4096],
246244
[1, 512, 8192, 8192],
@@ -261,6 +259,7 @@ def get_shapes(B, M, N, K, transpose_a, transpose_b):
261259
[32, 4096, 128, 4096],
262260
[4096, 8, 128, 16384],
263261
[4096, 8, 16384, 128],
262+
[1, 8192, 4096, 4096],
264263
]
265264

266265
DEVICE_NAME = torch.xpu.get_device_name()
@@ -281,16 +280,13 @@ def is_enough_memory(x_val):
281280
return enough_memory
282281

283282

284-
if os.getenv('NEW_SHAPES', '1') == '1':
285-
X_VALS += NEW_X_VALS
286283
X_VALS = [x_val for x_val in X_VALS if is_enough_memory(x_val)]
287284

288285

289286
def get_benchmark(
290287
providers_filter: Optional[list[str]] = None,
291288
transpose_a=False,
292289
transpose_b=False,
293-
new_shapes=False,
294290
matmul_kernel=matmul_kernel_with_block_pointers,
295291
matmul_kernel_batched=matmul_kernel_with_block_pointers_batched,
296292
plot_name='matmul-performance',
@@ -303,10 +299,8 @@ def get_benchmark(
303299
'triton': 'Triton',
304300
'onednn': 'OneDNN',
305301
}
306-
# use_xetla and use_cutlass
302+
# use_cutlass
307303
if not (transpose_a or transpose_b):
308-
if not new_shapes:
309-
supported_providers['xetla'] = 'XeTLA'
310304
supported_providers['cutlass'] = 'CUTLASS'
311305
providers = benchmark_suite.filter_providers(supported_providers, providers_filter)
312306

@@ -457,6 +451,5 @@ def cutlass_invoker():
457451
_benchmark = get_benchmark(
458452
transpose_a=(os.getenv('TRANSPOSE_A', '0') == '1'),
459453
transpose_b=(os.getenv('TRANSPOSE_B', '0') == '1'),
460-
new_shapes=(os.getenv('NEW_SHAPES', '1') == '1'),
461454
)
462455
_benchmark.run(show_plots=False, print_data=True)

benchmarks/triton_kernels_benchmark/gemm_tensor_desc_benchmark.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ def get_benchmark(
117117
providers_filter: Optional[List[str]] = None,
118118
transpose_a=False,
119119
transpose_b=False,
120-
new_shapes=True,
121120
):
122121
return gemm_benchmark.get_benchmark(
123122
providers_filter=providers_filter,
@@ -126,14 +125,12 @@ def get_benchmark(
126125
plot_name='matmul-tensor-desc-performance',
127126
transpose_a=transpose_a,
128127
transpose_b=transpose_b,
129-
new_shapes=new_shapes,
130128
)
131129

132130

133131
if __name__ == '__main__':
134132
_benchmark = get_benchmark(
135133
transpose_a=(os.getenv('TRANSPOSE_A', '0') == '1'),
136134
transpose_b=(os.getenv('TRANSPOSE_B', '0') == '1'),
137-
new_shapes=(os.getenv('NEW_SHAPES', '1') == '1'),
138135
)
139136
_benchmark.run(show_plots=False, print_data=True)

benchmarks/triton_kernels_benchmark/gemm_tensor_of_ptr_benchmark.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ def get_benchmark(
124124
providers_filter: Optional[List[str]] = None,
125125
transpose_a=False,
126126
transpose_b=False,
127-
new_shapes=True,
128127
):
129128
return gemm_benchmark.get_benchmark(
130129
providers_filter=providers_filter,
@@ -133,14 +132,12 @@ def get_benchmark(
133132
plot_name='matmul-tensor-of-ptr-performance',
134133
transpose_a=transpose_a,
135134
transpose_b=transpose_b,
136-
new_shapes=new_shapes,
137135
)
138136

139137

140138
if __name__ == '__main__':
141139
_benchmark = get_benchmark(
142140
transpose_a=(os.getenv('TRANSPOSE_A', '0') == '1'),
143141
transpose_b=(os.getenv('TRANSPOSE_B', '0') == '1'),
144-
new_shapes=(os.getenv('NEW_SHAPES', '1') == '1'),
145142
)
146143
_benchmark.run(show_plots=False, print_data=True)

0 commit comments

Comments
 (0)