13 changes: 12 additions & 1 deletion .github/workflows/build-test-reusable.yml
@@ -285,9 +285,20 @@ jobs:
run: |
echo "TRITON_TEST_CMD=${{ needs.build.outputs.test-triton-command }}" | tee -a $GITHUB_ENV

- name: Run Proton tests
- name: Build PTI && Run Proton tests
if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
run: |
git clone https://github.com/intel/pti-gpu.git
cd pti-gpu
git checkout 15a201d25e5659692613b98ee33513263b689101
cd sdk
cmake --preset linux-icpx-release
BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release

PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"

export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
Contributor Author commented:

This version of PTI depends on ze_loader 1.24.2, while agama 1146 ships 1.21.* (agama 1188 ships 1.24.*, so that driver is most likely fine). That is probably why this variable needs to be set for the tests to work.

export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR
cd third_party/proton/test
# FIXME: re-enable 'test_record.py'
pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
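
A note on verifying the loader mismatch described in the comment above: a quick sanity check on the runner could list the ze_loader actually visible to the dynamic linker before running the tests (a sketch only, not part of this PR; library paths vary by driver packaging):

    # Sketch: show which libze_loader the dynamic linker resolves.
    ldconfig -p | grep libze_loader
    # The version suffix of the installed file (e.g. .so.1.21 vs .so.1.24) hints at the shipped loader.
    ls -l /usr/lib/x86_64-linux-gnu/libze_loader.so* 2>/dev/null || true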
35 changes: 35 additions & 0 deletions .github/workflows/triton-benchmarks.yml
@@ -116,9 +116,24 @@ jobs:
cd benchmarks
pip install .

- name: Build PTI from source
id: build-pti
run: |
git clone https://github.com/intel/pti-gpu.git
cd pti-gpu
git checkout 15a201d25e5659692613b98ee33513263b689101
cd sdk
cmake --preset linux-icpx-release
BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release

PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
ls $PTI_LIBS_DIR
echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV

- name: Run Triton Softmax kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -129,6 +144,7 @@
- name: Run Triton GEMM kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
@@ -142,6 +158,7 @@
- name: Run Triton GEMM kernel benchmark - with tensor of pointer
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -154,6 +171,7 @@
- name: Run Triton GEMM kernel benchmark - with tensor descriptor
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -166,6 +184,7 @@
- name: Run Triton GEMM (A@B^t) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
@@ -177,6 +196,7 @@
- name: Run Triton GEMM (A^t@B) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
@@ -188,6 +208,7 @@
- name: Run Triton GEMM (stream-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -197,6 +218,7 @@
- name: Run Triton GEMM (split-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -206,6 +228,7 @@
- name: Run Triton GEMM + PreOp (exp) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -214,6 +237,7 @@
- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -222,6 +246,7 @@
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -231,6 +256,7 @@
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -240,6 +266,7 @@
- name: Run Triton FA fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS

@@ -250,6 +277,7 @@
- name: Run Triton FA bwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
FA_KERNEL_MODE="bwd" \
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
@@ -262,6 +290,7 @@
- name: Run Triton FA fwd kernel benchmark - with tensor descriptors
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
@@ -273,6 +302,7 @@
- name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS

@@ -283,6 +313,7 @@
- name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS

@@ -293,6 +324,7 @@
- name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS

@@ -303,6 +335,7 @@
- name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS

@@ -316,6 +349,7 @@
- name: Run Prefix Sums kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
@@ -324,6 +358,7 @@
- name: Run micro benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/micro_benchmarks
python run_benchmarks.py --reports $REPORTS

6 changes: 3 additions & 3 deletions python/tutorials/09-persistent-matmul.py
@@ -673,7 +673,7 @@ def bench_fn(label, reps, warmup_reps, fn, *args):
for _ in range(warmup_reps):
fn(*args)
#FIXME: Enable for XPU once proton support works.
if is_cuda():
if True or is_cuda():
with proton_context():
for _ in range(reps):
fn(*args)
@@ -783,11 +783,11 @@ def show_profile(precision, profile_name):

validate(32, 32, 32, dtype)
validate(8192, 8192, args.K_range[0], dtype)
if is_cuda():
if True or is_cuda():
proton.start("matmul", hook="triton")
proton.deactivate()
for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
bench(K, dtype)
if is_cuda():
if True or is_cuda():
proton.finalize()
show_profile(args.prec, "matmul")
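
The `True or is_cuda()` guards above short-circuit to always true, temporarily enabling the proton profiling path on XPU while keeping the original CUDA-only condition visible for when the FIXME is resolved. A cleaner final form, assuming a hypothetical `is_xpu()` helper symmetric to `is_cuda()` (not defined in this tutorial), might read:

    def proton_supported() -> bool:
        # Sketch only: `is_xpu` is an assumed helper mirroring `is_cuda`.
        return is_cuda() or is_xpu()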
23 changes: 22 additions & 1 deletion third_party/intel/backend/proton/include/pti/pti.h
@@ -31,7 +31,9 @@ typedef enum {
//!< PTI_VIEW_EXTERNAL_CORRELATION
PTI_ERROR_BAD_TIMESTAMP = 6, //!< error in timestamp conversion, might be related with the user
//!< provided TimestampCallback
PTI_ERROR_BAD_API_ID = 7, //!< invalid api_id when enable/disable runtime/driver specific api_id
PTI_ERROR_NO_GPU_VIEWS_ENABLED = 8, //!< at least one GPU view must be enabled for kernel tracing

PTI_ERROR_DRIVER = 50, //!< unknown driver error
PTI_ERROR_TRACING_NOT_INITIALIZED = 51, //!< installed driver requires tracing enabling with
//!< setting environment variable ZE_ENABLE_TRACING_LAYER
Expand All @@ -57,6 +59,25 @@ typedef enum {
*/
PTI_EXPORT const char* ptiResultTypeToString(pti_result result_value);


/**
* @brief Abstraction for backend-specific objects.
*
* Level Zero is currently the only supported backend; if additional backends
* are supported in the future, these same types will serve them as well.
*/

typedef void* pti_device_handle_t; //!< Device handle

typedef void* pti_backend_ctx_t; //!< Backend context handle

typedef void* pti_backend_queue_t; //!< Backend queue handle

typedef void* pti_backend_evt_t; //!< Backend event handle

typedef void* pti_backend_command_list_t; //!< Backend command list handle


#if defined(__cplusplus)
}
#endif
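
Because these handles are opaque `void*` aliases, a consumer that knows the active backend is Level Zero can cast them back to the corresponding `ze_*` handle types. A minimal sketch, assuming the Level Zero headers are available; the cast is an assumption that holds only for the Level Zero backend:

    #include <level_zero/ze_api.h>
    #include "pti/pti.h"

    /* Sketch: recover the underlying Level Zero queue from a PTI handle. */
    static ze_command_queue_handle_t as_ze_queue(pti_backend_queue_t queue) {
      return (ze_command_queue_handle_t)queue;
    }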