
Commit a9e92c0

Integrate PTI callback interface and build it from sources (#5289)
Closes #3852. This is the first step toward obtaining profiling data in more complex cases where the previous workaround did not work. Packaging of PTI wheels will be organized separately, since PTI is not yet ready for it (there is no setup.py/pyproject.toml). FYI @jfedorov. The next immediate step is to improve the pass rate and clean up old code.

---------

Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 64f7385 commit a9e92c0

File tree

14 files changed: +476 -44 lines


.github/pins/pti.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+15a201d25e5659692613b98ee33513263b689101

.github/workflows/build-test-reusable.yml

Lines changed: 14 additions & 1 deletion
@@ -285,9 +285,22 @@ jobs:
         run: |
           echo "TRITON_TEST_CMD=${{ needs.build.outputs.test-triton-command }}" | tee -a $GITHUB_ENV
 
-      - name: Run Proton tests
+      - name: Build PTI && Run Proton tests
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
+          PTI_COMMIT_ID="$(<.github/pins/pti.txt)"
+          git clone https://github.com/intel/pti-gpu.git
+          cd pti-gpu
+          git checkout $PTI_COMMIT_ID
+          cd sdk
+          cmake --preset linux-icpx-release
+          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
+
+          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+          cd ../..
+
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
+          export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR
           cd third_party/proton/test
           # FIXME: enable 'test_record.py' back
           pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
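
For local debugging outside CI, the same flow can be replayed by hand. Below is a minimal sketch assembled from the commands in the step above; apart from the shell safety flags and the empty-variable guard on LD_LIBRARY_PATH, the commands are taken verbatim from the diff. It assumes the oneAPI toolchain used by the linux-icpx-release preset (icpx) is already set up and that it is run from the repository root.

#!/usr/bin/env bash
# Sketch: build PTI at the pinned commit and run the Proton tests against it.
set -euo pipefail

PTI_COMMIT_ID="$(<.github/pins/pti.txt)"   # pinned PTI revision
git clone https://github.com/intel/pti-gpu.git
cd pti-gpu
git checkout "$PTI_COMMIT_ID"
cd sdk
cmake --preset linux-icpx-release
BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release

PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
cd ../..

# Make the freshly built PTI libraries visible to Proton's XPUPTI backend.
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:${LD_LIBRARY_PATH:-}
export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR

cd third_party/proton/test
pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v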

.github/workflows/triton-benchmarks.yml

Lines changed: 35 additions & 0 deletions
@@ -116,9 +116,24 @@ jobs:
           cd benchmarks
           pip install .
 
+      - name: Build PTI from source
+        run: |
+          PTI_COMMIT_ID="$(<.github/pins/pti.txt)"
+          git clone https://github.com/intel/pti-gpu.git
+          cd pti-gpu
+          git checkout $PTI_COMMIT_ID
+          cd sdk
+          cmake --preset linux-icpx-release
+          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
+
+          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+          ls $PTI_LIBS_DIR
+          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
+
       - name: Run Triton Softmax kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -129,6 +144,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
@@ -142,6 +158,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark - with tensor of pointer
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -154,6 +171,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark - with tensor descriptor
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -166,6 +184,7 @@ jobs:
       - name: Run Triton GEMM (A@B^t) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
@@ -177,6 +196,7 @@ jobs:
       - name: Run Triton GEMM (A^t@B) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
@@ -188,6 +208,7 @@ jobs:
       - name: Run Triton GEMM (stream-k) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -197,6 +218,7 @@ jobs:
       - name: Run Triton GEMM (split-k) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -206,6 +228,7 @@ jobs:
       - name: Run Triton GEMM + PreOp (exp) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -214,6 +237,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -222,6 +246,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -231,6 +256,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -240,6 +266,7 @@ jobs:
       - name: Run Triton FA fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -250,6 +277,7 @@ jobs:
       - name: Run Triton FA bwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           FA_KERNEL_MODE="bwd" \
           python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
@@ -262,6 +290,7 @@ jobs:
       - name: Run Triton FA fwd kernel benchmark - with tensor descriptors
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
@@ -273,6 +302,7 @@ jobs:
       - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -283,6 +313,7 @@ jobs:
       - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -293,6 +324,7 @@ jobs:
       - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -303,6 +335,7 @@ jobs:
       - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -316,6 +349,7 @@ jobs:
       - name: Run Prefix Sums kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -324,6 +358,7 @@ jobs:
       - name: Run micro benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/micro_benchmarks
           python run_benchmarks.py --reports $REPORTS
 
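
In this workflow the "Build PTI from source" step stores PTI_LIBS_DIR in $GITHUB_ENV once, and every benchmark step then prepends it to LD_LIBRARY_PATH before launching Python. A minimal local sketch of running one benchmark the same way is below; it assumes PTI was built as in the step above and that $REPORTS and $N_RUNS are set the way the workflow sets them.

# Sketch: run a single benchmark against the locally built PTI libraries.
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:${LD_LIBRARY_PATH:-}
cd benchmarks/triton_kernels_benchmark
python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS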

third_party/intel/backend/proton/include/pti/pti.h

Lines changed: 22 additions & 1 deletion
@@ -31,7 +31,9 @@ typedef enum {
                                    //!< PTI_VIEW_EXTERNAL_CORRELATION
   PTI_ERROR_BAD_TIMESTAMP = 6,     //!< error in timestamp conversion, might be related with the user
                                    //!< provided TimestampCallback
-  PTI_ERROR_BAD_API_ID = 7,        //!< invalid api_id when enable/disable runtime/driver specific api_id
+  PTI_ERROR_BAD_API_ID = 7,            //!< invalid api_id when enable/disable runtime/driver specific api_id
+  PTI_ERROR_NO_GPU_VIEWS_ENABLED = 8,  //!< at least one GPU view must be enabled for kernel tracing
+
   PTI_ERROR_DRIVER = 50,           //!< unknown driver error
   PTI_ERROR_TRACING_NOT_INITIALIZED = 51,  //!< installed driver requires tracing enabling with
                                    //!< setting environment variable ZE_ENABLE_TRACING_LAYER
@@ -57,6 +59,25 @@ typedef enum {
 */
 PTI_EXPORT const char* ptiResultTypeToString(pti_result result_value);
 
+
+/**
+ * @brief Abstraction for backend-specific objects.
+ *
+ * Level Zero is currently the only supported backend. However, these types will attempt to serve other backends.
+ * In case the other backend supported - the same types will serve it.
+ */
+
+typedef void* pti_device_handle_t;  //!< Device handle
+
+typedef void* pti_backend_ctx_t;  //!< Backend context handle
+
+typedef void* pti_backend_queue_t;  //!< Backend queue handle
+
+typedef void* pti_backend_evt_t;  //!< Backend event handle
+
+typedef void* pti_backend_command_list_t;  //!< Backend command list handle
+
+
 #if defined(__cplusplus)
 }
 #endif
