From 2ea9563fc389102cf00ceb8e75de04910b689f29 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Fri, 10 Oct 2025 14:13:21 +0000
Subject: [PATCH 1/9] [DEBUG] Integrate PTI callback interface and build it
 from sources

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .github/workflows/triton-benchmarks.yml       |  15 ++
 python/tutorials/09-persistent-matmul.py      |   6 +-
 .../intel/backend/proton/include/pti/pti.h    |  23 +-
 .../backend/proton/include/pti/pti_callback.h | 234 ++++++++++++++++++
 .../pti/pti_driver_levelzero_api_ids.h        |  15 +-
 .../backend/proton/include/pti/pti_metrics.h  |   2 -
 .../include/pti/pti_runtime_sycl_api_ids.h    |  49 +++-
 .../backend/proton/include/pti/pti_version.h  |   8 +-
 .../backend/proton/include/pti/pti_view.h     |   8 +-
 .../csrc/include/Driver/GPU/XpuptiApi.h       |  17 ++
 .../proton/csrc/lib/Driver/GPU/XpuptiApi.cpp  |  13 +
 .../lib/Profiler/Xpupti/XpuptiProfiler.cpp    | 111 ++++++++-
 third_party/proton/test/test_profile.py       |  22 +-
 13 files changed, 477 insertions(+), 46 deletions(-)
 create mode 100644 third_party/intel/backend/proton/include/pti/pti_callback.h

diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
index 449a13bd46..904d8aef18 100644
--- a/.github/workflows/triton-benchmarks.yml
+++ b/.github/workflows/triton-benchmarks.yml
@@ -116,9 +116,24 @@ jobs:
           cd benchmarks
           pip install .
 
+      - name: Build PTI from source
+        id: build-pti
+        run: |
+          git clone https://github.com/intel/pti-gpu.git
+          cd pti-gpu
+          git checkout 15a201d25e5659692613b98ee33513263b689101
+          cd sdk
+          cmake --preset linux-icpx-release
+          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
+
+          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+          ls $PTI_LIBS_DIR
+          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
+
       - name: Run Triton Softmax kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
diff --git a/python/tutorials/09-persistent-matmul.py b/python/tutorials/09-persistent-matmul.py
index f354769615..93cf85241f 100644
--- a/python/tutorials/09-persistent-matmul.py
+++ b/python/tutorials/09-persistent-matmul.py
@@ -673,7 +673,7 @@ def bench_fn(label, reps, warmup_reps, fn, *args):
     for _ in range(warmup_reps):
         fn(*args)
     #FIXME: Enable for XPU once proton support works.
-    if is_cuda():
+    if True or is_cuda():
         with proton_context():
             for _ in range(reps):
                 fn(*args)
@@ -783,11 +783,11 @@ def show_profile(precision, profile_name):
 
         validate(32, 32, 32, dtype)
         validate(8192, 8192, args.K_range[0], dtype)
-        if is_cuda():
+        if True or is_cuda():
             proton.start("matmul", hook="triton")
             proton.deactivate()
         for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
             bench(K, dtype)
-        if is_cuda():
+        if True or is_cuda():
             proton.finalize()
             show_profile(args.prec, "matmul")
diff --git a/third_party/intel/backend/proton/include/pti/pti.h b/third_party/intel/backend/proton/include/pti/pti.h
index 3bd6a3d363..512a839154 100644
--- a/third_party/intel/backend/proton/include/pti/pti.h
+++ b/third_party/intel/backend/proton/include/pti/pti.h
@@ -31,7 +31,9 @@ typedef enum {
                                           //!< PTI_VIEW_EXTERNAL_CORRELATION
   PTI_ERROR_BAD_TIMESTAMP = 6,            //!< error in timestamp conversion, might be related with the user
                                           //!< provided TimestampCallback
-  PTI_ERROR_BAD_API_ID = 7,               //!< invalid api_id when enable/disable runtime/driver specific api_id 
+  PTI_ERROR_BAD_API_ID = 7,               //!< invalid api_id when enable/disable runtime/driver specific api_id
+  PTI_ERROR_NO_GPU_VIEWS_ENABLED = 8,     //!< at least one GPU view must be enabled for kernel tracing
+
   PTI_ERROR_DRIVER = 50,                  //!< unknown driver error
   PTI_ERROR_TRACING_NOT_INITIALIZED = 51,  //!< installed driver requires tracing enabling with
                                            //!< setting environment variable ZE_ENABLE_TRACING_LAYER
@@ -57,6 +59,25 @@ typedef enum {
  */
 PTI_EXPORT const char* ptiResultTypeToString(pti_result result_value);
 
+
+/**
+ * @brief Abstraction for backend-specific objects.
+ *
+ * Level Zero is currently the only supported backend. However, these types will attempt to serve other backends.
+ * In case the other backend supported - the same types will serve it.
+ */
+
+typedef void* pti_device_handle_t;  //!< Device handle
+
+typedef void* pti_backend_ctx_t;    //!< Backend context handle
+
+typedef void* pti_backend_queue_t;  //!< Backend queue handle
+
+typedef void* pti_backend_evt_t;    //!< Backend event handle
+
+typedef void* pti_backend_command_list_t; //!< Backend command list handle
+
+
 #if defined(__cplusplus)
 }
 #endif
diff --git a/third_party/intel/backend/proton/include/pti/pti_callback.h b/third_party/intel/backend/proton/include/pti/pti_callback.h
new file mode 100644
index 0000000000..0659cec7fa
--- /dev/null
+++ b/third_party/intel/backend/proton/include/pti/pti_callback.h
@@ -0,0 +1,234 @@
+//==============================================================
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_CALLBACK_H_
+#define PTI_CALLBACK_H_
+
+#include <stdint.h>
+
+#include "pti/pti.h"
+#include "pti/pti_view.h"
+
+/**
+ * This file contains APIs that are so far experimental in PTI.
+ * APIs and data structures in this file are work-in-progress and subject to change!
+ * All content in this file concerns the Callback API.
+ *
+ * The Callback API is useful for many purposes,
+ * including the implementation of `MetricsScope` functionality that needs to subscribe to
+ * domains such as kernel append to a command list, and potentially other domains.
+ * The `MetricsScope` API is under development and is the first (internal) user of the Callback API.
+ */
+
+
+/* clang-format off */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct _pti_callback_subscriber* pti_callback_subscriber_handle;
+
+typedef enum _pti_callback_domain {
+  PTI_CB_DOMAIN_INVALID                            = 0,
+  PTI_CB_DOMAIN_DRIVER_CONTEXT_CREATED             = 1, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_MODULE_LOADED               = 2, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_MODULE_UNLOADED             = 3, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED      = 4, //!< Synchronous callback
+                                                     //!< This also serves as PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED
+                                                     //!< when appended to Immediate Command List,
+                                                     //!< which means no separate callback PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED
+
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED    = 5, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED     = 6, //!< Asynchronous callback, always has only EXIT phase of some API,
+                                                      //!< where completed operations are collected and reported
+
+  PTI_CB_DOMAIN_DRIVER_HOST_SYNCHRONIZATION        = 7, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_API                         = 1023, //!< Not implemented yet,
+                                                        //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+                                                        //!< Callback created for all Driver APIs
+  // below domains to inform user about PTI internal events
+  PTI_CB_DOMAIN_INTERNAL_THREADS                = 1024, //!< Not implemented yet
+  PTI_CB_DOMAIN_INTERNAL_EVENT                  = 1025, //!< Not implemented yet
+
+  PTI_CB_DOMAIN_MAX                             = 0x7fffffff
+} pti_callback_domain;
+
+typedef enum _pti_callback_phase {
+  PTI_CB_PHASE_INVALID                 = 0,
+  PTI_CB_PHASE_API_ENTER               = 1,
+  PTI_CB_PHASE_API_EXIT                = 2,
+  PTI_CB_PHASE_INTERNAL_THREAD_START   = 3,
+  PTI_CB_PHASE_INTERNAL_THREAD_END     = 4,
+  PTI_CB_PHASE_INTERNAL_EVENT          = 5,
+
+  PTI_CB_PHASE_MAX                     = 0x7fffffff
+} pti_callback_phase;
+
+typedef enum _pti_backend_command_list_type {
+  PTI_BACKEND_COMMAND_LIST_TYPE_UNKNOWN   = (1<<0),
+  PTI_BACKEND_COMMAND_LIST_TYPE_IMMEDIATE = (1<<1),
+  PTI_BACKEND_COMMAND_LIST_TYPE_MUTABLE   = (1<<2),
+
+  PTI_BACKEND_COMMAND_LIST_TYPE_MAX       = 0x7fffffff
+} pti_backend_command_list_type;
+
+/**
+ * A user can subscribe to notifications about non-standard situations from PTI
+ * when it collects or processes the data
+ */
+typedef enum _pti_internal_event_type {
+  PTI_INTERNAL_EVENT_TYPE_INFO       = 0,
+  PTI_INTERNAL_EVENT_TYPE_WARNING    = 1, // one or a few records data inconsistencies, or other
+                                          // collection is safe to continue
+  PTI_INTERNAL_EVENT_TYPE_CRITICAL   = 2, // critical error after which further collected data are invalid
+
+  PTI_INTERNAL_EVENT_TYPE_MAX        = 0x7fffffff
+} pti_internal_event_type;
+
+typedef enum _pti_gpu_operation_kind {
+  PTI_GPU_OPERATION_KIND_INVALID         = 0,
+  PTI_GPU_OPERATION_KIND_KERNEL          = 1,
+  PTI_GPU_OPERATION_KIND_MEMORY          = 2,
+  PTI_GPU_OPERATION_KIND_OTHER           = 3,
+
+  PTI_GPU_OPERATION_KIND_MAX             = 0x7fffffff
+} pti_gpu_operation_kind;
+
+typedef struct _pti_gpu_op_details {
+  pti_gpu_operation_kind             _operation_kind; //<! Kind of the operation: kernel, mem op
+  uint64_t                           _operation_id;   //<! GPU kernel or memory operation instance ID,
+                                                      //<! uniquely throughout the process
+  uint64_t                           _kernel_handle;  //!< a handle uniquely identifying kernel object as
+                                                      //!< contained in the module at the specific offset
+                                                      //!< it will be zero in case of not implemented yet or
+                                                      //!< for memory operations
+  const char*                        _name;           //!< symbolic name of a kernel or memcpy operation
+} pti_gpu_op_details;
+
+typedef struct _pti_callback_gpu_op_data {
+  pti_callback_domain             _domain;              //!< domain of the callback
+  pti_backend_command_list_type   _cmd_list_properties; //!< immediate, mutable,..
+  pti_backend_command_list_t      _cmd_list_handle;     //!< Device back-end command list handle,
+                                                        //!< could be nullptr if unknown or
+                                                        //!< when several operations with different command lists
+                                                        //!< reported together
+  pti_backend_queue_t             _queue_handle;        //!< Device back-end queue handle,
+                                                        //!< could be nullptr if unknown
+                                                        //!< when several operations with different command lists
+                                                        //!< reported together
+  pti_device_handle_t             _device_handle;       //!< Device handle
+  pti_callback_phase              _phase;               //!< PTI_CB_PHASE_API_ENTER/EXIT
+  uint32_t                        _return_code;         //!< will be valid only for L0 API EXIT, for others will be zero
+  uint32_t                  _correlation_id;    //!< ID that corresponds to the same call reported by View API records
+  uint32_t                  _operation_count;   //!< number of operations appended or dispatched to the GPU
+  pti_gpu_op_details*       _operation_details; //!< pointer to details of operation(s) appended, dispatched or completed
+} pti_callback_gpu_op_data;
+
+typedef struct _pti_internal_callback_data {
+  pti_callback_domain  _domain;       //!< domain of the callback
+  pti_callback_phase   _phase;        //!< THREAD START/END or INTERNAL EVENT
+  uint32_t             _detail;       //!< depending on the domain should be casted/interpreted
+                                      //!< as a purpose of an internal PTI thread or
+                                      //!< pti_internal_event_type
+  const char*          _message;      //!< explains details
+} pti_internal_callback_data;
+
+typedef void (*pti_callback_function)(
+  pti_callback_domain  domain,
+  pti_api_group_id     driver_api_group_id, //!< driver API group ID, keep it to distinguish between L0 and OpenCL
+                                            //!< although the current implementation is only for L0
+  uint32_t             driver_api_id,
+  pti_backend_ctx_t    backend_context,     //!< Driver (L0) level context handle
+  void*                cb_data, //!< depending on the domain, it should be type-casted to the pointer
+                                //!< to either pti_callback_gpu_op_data, pti_internal_callback_data,
+                                //!< or to other types to be defined
+  void*                global_user_data,    //!< Any global data defined by user returned
+                                            //!< to every callback from a same subscriber
+  void**         instance_user_data);       //!< Data that could be passed between ENTER and EXIT
+                                            //!< phases of one API call
+
+/**
+ * Callback API functions
+ * None of the PTI API functions should be called from within a Callback function.
+ * Exceptions are helper functions that return character representations of enums.
+ */
+
+/**
+ * @brief Initialize Callback subscriber
+ *
+ * @param subscriber - subscriber handle
+ * @param callback   - pointer to the callback function
+ * @param user_data  - user data to be passed to the callback function
+ * @return pti_result
+ */
+pti_result PTI_EXPORT
+ptiCallbackSubscribe(pti_callback_subscriber_handle* subscriber,
+                     pti_callback_function    callback,
+                     void* user_data);
+
+/**
+ * @brief Unsubscribe Callback subscriber. This unsubscribes from all domains, disables the callback,
+ *        cleans up all resources related to the subscriber handle, and invalidates the handle.
+ */
+pti_result PTI_EXPORT
+ptiCallbackUnsubscribe(pti_callback_subscriber_handle subscriber);
+
+/**
+ * @brief Enables callbacks on specific domain
+ *
+ * @param subscriber - subscriber handle
+ * @param domain     - domain to enable
+ * @param enter_cb   - indicate if callback called on enter/start: 0-no, 1-yes; used only for domains with 2 phases
+ * @param exit_cb    - indicates if callback is called on exit/end: 0-no, 1-yes; used only for domains with 2 phases
+ * @return pti_result
+ */
+pti_result PTI_EXPORT
+ptiCallbackEnableDomain(pti_callback_subscriber_handle subscriber,
+                        pti_callback_domain  domain,
+                        uint32_t enter_cb,
+                        uint32_t exit_cb);
+
+/**
+ * @brief Disables callbacks for specific domain
+ */
+pti_result PTI_EXPORT
+ptiCallbackDisableDomain(pti_callback_subscriber_handle subscriber,
+                         pti_callback_domain  domain);
+
+/**
+ * @brief Disables the callback of the subscriber for all domains
+ */
+pti_result PTI_EXPORT
+ptiCallbackDisableAllDomains(pti_callback_subscriber_handle subscriber);
+
+/**
+ * @brief Helper function to return stringified enum members for pti_callback_domain
+ *
+ * @return const char*
+ */
+PTI_EXPORT const char* ptiCallbackDomainTypeToString(pti_callback_domain domain);
+
+/**
+ * @brief Helper function to return stringified enum members for pti_callback_phase
+ *
+ * @return const char*
+ */
+PTI_EXPORT const char* ptiCallbackPhaseTypeToString(pti_callback_phase phase);
+
+#if defined(__cplusplus)
+}
+#endif
+#endif  // PTI_CALLBACK_H_
diff --git a/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h b/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h
index 500a58e87d..07a6d8b2a8 100644
--- a/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h
+++ b/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h
@@ -10,10 +10,10 @@
 // ========= This file is autogenerated - do not modify ========
 // ========= Api file version used for generation: =============
 //    ApiFile: ze_api.h
-// ApiVersion: * @version v1.12-r1.12.15
+// ApiVersion: * @version v1.13-r1.13.1
 
 //             https://github.com/oneapi-src/level-zero.git
-//             commit: d7a44e0303722e754e711227e0334aae3fa52f9d - v1.20.2
+//             commit: ff8c99d4abda00fba6d92548a9cb2f721764d9d0 - v1.24.2
 
 
 typedef enum _pti_api_id_driver_levelzero {
@@ -212,6 +212,17 @@ typedef enum _pti_api_id_driver_levelzero {
     zeFabricVertexGetDeviceExp_id=192,
     zelTracerSetEnabled_id=193,
     zelTracerCreate_id=194,
+    zeRTASBuilderCreateExt_id=195,
+    zeRTASBuilderGetBuildPropertiesExt_id=196,
+    zeRTASBuilderBuildExt_id=197,
+    zeRTASBuilderCommandListAppendCopyExt_id=198,
+    zeRTASBuilderDestroyExt_id=199,
+    zeRTASParallelOperationCreateExt_id=200,
+    zeRTASParallelOperationGetPropertiesExt_id=201,
+    zeRTASParallelOperationJoinExt_id=202,
+    zeRTASParallelOperationDestroyExt_id=203,
+    zeDriverRTASFormatCompatibilityCheckExt_id=204,
+    zeDeviceGetVectorWidthPropertiesExt_id=205,
  } pti_api_id_driver_levelzero;
 
 #endif
diff --git a/third_party/intel/backend/proton/include/pti/pti_metrics.h b/third_party/intel/backend/proton/include/pti/pti_metrics.h
index d4dbf41de9..df9efdc6c6 100644
--- a/third_party/intel/backend/proton/include/pti/pti_metrics.h
+++ b/third_party/intel/backend/proton/include/pti/pti_metrics.h
@@ -20,8 +20,6 @@ extern "C" {
                                  DEVICE
 *****************************************************************************/
 
-typedef void* pti_device_handle_t;  //!< Abstraction of device within PTI
-
 typedef struct _pti_pci_properties_t {
   uint8_t _domain;
   uint8_t _bus;
diff --git a/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h b/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h
index a07ed127b3..579f5328f6 100644
--- a/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h
+++ b/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h
@@ -10,7 +10,7 @@
 // ========= This file is autogenerated - do not modify ========
 // ========= Api file version used for generation: =============
 //    ApiFile: ur_api.h
-// ApiVersion:* @version v0.10-r0
+// ApiVersion:* @version v0.12-r0
 //             https://github.com/oneapi-src/unified-runtime.git
 //             
 
@@ -213,7 +213,52 @@ typedef enum _pti_api_id_runtime_sycl {
     urEnqueueNativeCommandExp_id=228,
     urLoaderConfigSetMockingEnabled_id=229,
     urBindlessImagesReleaseExternalMemoryExp_id=230,
-    urBindlessImagesMapExternalLinearMemoryExp_id=231,
+    urCommandBufferAppendUSMMemcpyExp_v2_id=231,
+    urCommandBufferAppendUSMFillExp_v2_id=232,
+    urCommandBufferAppendMemBufferCopyExp_v2_id=233,
+    urCommandBufferAppendMemBufferWriteExp_v2_id=234,
+    urCommandBufferAppendMemBufferReadExp_v2_id=235,
+    urCommandBufferAppendMemBufferCopyRectExp_v2_id=236,
+    urCommandBufferAppendMemBufferWriteRectExp_v2_id=237,
+    urCommandBufferAppendMemBufferReadRectExp_v2_id=238,
+    urCommandBufferAppendMemBufferFillExp_v2_id=239,
+    urCommandBufferAppendUSMPrefetchExp_v2_id=240,
+    urCommandBufferAppendUSMAdviseExp_v2_id=241,
+    urEnqueueCommandBufferExp_id=242,
+    urCommandBufferUpdateSignalEventExp_id=243,
+    urCommandBufferUpdateWaitEventsExp_id=244,
+    urBindlessImagesMapExternalLinearMemoryExp_id=245,
+    urEnqueueEventsWaitWithBarrierExt_id=246,
+    urTensorMapEncodeIm2ColExp_id=247,
+    urTensorMapEncodeTiledExp_id=248,
+    urPhysicalMemGetInfo_id=249,
+    urEnqueueUSMDeviceAllocExp_id=250,
+    urEnqueueUSMSharedAllocExp_id=251,
+    urEnqueueUSMHostAllocExp_id=252,
+    urEnqueueUSMFreeExp_id=253,
+    urUSMPoolCreateExp_id=254,
+    urUSMPoolDestroyExp_id=255,
+    urUSMPoolSetThresholdExp_id=256,
+    urUSMPoolGetDefaultDevicePoolExp_id=257,
+    urUSMPoolSetDevicePoolExp_id=259,
+    urUSMPoolGetDevicePoolExp_id=260,
+    urUSMPoolTrimToExp_id=261,
+    urUSMPoolGetInfoExp_id=262,
+    urCommandBufferAppendNativeCommandExp_id=263,
+    urCommandBufferGetNativeHandleExp_id=264,
+    urUSMPoolSetInfoExp_id=265,
+    urAdapterSetLoggerCallback_id=266,
+    urAdapterSetLoggerCallbackLevel_id=267,
+    urBindlessImagesGetImageUnsampledHandleSupportExp_id=268,
+    urBindlessImagesGetImageSampledHandleSupportExp_id=269,
+    urBindlessImagesGetImageMemoryHandleTypeSupportExp_id=270,
+    urBindlessImagesFreeMappedLinearMemoryExp_id=271,
+    urKernelSuggestMaxCooperativeGroupCount_id=272,
+    urUSMContextMemcpyExp_id=273,
+    urMemoryExportAllocExportableMemoryExp_id=285,
+    urMemoryExportFreeExportableMemoryExp_id=286,
+    urMemoryExportExportMemoryHandleExp_id=287,
+    urBindlessImagesSupportsImportingHandleTypeExp_id=288,
  } pti_api_id_runtime_sycl;
 
 #endif
diff --git a/third_party/intel/backend/proton/include/pti/pti_version.h b/third_party/intel/backend/proton/include/pti/pti_version.h
index 114e5ffb6c..583d2fe6c4 100644
--- a/third_party/intel/backend/proton/include/pti/pti_version.h
+++ b/third_party/intel/backend/proton/include/pti/pti_version.h
@@ -16,13 +16,13 @@ extern "C" {
 #endif
 
 #if !defined(PTI_VERSION)
-#define PTI_VERSION 0.13.1
+#define PTI_VERSION 0.14.0
 #endif
 
-#define PTI_VERSION_STRING "0.13.1"
+#define PTI_VERSION_STRING "0.14.0"
 #define PTI_VERSION_MAJOR 0
-#define PTI_VERSION_MINOR 13
-#define PTI_VERSION_PATCH 1
+#define PTI_VERSION_MINOR 14
+#define PTI_VERSION_PATCH 0
 
 typedef struct pti_version {
   uint32_t _major;
diff --git a/third_party/intel/backend/proton/include/pti/pti_view.h b/third_party/intel/backend/proton/include/pti/pti_view.h
index df4ef6d65d..9021125b94 100644
--- a/third_party/intel/backend/proton/include/pti/pti_view.h
+++ b/third_party/intel/backend/proton/include/pti/pti_view.h
@@ -23,7 +23,7 @@ extern "C" {
  * @brief const defines.
  */
 #define PTI_MAX_PCI_ADDRESS_SIZE 16                         //!< Size of pci address array.
-#define PTI_INVALID_QUEUE_ID 0xFFFFFFFFFFFFFFFF-1           //!< For oneAPI versions earlier than 2024.1.1 -- UINT64_MAX-1
+#define PTI_INVALID_QUEUE_ID 0xFFFFFFFFFFFFFFFF-1           //!< Indicates a missing sycl queue id. UINT64_MAX-1
 
 /**
  * @brief Kinds of software and hardware operations to be tracked and viewed,
@@ -149,12 +149,6 @@ typedef enum _pti_api_group_id {
                                                            //!< -- you will get all classes *now* and in the *future*!
  } pti_api_class;
 
-typedef void* pti_backend_queue_t; //!< Backend queue handle
-
-typedef void* pti_backend_ctx_t; //!< Backend context handle
-
-typedef void* pti_backend_evt_t; //!< Backend event handle
-
 /**
  * @brief Base View record type
  */
diff --git a/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h b/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h
index f28cb9fed7..f9b8fbe4c4 100644
--- a/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h
+++ b/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h
@@ -1,6 +1,7 @@
 #ifndef PROTON_DRIVER_GPU_XPUPTI_H_
 #define PROTON_DRIVER_GPU_XPUPTI_H_
 
+#include <pti/pti_callback.h>
 #include <pti/pti_view.h>
 
 namespace proton {
@@ -15,6 +16,22 @@ template <bool CheckSuccess> pti_result viewDisable(pti_view_kind kind);
 
 template <bool CheckSuccess> pti_result viewFlushAll();
 
+template <bool CheckSuccess>
+pti_result subscribe(pti_callback_subscriber_handle *subscriber,
+                     pti_callback_function callback, void *user_data);
+
+template <bool CheckSuccess>
+pti_result unsubscribe(pti_callback_subscriber_handle subscriber);
+
+template <bool CheckSuccess>
+pti_result enableDomain(pti_callback_subscriber_handle subscriber,
+                        pti_callback_domain domain, uint32_t enter_cb,
+                        uint32_t exit_cb);
+
+template <bool CheckSuccess>
+pti_result disableDomain(pti_callback_subscriber_handle subscriber,
+                         pti_callback_domain domain);
+
 template <bool CheckSuccess>
 pti_result viewGetNextRecord(uint8_t *buffer, size_t valid_bytes,
                              pti_view_record_base **record);
diff --git a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
index 726199781d..bd180533aa 100644
--- a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
+++ b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
@@ -24,6 +24,19 @@ DEFINE_DISPATCH(ExternLibXpupti, viewDisable, ptiViewDisable, pti_view_kind)
 
 DEFINE_DISPATCH(ExternLibXpupti, viewFlushAll, ptiFlushAllViews)
 
+DEFINE_DISPATCH(ExternLibXpupti, subscribe, ptiCallbackSubscribe,
+                pti_callback_subscriber_handle *, pti_callback_function, void *)
+
+DEFINE_DISPATCH(ExternLibXpupti, unsubscribe, ptiCallbackUnsubscribe,
+                pti_callback_subscriber_handle);
+
+DEFINE_DISPATCH(ExternLibXpupti, enableDomain, ptiCallbackEnableDomain,
+                pti_callback_subscriber_handle, pti_callback_domain, uint32_t,
+                uint32_t);
+
+DEFINE_DISPATCH(ExternLibXpupti, disableDomain, ptiCallbackDisableDomain,
+                pti_callback_subscriber_handle, pti_callback_domain);
+
 DEFINE_DISPATCH(ExternLibXpupti, viewGetNextRecord, ptiViewGetNextRecord,
                 uint8_t *, size_t, pti_view_record_base **)
 
diff --git a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
index 48ba83f564..749794d3d6 100644
--- a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
+++ b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
@@ -264,14 +264,19 @@ struct XpuptiProfiler::XpuptiProfilerPimpl
 
   static void allocBuffer(uint8_t **buffer, size_t *bufferSize);
   static void completeBuffer(uint8_t *buffer, size_t size, size_t validSize);
-  /*
-  static void callbackFn(void *userData, CUpti_CallbackDomain domain,
-                         CUpti_CallbackId cbId, const void *cbData);
-  */
+  // static void callbackFn(void *userData, CUpti_CallbackDomain domain,
+  //                        CUpti_CallbackId cbId, const void *cbData);
+  static void callbackFn(pti_callback_domain domain,
+                         pti_api_group_id driver_api_group_id,
+                         uint32_t driver_api_id,
+                         pti_backend_ctx_t backend_context, void *cb_data,
+                         void *global_user_data, void **instance_user_data);
 
   static constexpr size_t AlignSize = 8;
   static constexpr size_t BufferSize = 64 * 1024 * 1024;
 
+  pti_callback_subscriber_handle subscriber;
+
   /*
   static constexpr size_t AttributeSize = sizeof(size_t);
 
@@ -327,6 +332,79 @@ void XpuptiProfiler::XpuptiProfilerPimpl::completeBuffer(uint8_t *buffer,
   profiler.correlation.complete(maxCorrelationId);
 }
 
+void XpuptiProfiler::XpuptiProfilerPimpl::callbackFn(
+    pti_callback_domain domain, pti_api_group_id driver_api_group_id,
+    uint32_t driver_api_id, pti_backend_ctx_t backend_context, void *cb_data,
+    void *global_user_data, void **instance_user_data) {
+  std::cout << "callback\n" << std::flush;
+  pti_callback_gpu_op_data *callback_data =
+      static_cast<pti_callback_gpu_op_data *>(cb_data);
+  if (callback_data == nullptr) {
+    std::cerr << "CallbackGPUOperationAppend: callback_data is null"
+              << std::endl;
+    return;
+  }
+  if (callback_data->_phase == PTI_CB_PHASE_API_ENTER) {
+    threadState.enterOp();
+    threadState.profiler.correlation.correlate(callback_data->_correlation_id,
+                                               1);
+  } else if (callback_data->_phase == PTI_CB_PHASE_API_EXIT) {
+    threadState.exitOp();
+    threadState.profiler.correlation.submit(callback_data->_correlation_id);
+  } else {
+    throw std::runtime_error("[PROTON] callbackFn failed");
+  }
+
+  /*                        switch (domain) {
+    case PTI_DOMAIN_DRIVER_GPU_OPERATION_APPEND:
+      CallbackGPUOperationAppend(domain, driver_api_group_id, driver_api_id,
+                                backend_context, cb_data, user_data);
+      break;
+    case PTI_DOMAIN_GPU_OPERATION_COMPLETED:
+      CallbackGPUOperationCompletion(domain, driver_api_group_id, driver_api_id,
+                                   backend_context, cb_data, user_data);
+      break;
+    default: {
+      std::cout << "In " << __func__ << ", domain: " << domain
+                << ", driver_group_id: " << driver_api_group_id
+                << ", driver_api_id: " << driver_api_id << std::endl;
+
+      const char* name_ptr = nullptr;
+
+      if (PTI_SUCCESS == ptiViewGetApiIdName(driver_api_group_id, driver_api_id,
+  &name_ptr) ) { std::cout << ", API name: " << name_ptr << std::endl; } else {
+        std::cout << ", Unknown API name" << std::endl;
+      }
+      break;
+    }
+  }
+  std::cout << std::endl;
+  */
+}
+
+void CallbackCommon(pti_callback_domain domain,
+                    pti_api_group_id driver_group_id, uint32_t driver_api_id,
+                    [[maybe_unused]] pti_backend_ctx_t backend_context,
+                    [[maybe_unused]] void *cb_data,
+                    [[maybe_unused]] void *user_data) {
+
+  switch (domain) {
+  case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED:
+    std::cout << "PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED\n" << std::flush;
+    break;
+  case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED:
+    std::cout << "PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED\n" << std::flush;
+    break;
+  default: {
+    std::cout << "In " << __func__ << ", domain: " << domain
+              << ", driver_group_id: " << driver_group_id
+              << ", driver_api_id: " << driver_api_id << std::endl;
+    break;
+  }
+  }
+  std::cout << std::endl;
+}
+
 zel_tracer_handle_t tracer = nullptr;
 
 typedef void (*EnumDeviceUUIDsFunc)(std::vector<std::array<uint8_t, 16>>);
@@ -380,7 +458,7 @@ int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) {
 }
 
 void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
-  // xpupti::subscribe<true>(&subscriber, callbackFn, nullptr);
+  std::cout << "doStart\n" << std::flush;
   // should be call to shared lib
   XpuptiProfiler &profiler = threadState.profiler;
   if (profiler.utils_cache_path != "") {
@@ -389,13 +467,13 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
   // auto res = ptiViewPushExternalCorrelationId(
   //     pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_1, 42);
   //  std::cout << "res: " << res << "\n" << std::flush;
-
+  /*
   ze_result_t status = ZE_RESULT_SUCCESS;
   // status = zeInit(ZE_INIT_FLAG_GPU_ONLY);
   // assert(status == ZE_RESULT_SUCCESS);
 
   zel_tracer_desc_t tracer_desc = {ZEL_STRUCTURE_TYPE_TRACER_DESC, nullptr,
-                                   nullptr /* global user data */};
+                                   nullptr};
 
   status = zelTracerCreate(&tracer_desc, &tracer);
   std::cout << "zelTracerCreate: " << status << "\n" << std::flush;
@@ -417,9 +495,14 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
 
   status = zelTracerSetEnabled(tracer, true);
   assert(status == ZE_RESULT_SUCCESS);
+  */
 
   xpupti::viewSetCallbacks<true>(allocBuffer, completeBuffer);
   xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_KERNEL);
+  xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
+  xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
+  std::cout << "doStart2\n" << std::flush;
+  xpupti::subscribe<true>(&subscriber, callbackFn, &subscriber);
   // xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
   // xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
   // xpupti::viewEnable<true>(PTI_VIEW_SYCL_RUNTIME_CALLS);
@@ -428,6 +511,10 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
   // xpupti::viewEnable<true>(PTI_VIEW_LEVEL_ZERO_CALLS);
   // setGraphCallbacks(subscriber, /*enable=*/true);
   // setRuntimeCallbacks(subscriber, /*enable=*/true);
+  std::cout << "doStart::enableDomain\n" << std::flush;
+  xpupti::enableDomain<true>(subscriber,
+                             PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1);
+  std::cout << "doStart::enableDomain after\n" << std::flush;
   // setDriverCallbacks(subscriber, /*enable=*/true);
 }
 
@@ -444,13 +531,17 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doFlush() {
 }
 
 void XpuptiProfiler::XpuptiProfilerPimpl::doStop() {
+  /*
   ze_result_t status = ZE_RESULT_SUCCESS;
   status = zelTracerSetEnabled(tracer, false);
   assert(status == ZE_RESULT_SUCCESS);
   status = zelTracerDestroy(tracer);
   assert(status == ZE_RESULT_SUCCESS);
+  */
 
   xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_KERNEL);
+  xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
+  xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
   // xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
   // xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
   // xpupti::viewDisable<true>(PTI_VIEW_SYCL_RUNTIME_CALLS);
@@ -460,7 +551,11 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStop() {
   // setGraphCallbacks(subscriber, /*enable=*/false);
   // setRuntimeCallbacks(subscriber, /*enable=*/false);
   // setDriverCallbacks(subscriber, /*enable=*/false);
-  // cupti::unsubscribe<true>(subscriber);
+  std::cout << "doStop::disableDomain\n" << std::flush;
+  xpupti::disableDomain<true>(subscriber,
+                              PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED);
+  std::cout << "doStop::disableDomain after\n" << std::flush;
+  xpupti::unsubscribe<true>(subscriber);
   // cupti::finalize<true>();
 }
 
diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py
index f85a4f77d2..15a97dc578 100644
--- a/third_party/proton/test/test_profile.py
+++ b/third_party/proton/test/test_profile.py
@@ -258,8 +258,6 @@ def foo(x, size: tl.constexpr, y):
 
 
 def test_hook_with_third_party(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
     third_party_hook_invoked = False
 
     def third_party_hook(metadata) -> None:
@@ -280,7 +278,7 @@ def foo(x, size: tl.constexpr, y):
         offs = tl.arange(0, size)
         tl.store(y + offs, tl.load(x + offs))
 
-    x = torch.tensor([2], device="cuda", dtype=torch.float32)
+    x = torch.tensor([2], device="xpu", dtype=torch.float32)
     y = torch.zeros_like(x)
     temp_file = tmp_path / "test_hook_with_third_party.hatchet"
     proton.start(str(temp_file.with_suffix("")), hook="triton")
@@ -295,8 +293,6 @@ def foo(x, size: tl.constexpr, y):
 
 
 def test_hook_multiple_threads(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
 
     def metadata_fn_foo(grid: tuple, metadata: NamedTuple, args: dict):
         return {"name": "foo_test"}
@@ -314,9 +310,9 @@ def bar(x, size: tl.constexpr, y):
         offs = tl.arange(0, size)
         tl.store(y + offs, tl.load(x + offs))
 
-    x_foo = torch.tensor([2], device="cuda", dtype=torch.float32)
+    x_foo = torch.tensor([2], device="xpu", dtype=torch.float32)
     y_foo = torch.zeros_like(x_foo)
-    x_bar = torch.tensor([2], device="cuda", dtype=torch.float32)
+    x_bar = torch.tensor([2], device="xpu", dtype=torch.float32)
     y_bar = torch.zeros_like(x_bar)
 
     temp_file = tmp_path / "test_hook.hatchet"
@@ -410,10 +406,6 @@ def test_deactivate(tmp_path: pathlib.Path):
 
 
 def test_multiple_sessions(tmp_path: pathlib.Path):
-    if is_xpu():
-        # FIXME: the same correlation id, that's why it's filtered,
-        # should `_kernel_id` be used instead
-        pytest.xfail('assert int(data[0]["children"][0]["metrics"]["count"]) == 2')
     temp_file0 = tmp_path / "test_multiple_sessions0.hatchet"
     temp_file1 = tmp_path / "test_multiple_sessions1.hatchet"
     session_id0 = proton.start(str(temp_file0.with_suffix("")))
@@ -439,8 +431,6 @@ def test_multiple_sessions(tmp_path: pathlib.Path):
 
 
 def test_trace(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
     temp_file = tmp_path / "test_trace.chrome_trace"
     proton.start(str(temp_file.with_suffix("")), data="trace")
 
@@ -450,7 +440,7 @@ def foo(x, y, size: tl.constexpr):
         tl.store(y + offs, tl.load(x + offs))
 
     with proton.scope("init"):
-        x = torch.ones((1024, ), device="cuda", dtype=torch.float32)
+        x = torch.ones((1024, ), device="xpu", dtype=torch.float32)
         y = torch.zeros_like(x)
 
     with proton.scope("test"):
@@ -467,8 +457,6 @@ def foo(x, y, size: tl.constexpr):
 
 
 def test_scope_multiple_threads(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
     temp_file = tmp_path / "test_scope_threads.hatchet"
     proton.start(str(temp_file.with_suffix("")))
 
@@ -479,7 +467,7 @@ def worker(prefix: str):
         for i in range(N):
             name = f"{prefix}_{i}"
             proton.enter_scope(name)
-            torch.ones((1, ), device="cuda")
+            torch.ones((1, ), device="xpu")
             proton.exit_scope()
 
     threads = [threading.Thread(target=worker, args=(tname, )) for tname in thread_names]

From 1642cd3b22078190fc995fbd7b1e279e6d2a1f23 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Fri, 10 Oct 2025 15:07:48 +0000
Subject: [PATCH 2/9] fix UTs

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .github/workflows/build-test-reusable.yml | 12 ++++++++++++
 .github/workflows/triton-benchmarks.yml   | 20 ++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 722963a9a1..62003d9f86 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -123,6 +123,17 @@ jobs:
         run: |
           echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV
 
+          git clone https://github.com/intel/pti-gpu.git
+          cd pti-gpu
+          git checkout 15a201d25e5659692613b98ee33513263b689101
+          cd sdk
+          cmake --preset linux-icpx-release
+          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
+
+          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+          ls $PTI_LIBS_DIR
+          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
+
       - name: Build Triton
         uses: ./.github/actions/setup-triton
         with:
@@ -288,6 +299,7 @@ jobs:
       - name: Run Proton tests
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd third_party/proton/test
           # FIXME: enable 'test_record.py' back
           pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
index 904d8aef18..a101677bf8 100644
--- a/.github/workflows/triton-benchmarks.yml
+++ b/.github/workflows/triton-benchmarks.yml
@@ -144,6 +144,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
@@ -157,6 +158,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark - with tensor of pointer
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -169,6 +171,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark - with tensor descriptor
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -181,6 +184,7 @@ jobs:
       - name: Run Triton GEMM (A@B^t) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
@@ -192,6 +196,7 @@ jobs:
       - name: Run Triton GEMM (A^t@B) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
@@ -203,6 +208,7 @@ jobs:
       - name: Run Triton GEMM (stream-k) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -212,6 +218,7 @@ jobs:
       - name: Run Triton GEMM (split-k) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -221,6 +228,7 @@ jobs:
       - name: Run Triton GEMM + PreOp (exp) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -229,6 +237,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -237,6 +246,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -246,6 +256,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -255,6 +266,7 @@ jobs:
       - name: Run Triton FA fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -265,6 +277,7 @@ jobs:
       - name: Run Triton FA bwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           FA_KERNEL_MODE="bwd" \
             python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
@@ -277,6 +290,7 @@ jobs:
       - name: Run Triton FA fwd kernel benchmark - with tensor descriptors
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
@@ -288,6 +302,7 @@ jobs:
       - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -298,6 +313,7 @@ jobs:
       - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -308,6 +324,7 @@ jobs:
       - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -318,6 +335,7 @@ jobs:
       - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -331,6 +349,7 @@ jobs:
       - name: Run Prefix Sums kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -339,6 +358,7 @@ jobs:
       - name: Run micro benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/micro_benchmarks
           python run_benchmarks.py --reports $REPORTS
 

From 144ab7725d933f85653dafb0a9639a79242dcc7d Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 13 Oct 2025 08:39:24 +0000
Subject: [PATCH 3/9] add TRITON_XPUPTI_LIB_PATH

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .github/workflows/build-test-reusable.yml            | 1 +
 third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 62003d9f86..336d4cf42c 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -300,6 +300,7 @@ jobs:
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
           export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
+          export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR
           cd third_party/proton/test
           # FIXME: enable 'test_record.py' back
           pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
diff --git a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
index bd180533aa..c618661372 100644
--- a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
+++ b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
@@ -10,6 +10,7 @@ struct ExternLibXpupti : public ExternLibBase {
   using RetType = pti_result;
   static constexpr const char *name = "libpti_view.so";
   static constexpr const char *defaultDir = "";
+  static constexpr const char *pathEnv = "TRITON_XPUPTI_LIB_PATH";
   static constexpr RetType success = PTI_SUCCESS;
   static void *lib;
 };

From 17de806175d352f156f891ac7241917e87e4db46 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 13 Oct 2025 09:18:18 +0000
Subject: [PATCH 4/9] another commit

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .github/workflows/build-test-reusable.yml | 2 +-
 .github/workflows/triton-benchmarks.yml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 336d4cf42c..4c5893220a 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -125,7 +125,7 @@ jobs:
 
           git clone https://github.com/intel/pti-gpu.git
           cd pti-gpu
-          git checkout 15a201d25e5659692613b98ee33513263b689101
+          git checkout 998e192cb39994a3e4790a86250ea0c480ae416c
           cd sdk
           cmake --preset linux-icpx-release
           BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
index a101677bf8..be054d2f9c 100644
--- a/.github/workflows/triton-benchmarks.yml
+++ b/.github/workflows/triton-benchmarks.yml
@@ -121,7 +121,7 @@ jobs:
         run: |
           git clone https://github.com/intel/pti-gpu.git
           cd pti-gpu
-          git checkout 15a201d25e5659692613b98ee33513263b689101
+          git checkout 998e192cb39994a3e4790a86250ea0c480ae416c
           cd sdk
           cmake --preset linux-icpx-release
           BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release

From cee025e63e4189182c5dc679e03e6996d060553e Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 13 Oct 2025 09:55:58 +0000
Subject: [PATCH 5/9] Revert "another commit"

This reverts commit 17de806175d352f156f891ac7241917e87e4db46.
---
 .github/workflows/build-test-reusable.yml | 2 +-
 .github/workflows/triton-benchmarks.yml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 4c5893220a..336d4cf42c 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -125,7 +125,7 @@ jobs:
 
           git clone https://github.com/intel/pti-gpu.git
           cd pti-gpu
-          git checkout 998e192cb39994a3e4790a86250ea0c480ae416c
+          git checkout 15a201d25e5659692613b98ee33513263b689101
           cd sdk
           cmake --preset linux-icpx-release
           BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
index be054d2f9c..a101677bf8 100644
--- a/.github/workflows/triton-benchmarks.yml
+++ b/.github/workflows/triton-benchmarks.yml
@@ -121,7 +121,7 @@ jobs:
         run: |
           git clone https://github.com/intel/pti-gpu.git
           cd pti-gpu
-          git checkout 998e192cb39994a3e4790a86250ea0c480ae416c
+          git checkout 15a201d25e5659692613b98ee33513263b689101
           cd sdk
           cmake --preset linux-icpx-release
           BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release

From 44e3dd32bec2f3b91e1a85ce5f14bb7985c80d25 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 13 Oct 2025 10:12:44 +0000
Subject: [PATCH 6/9] passing PTI libs dir via 'GITHUB_OUTPUT'

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .github/workflows/build-test-reusable.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 336d4cf42c..823a97a2bc 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -83,6 +83,7 @@ jobs:
         shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
     outputs:
       test-triton-command: ${{ steps.test-triton.outputs.command }}
+      pti_libs_dir: ${{ steps.build-proton.outputs.pti_libs_dir }}
     steps:
       - name: Print inputs
         run: |
@@ -120,6 +121,7 @@ jobs:
 
       - name: Build Proton with XPU support
         if: inputs.driver_version == 'rolling' && inputs.device == 'max1100'
+        id: build-proton
         run: |
           echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV
 
@@ -132,7 +134,7 @@ jobs:
 
           PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
           ls $PTI_LIBS_DIR
-          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
+          echo "pti_libs_dir=$PTI_LIBS_DIR" >> $GITHUB_OUTPUT
 
       - name: Build Triton
         uses: ./.github/actions/setup-triton
@@ -299,6 +301,7 @@ jobs:
       - name: Run Proton tests
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
+          export "PTI_LIBS_DIR=${{ needs.build.outputs.pti_libs_dir }}"
           export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR
           cd third_party/proton/test

From a34823bae1b536b90962d7952a692b3593cddfb6 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 13 Oct 2025 12:29:31 +0000
Subject: [PATCH 7/9] just move PTI build into 'integration-tests' job

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .github/workflows/build-test-reusable.yml | 25 +++++++++--------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 823a97a2bc..35bfb47acb 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -83,7 +83,6 @@ jobs:
         shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
     outputs:
       test-triton-command: ${{ steps.test-triton.outputs.command }}
-      pti_libs_dir: ${{ steps.build-proton.outputs.pti_libs_dir }}
     steps:
       - name: Print inputs
         run: |
@@ -121,21 +120,9 @@ jobs:
 
       - name: Build Proton with XPU support
         if: inputs.driver_version == 'rolling' && inputs.device == 'max1100'
-        id: build-proton
         run: |
           echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV
 
-          git clone https://github.com/intel/pti-gpu.git
-          cd pti-gpu
-          git checkout 15a201d25e5659692613b98ee33513263b689101
-          cd sdk
-          cmake --preset linux-icpx-release
-          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
-
-          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
-          ls $PTI_LIBS_DIR
-          echo "pti_libs_dir=$PTI_LIBS_DIR" >> $GITHUB_OUTPUT
-
       - name: Build Triton
         uses: ./.github/actions/setup-triton
         with:
@@ -298,10 +285,18 @@ jobs:
         run: |
           echo "TRITON_TEST_CMD=${{ needs.build.outputs.test-triton-command }}" | tee -a $GITHUB_ENV
 
-      - name: Run Proton tests
+      - name: Build PTI && Run Proton tests
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
-          export "PTI_LIBS_DIR=${{ needs.build.outputs.pti_libs_dir }}"
+          git clone https://github.com/intel/pti-gpu.git
+          cd pti-gpu
+          git checkout 15a201d25e5659692613b98ee33513263b689101
+          cd sdk
+          cmake --preset linux-icpx-release
+          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
+
+          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+
           export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR
           cd third_party/proton/test

From 9064ee986b063513454a9d554a65ec79ab7e8dfd Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 13 Oct 2025 15:12:13 +0200
Subject: [PATCH 8/9] Apply suggestion from @anmyachev

---
 .github/workflows/build-test-reusable.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 35bfb47acb..b30835c27f 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -296,6 +296,7 @@ jobs:
           BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
 
           PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+          cd ../..
 
           export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR

From fd37e3630b0ec574b1dfb388e5453d4754ab421c Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 13 Oct 2025 14:41:50 +0000
Subject: [PATCH 9/9] cleanup && add pti.txt

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 .github/pins/pti.txt                          |  1 +
 .github/workflows/build-test-reusable.yml     |  3 +-
 .github/workflows/triton-benchmarks.yml       |  4 +--
 python/tutorials/09-persistent-matmul.py      |  6 ++--
 .../lib/Profiler/Xpupti/XpuptiProfiler.cpp    | 34 -------------------
 5 files changed, 8 insertions(+), 40 deletions(-)
 create mode 100644 .github/pins/pti.txt

diff --git a/.github/pins/pti.txt b/.github/pins/pti.txt
new file mode 100644
index 0000000000..ca98925e9d
--- /dev/null
+++ b/.github/pins/pti.txt
@@ -0,0 +1 @@
+15a201d25e5659692613b98ee33513263b689101
diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index b30835c27f..7fe0ddaa6e 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -288,9 +288,10 @@ jobs:
       - name: Build PTI && Run Proton tests
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
+          PTI_COMMIT_ID="$(<.github/pins/pti.txt)"
           git clone https://github.com/intel/pti-gpu.git
           cd pti-gpu
-          git checkout 15a201d25e5659692613b98ee33513263b689101
+          git checkout $PTI_COMMIT_ID
           cd sdk
           cmake --preset linux-icpx-release
           BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
index a101677bf8..f29d6ad4e9 100644
--- a/.github/workflows/triton-benchmarks.yml
+++ b/.github/workflows/triton-benchmarks.yml
@@ -117,11 +117,11 @@ jobs:
           pip install .
 
       - name: Build PTI from source
-        id: build-pti
         run: |
+          PTI_COMMIT_ID="$(<.github/pins/pti.txt)"
           git clone https://github.com/intel/pti-gpu.git
           cd pti-gpu
-          git checkout 15a201d25e5659692613b98ee33513263b689101
+          git checkout $PTI_COMMIT_ID
           cd sdk
           cmake --preset linux-icpx-release
           BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
diff --git a/python/tutorials/09-persistent-matmul.py b/python/tutorials/09-persistent-matmul.py
index 93cf85241f..f354769615 100644
--- a/python/tutorials/09-persistent-matmul.py
+++ b/python/tutorials/09-persistent-matmul.py
@@ -673,7 +673,7 @@ def bench_fn(label, reps, warmup_reps, fn, *args):
     for _ in range(warmup_reps):
         fn(*args)
     #FIXME: Enable for XPU once proton support works.
-    if True or is_cuda():
+    if is_cuda():
         with proton_context():
             for _ in range(reps):
                 fn(*args)
@@ -783,11 +783,11 @@ def show_profile(precision, profile_name):
 
         validate(32, 32, 32, dtype)
         validate(8192, 8192, args.K_range[0], dtype)
-        if True or is_cuda():
+        if is_cuda():
             proton.start("matmul", hook="triton")
             proton.deactivate()
         for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
             bench(K, dtype)
-        if True or is_cuda():
+        if is_cuda():
             proton.finalize()
             show_profile(args.prec, "matmul")
diff --git a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
index 749794d3d6..e8651323dd 100644
--- a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
+++ b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
@@ -264,8 +264,6 @@ struct XpuptiProfiler::XpuptiProfilerPimpl
 
   static void allocBuffer(uint8_t **buffer, size_t *bufferSize);
   static void completeBuffer(uint8_t *buffer, size_t size, size_t validSize);
-  // static void callbackFn(void *userData, CUpti_CallbackDomain domain,
-  //                        CUpti_CallbackId cbId, const void *cbData);
   static void callbackFn(pti_callback_domain domain,
                          pti_api_group_id driver_api_group_id,
                          uint32_t driver_api_id,
@@ -354,32 +352,6 @@ void XpuptiProfiler::XpuptiProfilerPimpl::callbackFn(
   } else {
     throw std::runtime_error("[PROTON] callbackFn failed");
   }
-
-  /*                        switch (domain) {
-    case PTI_DOMAIN_DRIVER_GPU_OPERATION_APPEND:
-      CallbackGPUOperationAppend(domain, driver_api_group_id, driver_api_id,
-                                backend_context, cb_data, user_data);
-      break;
-    case PTI_DOMAIN_GPU_OPERATION_COMPLETED:
-      CallbackGPUOperationCompletion(domain, driver_api_group_id, driver_api_id,
-                                   backend_context, cb_data, user_data);
-      break;
-    default: {
-      std::cout << "In " << __func__ << ", domain: " << domain
-                << ", driver_group_id: " << driver_api_group_id
-                << ", driver_api_id: " << driver_api_id << std::endl;
-
-      const char* name_ptr = nullptr;
-
-      if (PTI_SUCCESS == ptiViewGetApiIdName(driver_api_group_id, driver_api_id,
-  &name_ptr) ) { std::cout << ", API name: " << name_ptr << std::endl; } else {
-        std::cout << ", Unknown API name" << std::endl;
-      }
-      break;
-    }
-  }
-  std::cout << std::endl;
-  */
 }
 
 void CallbackCommon(pti_callback_domain domain,
@@ -458,7 +430,6 @@ int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) {
 }
 
 void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
-  std::cout << "doStart\n" << std::flush;
   // should be call to shared lib
   XpuptiProfiler &profiler = threadState.profiler;
   if (profiler.utils_cache_path != "") {
@@ -501,7 +472,6 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
   xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_KERNEL);
   xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
   xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
-  std::cout << "doStart2\n" << std::flush;
   xpupti::subscribe<true>(&subscriber, callbackFn, &subscriber);
   // xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
   // xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
@@ -511,10 +481,8 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
   // xpupti::viewEnable<true>(PTI_VIEW_LEVEL_ZERO_CALLS);
   // setGraphCallbacks(subscriber, /*enable=*/true);
   // setRuntimeCallbacks(subscriber, /*enable=*/true);
-  std::cout << "doStart::enableDomain\n" << std::flush;
   xpupti::enableDomain<true>(subscriber,
                              PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1);
-  std::cout << "doStart::enableDomain after\n" << std::flush;
   // setDriverCallbacks(subscriber, /*enable=*/true);
 }
 
@@ -551,10 +519,8 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStop() {
   // setGraphCallbacks(subscriber, /*enable=*/false);
   // setRuntimeCallbacks(subscriber, /*enable=*/false);
   // setDriverCallbacks(subscriber, /*enable=*/false);
-  std::cout << "doStop::disableDomain\n" << std::flush;
   xpupti::disableDomain<true>(subscriber,
                               PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED);
-  std::cout << "doStop::disableDomain after\n" << std::flush;
   xpupti::unsubscribe<true>(subscriber);
   // cupti::finalize<true>();
 }