diff --git a/.github/pins/pti.txt b/.github/pins/pti.txt
new file mode 100644
index 0000000000..ca98925e9d
--- /dev/null
+++ b/.github/pins/pti.txt
@@ -0,0 +1 @@
+15a201d25e5659692613b98ee33513263b689101
diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
index 722963a9a1..7fe0ddaa6e 100644
--- a/.github/workflows/build-test-reusable.yml
+++ b/.github/workflows/build-test-reusable.yml
@@ -285,9 +285,22 @@ jobs:
         run: |
           echo "TRITON_TEST_CMD=${{ needs.build.outputs.test-triton-command }}" | tee -a $GITHUB_ENV
 
-      - name: Run Proton tests
+      - name: Build PTI && Run Proton tests
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
+          PTI_COMMIT_ID="$(<.github/pins/pti.txt)"
+          git clone https://github.com/intel/pti-gpu.git
+          cd pti-gpu
+          git checkout $PTI_COMMIT_ID
+          cd sdk
+          cmake --preset linux-icpx-release
+          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
+
+          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+          cd ../..
+
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
+          export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR
           cd third_party/proton/test
           # FIXME: enable 'test_record.py' back
           pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
index 449a13bd46..f29d6ad4e9 100644
--- a/.github/workflows/triton-benchmarks.yml
+++ b/.github/workflows/triton-benchmarks.yml
@@ -116,9 +116,24 @@ jobs:
           cd benchmarks
           pip install .
 
+      - name: Build PTI from source
+        run: |
+          PTI_COMMIT_ID="$(<.github/pins/pti.txt)"
+          git clone https://github.com/intel/pti-gpu.git
+          cd pti-gpu
+          git checkout $PTI_COMMIT_ID
+          cd sdk
+          cmake --preset linux-icpx-release
+          BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release
+
+          PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/"
+          ls $PTI_LIBS_DIR
+          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
+
       - name: Run Triton Softmax kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -129,6 +144,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
@@ -142,6 +158,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark - with tensor of pointer
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -154,6 +171,7 @@ jobs:
       - name: Run Triton GEMM kernel benchmark - with tensor descriptor
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -166,6 +184,7 @@ jobs:
       - name: Run Triton GEMM (A@B^t) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
@@ -177,6 +196,7 @@ jobs:
       - name: Run Triton GEMM (A^t@B) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
@@ -188,6 +208,7 @@ jobs:
       - name: Run Triton GEMM (stream-k) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -197,6 +218,7 @@ jobs:
       - name: Run Triton GEMM (split-k) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -206,6 +228,7 @@ jobs:
       - name: Run Triton GEMM + PreOp (exp) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -214,6 +237,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -222,6 +246,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -231,6 +256,7 @@ jobs:
       - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -240,6 +266,7 @@ jobs:
       - name: Run Triton FA fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -250,6 +277,7 @@ jobs:
       - name: Run Triton FA bwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           FA_KERNEL_MODE="bwd" \
             python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
@@ -262,6 +290,7 @@ jobs:
       - name: Run Triton FA fwd kernel benchmark - with tensor descriptors
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
           mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
@@ -273,6 +302,7 @@ jobs:
       - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -283,6 +313,7 @@ jobs:
       - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -293,6 +324,7 @@ jobs:
       - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -303,6 +335,7 @@ jobs:
       - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
 
@@ -316,6 +349,7 @@ jobs:
       - name: Run Prefix Sums kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/triton_kernels_benchmark
           python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
           source ../../scripts/capture-hw-details.sh
@@ -324,6 +358,7 @@ jobs:
       - name: Run micro benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/micro_benchmarks
           python run_benchmarks.py --reports $REPORTS
 
diff --git a/third_party/intel/backend/proton/include/pti/pti.h b/third_party/intel/backend/proton/include/pti/pti.h
index 3bd6a3d363..512a839154 100644
--- a/third_party/intel/backend/proton/include/pti/pti.h
+++ b/third_party/intel/backend/proton/include/pti/pti.h
@@ -31,7 +31,9 @@ typedef enum {
                                           //!< PTI_VIEW_EXTERNAL_CORRELATION
   PTI_ERROR_BAD_TIMESTAMP = 6,            //!< error in timestamp conversion, might be related with the user
                                           //!< provided TimestampCallback
-  PTI_ERROR_BAD_API_ID = 7,               //!< invalid api_id when enable/disable runtime/driver specific api_id 
+  PTI_ERROR_BAD_API_ID = 7,               //!< invalid api_id when enable/disable runtime/driver specific api_id
+  PTI_ERROR_NO_GPU_VIEWS_ENABLED = 8,     //!< at least one GPU view must be enabled for kernel tracing
+
   PTI_ERROR_DRIVER = 50,                  //!< unknown driver error
   PTI_ERROR_TRACING_NOT_INITIALIZED = 51,  //!< installed driver requires tracing enabling with
                                            //!< setting environment variable ZE_ENABLE_TRACING_LAYER
@@ -57,6 +59,25 @@ typedef enum {
  */
 PTI_EXPORT const char* ptiResultTypeToString(pti_result result_value);
 
+
+/**
+ * @brief Abstraction for backend-specific objects.
+ *
+ * Level Zero is currently the only supported backend. However, these types are intended to serve other backends too.
+ * If another backend becomes supported, the same types will serve it.
+ */
+
+typedef void* pti_device_handle_t;  //!< Device handle
+
+typedef void* pti_backend_ctx_t;    //!< Backend context handle
+
+typedef void* pti_backend_queue_t;  //!< Backend queue handle
+
+typedef void* pti_backend_evt_t;    //!< Backend event handle
+
+typedef void* pti_backend_command_list_t; //!< Backend command list handle
+
+
 #if defined(__cplusplus)
 }
 #endif
diff --git a/third_party/intel/backend/proton/include/pti/pti_callback.h b/third_party/intel/backend/proton/include/pti/pti_callback.h
new file mode 100644
index 0000000000..0659cec7fa
--- /dev/null
+++ b/third_party/intel/backend/proton/include/pti/pti_callback.h
@@ -0,0 +1,234 @@
+//==============================================================
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#ifndef PTI_CALLBACK_H_
+#define PTI_CALLBACK_H_
+
+#include <stdint.h>
+
+#include "pti/pti.h"
+#include "pti/pti_view.h"
+
+/**
+ * This file contains APIs that are so far experimental in PTI.
+ * APIs and data structures in this file are work-in-progress and subject to change!
+ * All content in this file concerns the Callback API.
+ *
+ * The Callback API is useful for many purposes,
+ * including the implementation of `MetricsScope` functionality that needs to subscribe to
+ * domains such as kernel append to a command list, and potentially other domains.
+ * The `MetricsScope` API is under development and is the first (internal) user of the Callback API.
+ */
+
+
+/* clang-format off */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct _pti_callback_subscriber* pti_callback_subscriber_handle;
+
+typedef enum _pti_callback_domain {
+  PTI_CB_DOMAIN_INVALID                            = 0,
+  PTI_CB_DOMAIN_DRIVER_CONTEXT_CREATED             = 1, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_MODULE_LOADED               = 2, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_MODULE_UNLOADED             = 3, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED      = 4, //!< Synchronous callback
+                                                     //!< This also serves as PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED
+                                                     //!< when appended to an Immediate Command List,
+                                                     //!< i.e. there is no separate PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED callback
+
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED    = 5, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED     = 6, //!< Asynchronous callback, always has only EXIT phase of some API,
+                                                      //!< where completed operations are collected and reported
+
+  PTI_CB_DOMAIN_DRIVER_HOST_SYNCHRONIZATION        = 7, //!< Not implemented yet
+                                                     //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+
+  PTI_CB_DOMAIN_DRIVER_API                         = 1023, //!< Not implemented yet,
+                                                        //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED
+                                                        //!< Callback created for all Driver APIs
+  // below domains to inform user about PTI internal events
+  PTI_CB_DOMAIN_INTERNAL_THREADS                = 1024, //!< Not implemented yet
+  PTI_CB_DOMAIN_INTERNAL_EVENT                  = 1025, //!< Not implemented yet
+
+  PTI_CB_DOMAIN_MAX                             = 0x7fffffff
+} pti_callback_domain;
+
+typedef enum _pti_callback_phase {
+  PTI_CB_PHASE_INVALID                 = 0,
+  PTI_CB_PHASE_API_ENTER               = 1,
+  PTI_CB_PHASE_API_EXIT                = 2,
+  PTI_CB_PHASE_INTERNAL_THREAD_START   = 3,
+  PTI_CB_PHASE_INTERNAL_THREAD_END     = 4,
+  PTI_CB_PHASE_INTERNAL_EVENT          = 5,
+
+  PTI_CB_PHASE_MAX                     = 0x7fffffff
+} pti_callback_phase;
+
+typedef enum _pti_backend_command_list_type {
+  PTI_BACKEND_COMMAND_LIST_TYPE_UNKNOWN   = (1<<0),
+  PTI_BACKEND_COMMAND_LIST_TYPE_IMMEDIATE = (1<<1),
+  PTI_BACKEND_COMMAND_LIST_TYPE_MUTABLE   = (1<<2),
+
+  PTI_BACKEND_COMMAND_LIST_TYPE_MAX       = 0x7fffffff
+} pti_backend_command_list_type;
+
+/**
+ * A user can subscribe to notifications about non-standard situations from PTI
+ * when it collects or processes the data
+ */
+typedef enum _pti_internal_event_type {
+  PTI_INTERNAL_EVENT_TYPE_INFO       = 0,
+  PTI_INTERNAL_EVENT_TYPE_WARNING    = 1, // data inconsistencies in one or a few records, or similar;
+                                          // collection is safe to continue
+  PTI_INTERNAL_EVENT_TYPE_CRITICAL   = 2, // critical error after which further collected data are invalid
+
+  PTI_INTERNAL_EVENT_TYPE_MAX        = 0x7fffffff
+} pti_internal_event_type;
+
+typedef enum _pti_gpu_operation_kind {
+  PTI_GPU_OPERATION_KIND_INVALID         = 0,
+  PTI_GPU_OPERATION_KIND_KERNEL          = 1,
+  PTI_GPU_OPERATION_KIND_MEMORY          = 2,
+  PTI_GPU_OPERATION_KIND_OTHER           = 3,
+
+  PTI_GPU_OPERATION_KIND_MAX             = 0x7fffffff
+} pti_gpu_operation_kind;
+
+typedef struct _pti_gpu_op_details {
+  pti_gpu_operation_kind             _operation_kind; //!< Kind of the operation (see pti_gpu_operation_kind)
+  uint64_t                           _operation_id;   //!< GPU kernel or memory operation instance ID,
+                                                      //!< unique throughout the process
+  uint64_t                           _kernel_handle;  //!< a handle uniquely identifying the kernel object as
+                                                      //!< contained in the module at the specific offset;
+                                                      //!< it is zero if not implemented yet, or
+                                                      //!< for memory operations
+  const char*                        _name;           //!< symbolic name of a kernel or memcpy operation
+} pti_gpu_op_details;
+
+typedef struct _pti_callback_gpu_op_data {
+  pti_callback_domain             _domain;              //!< domain of the callback
+  pti_backend_command_list_type   _cmd_list_properties; //!< immediate, mutable, ...
+  pti_backend_command_list_t      _cmd_list_handle;     //!< Device back-end command list handle,
+                                                        //!< could be nullptr if unknown or
+                                                        //!< when several operations with different command lists
+                                                        //!< are reported together
+  pti_backend_queue_t             _queue_handle;        //!< Device back-end queue handle,
+                                                        //!< could be nullptr if unknown or
+                                                        //!< when several operations with different command lists
+                                                        //!< are reported together
+  pti_device_handle_t             _device_handle;       //!< Device handle
+  pti_callback_phase              _phase;               //!< PTI_CB_PHASE_API_ENTER/EXIT
+  uint32_t                        _return_code;         //!< valid only for L0 API EXIT; zero for all others
+  uint32_t                  _correlation_id;    //!< ID that corresponds to the same call reported by View API records
+  uint32_t                  _operation_count;   //!< number of operations appended or dispatched to the GPU
+  pti_gpu_op_details*       _operation_details; //!< pointer to details of operation(s) appended, dispatched or completed
+} pti_callback_gpu_op_data;
+
+typedef struct _pti_internal_callback_data {
+  pti_callback_domain  _domain;       //!< domain of the callback
+  pti_callback_phase   _phase;        //!< THREAD START/END or INTERNAL EVENT
+  uint32_t             _detail;       //!< depending on the domain, should be cast/interpreted
+                                      //!< as the purpose of an internal PTI thread or
+                                      //!< as pti_internal_event_type
+  const char*          _message;      //!< explains details
+} pti_internal_callback_data;
+
+typedef void (*pti_callback_function)(
+  pti_callback_domain  domain,
+  pti_api_group_id     driver_api_group_id, //!< driver API group ID, kept to distinguish between L0 and OpenCL
+                                            //!< although the current implementation is only for L0
+  uint32_t             driver_api_id,
+  pti_backend_ctx_t    backend_context,     //!< Driver (L0) level context handle
+  void*                cb_data, //!< depending on the domain, it should be cast to a pointer
+                                //!< to either pti_callback_gpu_op_data, pti_internal_callback_data,
+                                //!< or to other types to be defined
+  void*                global_user_data,    //!< Global data defined by the user, passed
+                                            //!< to every callback of the same subscriber
+  void**         instance_user_data);       //!< Data that could be passed between ENTER and EXIT
+                                            //!< phases of one API call
+
+/**
+ * Callback API functions
+ * None of the PTI API functions should be called from within a Callback function.
+ * Exceptions are helper functions that return character representations of enums.
+ */
+
+/**
+ * @brief Initialize Callback subscriber
+ *
+ * @param subscriber - subscriber handle
+ * @param callback   - pointer to the callback function
+ * @param user_data  - user data to be passed to the callback function
+ * @return pti_result
+ */
+pti_result PTI_EXPORT
+ptiCallbackSubscribe(pti_callback_subscriber_handle* subscriber,
+                     pti_callback_function    callback,
+                     void* user_data);
+
+/**
+ * @brief Unsubscribe Callback subscriber. This unsubscribes from all domains, disables the callback,
+ *        cleans up all resources related to the subscriber handle, and invalidates the handle.
+ */
+pti_result PTI_EXPORT
+ptiCallbackUnsubscribe(pti_callback_subscriber_handle subscriber);
+
+/**
+ * @brief Enables callbacks on specific domain
+ *
+ * @param subscriber - subscriber handle
+ * @param domain     - domain to enable
+ * @param enter_cb   - indicates if callback is called on enter/start: 0-no, 1-yes; used only for domains with 2 phases
+ * @param exit_cb    - indicates if callback is called on exit/end: 0-no, 1-yes; used only for domains with 2 phases
+ * @return pti_result
+ */
+pti_result PTI_EXPORT
+ptiCallbackEnableDomain(pti_callback_subscriber_handle subscriber,
+                        pti_callback_domain  domain,
+                        uint32_t enter_cb,
+                        uint32_t exit_cb);
+
+/**
+ * @brief Disables callbacks for specific domain
+ */
+pti_result PTI_EXPORT
+ptiCallbackDisableDomain(pti_callback_subscriber_handle subscriber,
+                         pti_callback_domain  domain);
+
+/**
+ * @brief Disables the callback of the subscriber for all domains
+ */
+pti_result PTI_EXPORT
+ptiCallbackDisableAllDomains(pti_callback_subscriber_handle subscriber);
+
+/**
+ * @brief Helper function to return stringified enum members for pti_callback_domain
+ *
+ * @return const char*
+ */
+PTI_EXPORT const char* ptiCallbackDomainTypeToString(pti_callback_domain domain);
+
+/**
+ * @brief Helper function to return stringified enum members for pti_callback_phase
+ *
+ * @return const char*
+ */
+PTI_EXPORT const char* ptiCallbackPhaseTypeToString(pti_callback_phase phase);
+
+#if defined(__cplusplus)
+}
+#endif
+#endif  // PTI_CALLBACK_H_
diff --git a/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h b/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h
index 500a58e87d..07a6d8b2a8 100644
--- a/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h
+++ b/third_party/intel/backend/proton/include/pti/pti_driver_levelzero_api_ids.h
@@ -10,10 +10,10 @@
 // ========= This file is autogenerated - do not modify ========
 // ========= Api file version used for generation: =============
 //    ApiFile: ze_api.h
-// ApiVersion: * @version v1.12-r1.12.15
+// ApiVersion: * @version v1.13-r1.13.1
 
 //             https://github.com/oneapi-src/level-zero.git
-//             commit: d7a44e0303722e754e711227e0334aae3fa52f9d - v1.20.2
+//             commit: ff8c99d4abda00fba6d92548a9cb2f721764d9d0 - v1.24.2
 
 
 typedef enum _pti_api_id_driver_levelzero {
@@ -212,6 +212,17 @@ typedef enum _pti_api_id_driver_levelzero {
     zeFabricVertexGetDeviceExp_id=192,
     zelTracerSetEnabled_id=193,
     zelTracerCreate_id=194,
+    zeRTASBuilderCreateExt_id=195,
+    zeRTASBuilderGetBuildPropertiesExt_id=196,
+    zeRTASBuilderBuildExt_id=197,
+    zeRTASBuilderCommandListAppendCopyExt_id=198,
+    zeRTASBuilderDestroyExt_id=199,
+    zeRTASParallelOperationCreateExt_id=200,
+    zeRTASParallelOperationGetPropertiesExt_id=201,
+    zeRTASParallelOperationJoinExt_id=202,
+    zeRTASParallelOperationDestroyExt_id=203,
+    zeDriverRTASFormatCompatibilityCheckExt_id=204,
+    zeDeviceGetVectorWidthPropertiesExt_id=205,
  } pti_api_id_driver_levelzero;
 
 #endif
diff --git a/third_party/intel/backend/proton/include/pti/pti_metrics.h b/third_party/intel/backend/proton/include/pti/pti_metrics.h
index d4dbf41de9..df9efdc6c6 100644
--- a/third_party/intel/backend/proton/include/pti/pti_metrics.h
+++ b/third_party/intel/backend/proton/include/pti/pti_metrics.h
@@ -20,8 +20,6 @@ extern "C" {
                                  DEVICE
 *****************************************************************************/
 
-typedef void* pti_device_handle_t;  //!< Abstraction of device within PTI
-
 typedef struct _pti_pci_properties_t {
   uint8_t _domain;
   uint8_t _bus;
diff --git a/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h b/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h
index a07ed127b3..579f5328f6 100644
--- a/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h
+++ b/third_party/intel/backend/proton/include/pti/pti_runtime_sycl_api_ids.h
@@ -10,7 +10,7 @@
 // ========= This file is autogenerated - do not modify ========
 // ========= Api file version used for generation: =============
 //    ApiFile: ur_api.h
-// ApiVersion:* @version v0.10-r0
+// ApiVersion:* @version v0.12-r0
 //             https://github.com/oneapi-src/unified-runtime.git
 //             
 
@@ -213,7 +213,52 @@ typedef enum _pti_api_id_runtime_sycl {
     urEnqueueNativeCommandExp_id=228,
     urLoaderConfigSetMockingEnabled_id=229,
     urBindlessImagesReleaseExternalMemoryExp_id=230,
-    urBindlessImagesMapExternalLinearMemoryExp_id=231,
+    urCommandBufferAppendUSMMemcpyExp_v2_id=231,
+    urCommandBufferAppendUSMFillExp_v2_id=232,
+    urCommandBufferAppendMemBufferCopyExp_v2_id=233,
+    urCommandBufferAppendMemBufferWriteExp_v2_id=234,
+    urCommandBufferAppendMemBufferReadExp_v2_id=235,
+    urCommandBufferAppendMemBufferCopyRectExp_v2_id=236,
+    urCommandBufferAppendMemBufferWriteRectExp_v2_id=237,
+    urCommandBufferAppendMemBufferReadRectExp_v2_id=238,
+    urCommandBufferAppendMemBufferFillExp_v2_id=239,
+    urCommandBufferAppendUSMPrefetchExp_v2_id=240,
+    urCommandBufferAppendUSMAdviseExp_v2_id=241,
+    urEnqueueCommandBufferExp_id=242,
+    urCommandBufferUpdateSignalEventExp_id=243,
+    urCommandBufferUpdateWaitEventsExp_id=244,
+    urBindlessImagesMapExternalLinearMemoryExp_id=245,
+    urEnqueueEventsWaitWithBarrierExt_id=246,
+    urTensorMapEncodeIm2ColExp_id=247,
+    urTensorMapEncodeTiledExp_id=248,
+    urPhysicalMemGetInfo_id=249,
+    urEnqueueUSMDeviceAllocExp_id=250,
+    urEnqueueUSMSharedAllocExp_id=251,
+    urEnqueueUSMHostAllocExp_id=252,
+    urEnqueueUSMFreeExp_id=253,
+    urUSMPoolCreateExp_id=254,
+    urUSMPoolDestroyExp_id=255,
+    urUSMPoolSetThresholdExp_id=256,
+    urUSMPoolGetDefaultDevicePoolExp_id=257,
+    urUSMPoolSetDevicePoolExp_id=259,
+    urUSMPoolGetDevicePoolExp_id=260,
+    urUSMPoolTrimToExp_id=261,
+    urUSMPoolGetInfoExp_id=262,
+    urCommandBufferAppendNativeCommandExp_id=263,
+    urCommandBufferGetNativeHandleExp_id=264,
+    urUSMPoolSetInfoExp_id=265,
+    urAdapterSetLoggerCallback_id=266,
+    urAdapterSetLoggerCallbackLevel_id=267,
+    urBindlessImagesGetImageUnsampledHandleSupportExp_id=268,
+    urBindlessImagesGetImageSampledHandleSupportExp_id=269,
+    urBindlessImagesGetImageMemoryHandleTypeSupportExp_id=270,
+    urBindlessImagesFreeMappedLinearMemoryExp_id=271,
+    urKernelSuggestMaxCooperativeGroupCount_id=272,
+    urUSMContextMemcpyExp_id=273,
+    urMemoryExportAllocExportableMemoryExp_id=285,
+    urMemoryExportFreeExportableMemoryExp_id=286,
+    urMemoryExportExportMemoryHandleExp_id=287,
+    urBindlessImagesSupportsImportingHandleTypeExp_id=288,
  } pti_api_id_runtime_sycl;
 
 #endif
diff --git a/third_party/intel/backend/proton/include/pti/pti_version.h b/third_party/intel/backend/proton/include/pti/pti_version.h
index 114e5ffb6c..583d2fe6c4 100644
--- a/third_party/intel/backend/proton/include/pti/pti_version.h
+++ b/third_party/intel/backend/proton/include/pti/pti_version.h
@@ -16,13 +16,13 @@ extern "C" {
 #endif
 
 #if !defined(PTI_VERSION)
-#define PTI_VERSION 0.13.1
+#define PTI_VERSION 0.14.0
 #endif
 
-#define PTI_VERSION_STRING "0.13.1"
+#define PTI_VERSION_STRING "0.14.0"
 #define PTI_VERSION_MAJOR 0
-#define PTI_VERSION_MINOR 13
-#define PTI_VERSION_PATCH 1
+#define PTI_VERSION_MINOR 14
+#define PTI_VERSION_PATCH 0
 
 typedef struct pti_version {
   uint32_t _major;
diff --git a/third_party/intel/backend/proton/include/pti/pti_view.h b/third_party/intel/backend/proton/include/pti/pti_view.h
index df4ef6d65d..9021125b94 100644
--- a/third_party/intel/backend/proton/include/pti/pti_view.h
+++ b/third_party/intel/backend/proton/include/pti/pti_view.h
@@ -23,7 +23,7 @@ extern "C" {
  * @brief const defines.
  */
 #define PTI_MAX_PCI_ADDRESS_SIZE 16                         //!< Size of pci address array.
-#define PTI_INVALID_QUEUE_ID 0xFFFFFFFFFFFFFFFF-1           //!< For oneAPI versions earlier than 2024.1.1 -- UINT64_MAX-1
+#define PTI_INVALID_QUEUE_ID 0xFFFFFFFFFFFFFFFF-1           //!< Indicates a missing sycl queue id. UINT64_MAX-1
 
 /**
  * @brief Kinds of software and hardware operations to be tracked and viewed,
@@ -149,12 +149,6 @@ typedef enum _pti_api_group_id {
                                                            //!< -- you will get all classes *now* and in the *future*!
  } pti_api_class;
 
-typedef void* pti_backend_queue_t; //!< Backend queue handle
-
-typedef void* pti_backend_ctx_t; //!< Backend context handle
-
-typedef void* pti_backend_evt_t; //!< Backend event handle
-
 /**
  * @brief Base View record type
  */
diff --git a/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h b/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h
index f28cb9fed7..f9b8fbe4c4 100644
--- a/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h
+++ b/third_party/proton/csrc/include/Driver/GPU/XpuptiApi.h
@@ -1,6 +1,7 @@
 #ifndef PROTON_DRIVER_GPU_XPUPTI_H_
 #define PROTON_DRIVER_GPU_XPUPTI_H_
 
+#include <pti/pti_callback.h>
 #include <pti/pti_view.h>
 
 namespace proton {
@@ -15,6 +16,22 @@ template <bool CheckSuccess> pti_result viewDisable(pti_view_kind kind);
 
 template <bool CheckSuccess> pti_result viewFlushAll();
 
+template <bool CheckSuccess>
+pti_result subscribe(pti_callback_subscriber_handle *subscriber,
+                     pti_callback_function callback, void *user_data);
+
+template <bool CheckSuccess>
+pti_result unsubscribe(pti_callback_subscriber_handle subscriber);
+
+template <bool CheckSuccess>
+pti_result enableDomain(pti_callback_subscriber_handle subscriber,
+                        pti_callback_domain domain, uint32_t enter_cb,
+                        uint32_t exit_cb);
+
+template <bool CheckSuccess>
+pti_result disableDomain(pti_callback_subscriber_handle subscriber,
+                         pti_callback_domain domain);
+
 template <bool CheckSuccess>
 pti_result viewGetNextRecord(uint8_t *buffer, size_t valid_bytes,
                              pti_view_record_base **record);
diff --git a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
index 726199781d..c618661372 100644
--- a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
+++ b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp
@@ -10,6 +10,7 @@ struct ExternLibXpupti : public ExternLibBase {
   using RetType = pti_result;
   static constexpr const char *name = "libpti_view.so";
   static constexpr const char *defaultDir = "";
+  static constexpr const char *pathEnv = "TRITON_XPUPTI_LIB_PATH";
   static constexpr RetType success = PTI_SUCCESS;
   static void *lib;
 };
@@ -24,6 +25,19 @@ DEFINE_DISPATCH(ExternLibXpupti, viewDisable, ptiViewDisable, pti_view_kind)
 
 DEFINE_DISPATCH(ExternLibXpupti, viewFlushAll, ptiFlushAllViews)
 
+DEFINE_DISPATCH(ExternLibXpupti, subscribe, ptiCallbackSubscribe,
+                pti_callback_subscriber_handle *, pti_callback_function, void *)
+
+DEFINE_DISPATCH(ExternLibXpupti, unsubscribe, ptiCallbackUnsubscribe,
+                pti_callback_subscriber_handle)
+
+DEFINE_DISPATCH(ExternLibXpupti, enableDomain, ptiCallbackEnableDomain,
+                pti_callback_subscriber_handle, pti_callback_domain, uint32_t,
+                uint32_t)
+
+DEFINE_DISPATCH(ExternLibXpupti, disableDomain, ptiCallbackDisableDomain,
+                pti_callback_subscriber_handle, pti_callback_domain)
+
 DEFINE_DISPATCH(ExternLibXpupti, viewGetNextRecord, ptiViewGetNextRecord,
                 uint8_t *, size_t, pti_view_record_base **)
 
diff --git a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
index 48ba83f564..e8651323dd 100644
--- a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
+++ b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
@@ -264,14 +264,17 @@ struct XpuptiProfiler::XpuptiProfilerPimpl
 
   static void allocBuffer(uint8_t **buffer, size_t *bufferSize);
   static void completeBuffer(uint8_t *buffer, size_t size, size_t validSize);
-  /*
-  static void callbackFn(void *userData, CUpti_CallbackDomain domain,
-                         CUpti_CallbackId cbId, const void *cbData);
-  */
+  static void callbackFn(pti_callback_domain domain,
+                         pti_api_group_id driver_api_group_id,
+                         uint32_t driver_api_id,
+                         pti_backend_ctx_t backend_context, void *cb_data,
+                         void *global_user_data, void **instance_user_data);
 
   static constexpr size_t AlignSize = 8;
   static constexpr size_t BufferSize = 64 * 1024 * 1024;
 
+  pti_callback_subscriber_handle subscriber{}; // assigned in doStart()
+
   /*
   static constexpr size_t AttributeSize = sizeof(size_t);
 
@@ -327,6 +330,53 @@ void XpuptiProfiler::XpuptiProfilerPimpl::completeBuffer(uint8_t *buffer,
   profiler.correlation.complete(maxCorrelationId);
 }
 
+void XpuptiProfiler::XpuptiProfilerPimpl::callbackFn(
+    pti_callback_domain domain, pti_api_group_id driver_api_group_id,
+    uint32_t driver_api_id, pti_backend_ctx_t backend_context, void *cb_data,
+    void *global_user_data, void **instance_user_data) {
+  // PTI subscriber callback; cb_data is read as pti_callback_gpu_op_data.
+  // ENTER opens the op and correlates its id; EXIT closes and submits it.
+  auto *callback_data = static_cast<pti_callback_gpu_op_data *>(cb_data);
+  if (callback_data == nullptr) {
+    std::cerr << "CallbackGPUOperationAppend: callback_data is null"
+              << std::endl;
+    return;
+  }
+  if (callback_data->_phase == PTI_CB_PHASE_API_ENTER) {
+    threadState.enterOp();
+    threadState.profiler.correlation.correlate(callback_data->_correlation_id,
+                                               1);
+  } else if (callback_data->_phase == PTI_CB_PHASE_API_EXIT) {
+    threadState.exitOp();
+    threadState.profiler.correlation.submit(callback_data->_correlation_id);
+  } else {
+    throw std::runtime_error("[PROTON] callbackFn failed");
+  }
+}
+
+void CallbackCommon(pti_callback_domain domain,
+                    pti_api_group_id driver_group_id, uint32_t driver_api_id,
+                    [[maybe_unused]] pti_backend_ctx_t backend_context,
+                    [[maybe_unused]] void *cb_data,
+                    [[maybe_unused]] void *user_data) {
+  // Debug helper: prints the callback domain (plus ids if unknown).
+  switch (domain) {
+  case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED:
+    std::cout << "PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED\n" << std::flush;
+    break;
+  case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED:
+    std::cout << "PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED\n" << std::flush;
+    break;
+  default: {
+    std::cout << "In " << __func__ << ", domain: " << domain
+              << ", driver_group_id: " << driver_group_id
+              << ", driver_api_id: " << driver_api_id << std::endl;
+    break;
+  }
+  }
+  std::cout << std::endl;
+}
+
 zel_tracer_handle_t tracer = nullptr;
 
 typedef void (*EnumDeviceUUIDsFunc)(std::vector<std::array<uint8_t, 16>>);
@@ -380,7 +430,6 @@ int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) {
 }
 
 void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
-  // xpupti::subscribe<true>(&subscriber, callbackFn, nullptr);
   // should be call to shared lib
   XpuptiProfiler &profiler = threadState.profiler;
   if (profiler.utils_cache_path != "") {
@@ -389,13 +438,13 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
   // auto res = ptiViewPushExternalCorrelationId(
   //     pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_1, 42);
   //  std::cout << "res: " << res << "\n" << std::flush;
-
+  /*
   ze_result_t status = ZE_RESULT_SUCCESS;
   // status = zeInit(ZE_INIT_FLAG_GPU_ONLY);
   // assert(status == ZE_RESULT_SUCCESS);
 
   zel_tracer_desc_t tracer_desc = {ZEL_STRUCTURE_TYPE_TRACER_DESC, nullptr,
-                                   nullptr /* global user data */};
+                                   nullptr};
 
   status = zelTracerCreate(&tracer_desc, &tracer);
   std::cout << "zelTracerCreate: " << status << "\n" << std::flush;
@@ -417,9 +466,13 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
 
   status = zelTracerSetEnabled(tracer, true);
   assert(status == ZE_RESULT_SUCCESS);
+  */
 
   xpupti::viewSetCallbacks<true>(allocBuffer, completeBuffer);
   xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_KERNEL);
+  xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
+  xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
+  xpupti::subscribe<true>(&subscriber, callbackFn, &subscriber);
   // xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
   // xpupti::viewEnable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
   // xpupti::viewEnable<true>(PTI_VIEW_SYCL_RUNTIME_CALLS);
@@ -428,6 +481,8 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
   // xpupti::viewEnable<true>(PTI_VIEW_LEVEL_ZERO_CALLS);
   // setGraphCallbacks(subscriber, /*enable=*/true);
   // setRuntimeCallbacks(subscriber, /*enable=*/true);
+  xpupti::enableDomain<true>(subscriber,
+                             PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1);
   // setDriverCallbacks(subscriber, /*enable=*/true);
 }
 
@@ -444,13 +499,17 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doFlush() {
 }
 
 void XpuptiProfiler::XpuptiProfilerPimpl::doStop() {
+  /*
   ze_result_t status = ZE_RESULT_SUCCESS;
   status = zelTracerSetEnabled(tracer, false);
   assert(status == ZE_RESULT_SUCCESS);
   status = zelTracerDestroy(tracer);
   assert(status == ZE_RESULT_SUCCESS);
+  */
 
   xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_KERNEL);
+  xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
+  xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
   // xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_COPY);
   // xpupti::viewDisable<true>(PTI_VIEW_DEVICE_GPU_MEM_FILL);
   // xpupti::viewDisable<true>(PTI_VIEW_SYCL_RUNTIME_CALLS);
@@ -460,7 +519,9 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStop() {
   // setGraphCallbacks(subscriber, /*enable=*/false);
   // setRuntimeCallbacks(subscriber, /*enable=*/false);
   // setDriverCallbacks(subscriber, /*enable=*/false);
-  // cupti::unsubscribe<true>(subscriber);
+  xpupti::disableDomain<true>(subscriber,
+                              PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED);
+  xpupti::unsubscribe<true>(subscriber);
   // cupti::finalize<true>();
 }
 
diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py
index f85a4f77d2..15a97dc578 100644
--- a/third_party/proton/test/test_profile.py
+++ b/third_party/proton/test/test_profile.py
@@ -258,8 +258,6 @@ def foo(x, size: tl.constexpr, y):
 
 
 def test_hook_with_third_party(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
     third_party_hook_invoked = False
 
     def third_party_hook(metadata) -> None:
@@ -280,7 +278,7 @@ def foo(x, size: tl.constexpr, y):
         offs = tl.arange(0, size)
         tl.store(y + offs, tl.load(x + offs))
 
-    x = torch.tensor([2], device="cuda", dtype=torch.float32)
+    x = torch.tensor([2], device="xpu", dtype=torch.float32)
     y = torch.zeros_like(x)
     temp_file = tmp_path / "test_hook_with_third_party.hatchet"
     proton.start(str(temp_file.with_suffix("")), hook="triton")
@@ -295,8 +293,6 @@ def foo(x, size: tl.constexpr, y):
 
 
 def test_hook_multiple_threads(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
 
     def metadata_fn_foo(grid: tuple, metadata: NamedTuple, args: dict):
         return {"name": "foo_test"}
@@ -314,9 +310,9 @@ def bar(x, size: tl.constexpr, y):
         offs = tl.arange(0, size)
         tl.store(y + offs, tl.load(x + offs))
 
-    x_foo = torch.tensor([2], device="cuda", dtype=torch.float32)
+    x_foo = torch.tensor([2], device="xpu", dtype=torch.float32)
     y_foo = torch.zeros_like(x_foo)
-    x_bar = torch.tensor([2], device="cuda", dtype=torch.float32)
+    x_bar = torch.tensor([2], device="xpu", dtype=torch.float32)
     y_bar = torch.zeros_like(x_bar)
 
     temp_file = tmp_path / "test_hook.hatchet"
@@ -410,10 +406,6 @@ def test_deactivate(tmp_path: pathlib.Path):
 
 
 def test_multiple_sessions(tmp_path: pathlib.Path):
-    if is_xpu():
-        # FIXME: the same correlation id, that's why it's filtered,
-        # should `_kernel_id` be used instead
-        pytest.xfail('assert int(data[0]["children"][0]["metrics"]["count"]) == 2')
     temp_file0 = tmp_path / "test_multiple_sessions0.hatchet"
     temp_file1 = tmp_path / "test_multiple_sessions1.hatchet"
     session_id0 = proton.start(str(temp_file0.with_suffix("")))
@@ -439,8 +431,6 @@ def test_multiple_sessions(tmp_path: pathlib.Path):
 
 
 def test_trace(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
     temp_file = tmp_path / "test_trace.chrome_trace"
     proton.start(str(temp_file.with_suffix("")), data="trace")
 
@@ -450,7 +440,7 @@ def foo(x, y, size: tl.constexpr):
         tl.store(y + offs, tl.load(x + offs))
 
     with proton.scope("init"):
-        x = torch.ones((1024, ), device="cuda", dtype=torch.float32)
+        x = torch.ones((1024, ), device="xpu", dtype=torch.float32)
         y = torch.zeros_like(x)
 
     with proton.scope("test"):
@@ -467,8 +457,6 @@ def foo(x, y, size: tl.constexpr):
 
 
 def test_scope_multiple_threads(tmp_path: pathlib.Path):
-    if is_xpu():
-        pytest.skip("FIXME: enable")
     temp_file = tmp_path / "test_scope_threads.hatchet"
     proton.start(str(temp_file.with_suffix("")))
 
@@ -479,7 +467,7 @@ def worker(prefix: str):
         for i in range(N):
             name = f"{prefix}_{i}"
             proton.enter_scope(name)
-            torch.ones((1, ), device="cuda")
+            torch.ones((1, ), device="xpu")
             proton.exit_scope()
 
     threads = [threading.Thread(target=worker, args=(tname, )) for tname in thread_names]