From 2ea9563fc389102cf00ceb8e75de04910b689f29 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 10 Oct 2025 14:13:21 +0000 Subject: [PATCH 1/9] [DEBUG] Integrate PTI callback interface and build it from sources Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 15 ++ python/tutorials/09-persistent-matmul.py | 6 +- .../intel/backend/proton/include/pti/pti.h | 23 +- .../backend/proton/include/pti/pti_callback.h | 234 ++++++++++++++++++ .../pti/pti_driver_levelzero_api_ids.h | 15 +- .../backend/proton/include/pti/pti_metrics.h | 2 - .../include/pti/pti_runtime_sycl_api_ids.h | 49 +++- .../backend/proton/include/pti/pti_version.h | 8 +- .../backend/proton/include/pti/pti_view.h | 8 +- .../csrc/include/Driver/GPU/XpuptiApi.h | 17 ++ .../proton/csrc/lib/Driver/GPU/XpuptiApi.cpp | 13 + .../lib/Profiler/Xpupti/XpuptiProfiler.cpp | 111 ++++++++- third_party/proton/test/test_profile.py | 22 +- 13 files changed, 477 insertions(+), 46 deletions(-) create mode 100644 third_party/intel/backend/proton/include/pti/pti_callback.h diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 449a13bd46..904d8aef18 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -116,9 +116,24 @@ jobs: cd benchmarks pip install . + - name: Build PTI from source + id: build-pti + run: | + git clone https://github.com/intel/pti-gpu.git + cd pti-gpu + git checkout 15a201d25e5659692613b98ee33513263b689101 + cd sdk + cmake --preset linux-icpx-release + BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release + + PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/" + ls $PTI_LIBS_DIR + echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV + - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh diff --git a/python/tutorials/09-persistent-matmul.py b/python/tutorials/09-persistent-matmul.py index f354769615..93cf85241f 100644 --- a/python/tutorials/09-persistent-matmul.py +++ b/python/tutorials/09-persistent-matmul.py @@ -673,7 +673,7 @@ def bench_fn(label, reps, warmup_reps, fn, *args): for _ in range(warmup_reps): fn(*args) #FIXME: Enable for XPU once proton support works. - if is_cuda(): + if True or is_cuda(): with proton_context(): for _ in range(reps): fn(*args) @@ -783,11 +783,11 @@ def show_profile(precision, profile_name): validate(32, 32, 32, dtype) validate(8192, 8192, args.K_range[0], dtype) - if is_cuda(): + if True or is_cuda(): proton.start("matmul", hook="triton") proton.deactivate() for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step): bench(K, dtype) - if is_cuda(): + if True or is_cuda(): proton.finalize() show_profile(args.prec, "matmul") diff --git a/third_party/intel/backend/proton/include/pti/pti.h b/third_party/intel/backend/proton/include/pti/pti.h index 3bd6a3d363..512a839154 100644 --- a/third_party/intel/backend/proton/include/pti/pti.h +++ b/third_party/intel/backend/proton/include/pti/pti.h @@ -31,7 +31,9 @@ typedef enum { //!< PTI_VIEW_EXTERNAL_CORRELATION PTI_ERROR_BAD_TIMESTAMP = 6, //!< error in timestamp conversion, might be related with the user //!< provided TimestampCallback - PTI_ERROR_BAD_API_ID = 7, //!< invalid api_id when enable/disable runtime/driver specific api_id + PTI_ERROR_BAD_API_ID = 7, //!< invalid api_id when enable/disable runtime/driver specific api_id + PTI_ERROR_NO_GPU_VIEWS_ENABLED = 8, //!< at least one GPU view must be enabled for kernel tracing + PTI_ERROR_DRIVER = 50, //!< unknown driver error PTI_ERROR_TRACING_NOT_INITIALIZED = 51, //!< installed driver requires tracing enabling with //!< setting environment variable ZE_ENABLE_TRACING_LAYER @@ -57,6 +59,25 @@ typedef enum { */ PTI_EXPORT const char* ptiResultTypeToString(pti_result result_value); + +/** + * @brief Abstraction for backend-specific objects. + * + * Level Zero is currently the only supported backend. However, these types will attempt to serve other backends. + * In case the other backend supported - the same types will serve it. + */ + +typedef void* pti_device_handle_t; //!< Device handle + +typedef void* pti_backend_ctx_t; //!< Backend context handle + +typedef void* pti_backend_queue_t; //!< Backend queue handle + +typedef void* pti_backend_evt_t; //!< Backend event handle + +typedef void* pti_backend_command_list_t; //!< Backend command list handle + + #if defined(__cplusplus) } #endif diff --git a/third_party/intel/backend/proton/include/pti/pti_callback.h b/third_party/intel/backend/proton/include/pti/pti_callback.h new file mode 100644 index 0000000000..0659cec7fa --- /dev/null +++ b/third_party/intel/backend/proton/include/pti/pti_callback.h @@ -0,0 +1,234 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef PTI_CALLBACK_H_ +#define PTI_CALLBACK_H_ + +#include + +#include "pti/pti.h" +#include "pti/pti_view.h" + +/** + * This file contains APIs that are so far experimental in PTI. + * APIs and data structures in this file are work-in-progress and subject to change! + * All content in this file concerns the Callback API. + * + * The Callback API is useful for many purposes, + * including the implementation of `MetricsScope` functionality that needs to subscribe to + * domains such as kernel append to a command list, and potentially other domains. + * The `MetricsScope` API is under development and is the first (internal) user of the Callback API. + */ + + +/* clang-format off */ +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct _pti_callback_subscriber* pti_callback_subscriber_handle; + +typedef enum _pti_callback_domain { + PTI_CB_DOMAIN_INVALID = 0, + PTI_CB_DOMAIN_DRIVER_CONTEXT_CREATED = 1, //!< Not implemented yet + //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED + + PTI_CB_DOMAIN_DRIVER_MODULE_LOADED = 2, //!< Not implemented yet + //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED + + PTI_CB_DOMAIN_DRIVER_MODULE_UNLOADED = 3, //!< Not implemented yet + //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED + + PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED = 4, //!< Synchronous callback + //!< This also serves as PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED + //!< when appended to Immediate Command List, + //!< which means no separate callback PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED + + PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_DISPATCHED = 5, //!< Not implemented yet + //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED + + PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED = 6, //!< Asynchronous callback, always has only EXIT phase of some API, + //!< where completed operations are collected and reported + + PTI_CB_DOMAIN_DRIVER_HOST_SYNCHRONIZATION = 7, //!< Not implemented yet + //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED + + PTI_CB_DOMAIN_DRIVER_API = 1023, //!< Not implemented yet, + //!< attempt to enable it will return PTI_ERROR_NOT_IMPLEMENTED + //!< Callback created for all Driver APIs + // below domains to inform user about PTI internal events + PTI_CB_DOMAIN_INTERNAL_THREADS = 1024, //!< Not implemented yet + PTI_CB_DOMAIN_INTERNAL_EVENT = 1025, //!< Not implemented yet + + PTI_CB_DOMAIN_MAX = 0x7fffffff +} pti_callback_domain; + +typedef enum _pti_callback_phase { + PTI_CB_PHASE_INVALID = 0, + PTI_CB_PHASE_API_ENTER = 1, + PTI_CB_PHASE_API_EXIT = 2, + PTI_CB_PHASE_INTERNAL_THREAD_START = 3, + PTI_CB_PHASE_INTERNAL_THREAD_END = 4, + PTI_CB_PHASE_INTERNAL_EVENT = 5, + + PTI_CB_PHASE_MAX = 0x7fffffff +} pti_callback_phase; + +typedef enum _pti_backend_command_list_type { + PTI_BACKEND_COMMAND_LIST_TYPE_UNKNOWN = (1<<0), + PTI_BACKEND_COMMAND_LIST_TYPE_IMMEDIATE = (1<<1), + PTI_BACKEND_COMMAND_LIST_TYPE_MUTABLE = (1<<2), + + PTI_BACKEND_COMMAND_LIST_TYPE_MAX = 0x7fffffff +} pti_backend_command_list_type; + +/** + * A user can subscribe to notifications about non-standard situations from PTI + * when it collects or processes the data + */ +typedef enum _pti_internal_event_type { + PTI_INTERNAL_EVENT_TYPE_INFO = 0, + PTI_INTERNAL_EVENT_TYPE_WARNING = 1, // one or a few records data inconsistencies, or other + // collection is safe to continue + PTI_INTERNAL_EVENT_TYPE_CRITICAL = 2, // critical error after which further collected data are invalid + + PTI_INTERNAL_EVENT_TYPE_MAX = 0x7fffffff +} pti_internal_event_type; + +typedef enum _pti_gpu_operation_kind { + PTI_GPU_OPERATION_KIND_INVALID = 0, + PTI_GPU_OPERATION_KIND_KERNEL = 1, + PTI_GPU_OPERATION_KIND_MEMORY = 2, + PTI_GPU_OPERATION_KIND_OTHER = 3, + + PTI_GPU_OPERATION_KIND_MAX = 0x7fffffff +} pti_gpu_operation_kind; + +typedef struct _pti_gpu_op_details { + pti_gpu_operation_kind _operation_kind; // #include namespace proton { @@ -15,6 +16,22 @@ template pti_result viewDisable(pti_view_kind kind); template pti_result viewFlushAll(); +template +pti_result subscribe(pti_callback_subscriber_handle *subscriber, + pti_callback_function callback, void *user_data); + +template +pti_result unsubscribe(pti_callback_subscriber_handle subscriber); + +template +pti_result enableDomain(pti_callback_subscriber_handle subscriber, + pti_callback_domain domain, uint32_t enter_cb, + uint32_t exit_cb); + +template +pti_result disableDomain(pti_callback_subscriber_handle subscriber, + pti_callback_domain domain); + template pti_result viewGetNextRecord(uint8_t *buffer, size_t valid_bytes, pti_view_record_base **record); diff --git a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp index 726199781d..bd180533aa 100644 --- a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp +++ b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp @@ -24,6 +24,19 @@ DEFINE_DISPATCH(ExternLibXpupti, viewDisable, ptiViewDisable, pti_view_kind) DEFINE_DISPATCH(ExternLibXpupti, viewFlushAll, ptiFlushAllViews) +DEFINE_DISPATCH(ExternLibXpupti, subscribe, ptiCallbackSubscribe, + pti_callback_subscriber_handle *, pti_callback_function, void *) + +DEFINE_DISPATCH(ExternLibXpupti, unsubscribe, ptiCallbackUnsubscribe, + pti_callback_subscriber_handle); + +DEFINE_DISPATCH(ExternLibXpupti, enableDomain, ptiCallbackEnableDomain, + pti_callback_subscriber_handle, pti_callback_domain, uint32_t, + uint32_t); + +DEFINE_DISPATCH(ExternLibXpupti, disableDomain, ptiCallbackDisableDomain, + pti_callback_subscriber_handle, pti_callback_domain); + DEFINE_DISPATCH(ExternLibXpupti, viewGetNextRecord, ptiViewGetNextRecord, uint8_t *, size_t, pti_view_record_base **) diff --git a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp index 48ba83f564..749794d3d6 100644 --- a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp @@ -264,14 +264,19 @@ struct XpuptiProfiler::XpuptiProfilerPimpl static void allocBuffer(uint8_t **buffer, size_t *bufferSize); static void completeBuffer(uint8_t *buffer, size_t size, size_t validSize); - /* - static void callbackFn(void *userData, CUpti_CallbackDomain domain, - CUpti_CallbackId cbId, const void *cbData); - */ + // static void callbackFn(void *userData, CUpti_CallbackDomain domain, + // CUpti_CallbackId cbId, const void *cbData); + static void callbackFn(pti_callback_domain domain, + pti_api_group_id driver_api_group_id, + uint32_t driver_api_id, + pti_backend_ctx_t backend_context, void *cb_data, + void *global_user_data, void **instance_user_data); static constexpr size_t AlignSize = 8; static constexpr size_t BufferSize = 64 * 1024 * 1024; + pti_callback_subscriber_handle subscriber; + /* static constexpr size_t AttributeSize = sizeof(size_t); @@ -327,6 +332,79 @@ void XpuptiProfiler::XpuptiProfilerPimpl::completeBuffer(uint8_t *buffer, profiler.correlation.complete(maxCorrelationId); } +void XpuptiProfiler::XpuptiProfilerPimpl::callbackFn( + pti_callback_domain domain, pti_api_group_id driver_api_group_id, + uint32_t driver_api_id, pti_backend_ctx_t backend_context, void *cb_data, + void *global_user_data, void **instance_user_data) { + std::cout << "callback\n" << std::flush; + pti_callback_gpu_op_data *callback_data = + static_cast(cb_data); + if (callback_data == nullptr) { + std::cerr << "CallbackGPUOperationAppend: callback_data is null" + << std::endl; + return; + } + if (callback_data->_phase == PTI_CB_PHASE_API_ENTER) { + threadState.enterOp(); + threadState.profiler.correlation.correlate(callback_data->_correlation_id, + 1); + } else if (callback_data->_phase == PTI_CB_PHASE_API_EXIT) { + threadState.exitOp(); + threadState.profiler.correlation.submit(callback_data->_correlation_id); + } else { + throw std::runtime_error("[PROTON] callbackFn failed"); + } + + /* switch (domain) { + case PTI_DOMAIN_DRIVER_GPU_OPERATION_APPEND: + CallbackGPUOperationAppend(domain, driver_api_group_id, driver_api_id, + backend_context, cb_data, user_data); + break; + case PTI_DOMAIN_GPU_OPERATION_COMPLETED: + CallbackGPUOperationCompletion(domain, driver_api_group_id, driver_api_id, + backend_context, cb_data, user_data); + break; + default: { + std::cout << "In " << __func__ << ", domain: " << domain + << ", driver_group_id: " << driver_api_group_id + << ", driver_api_id: " << driver_api_id << std::endl; + + const char* name_ptr = nullptr; + + if (PTI_SUCCESS == ptiViewGetApiIdName(driver_api_group_id, driver_api_id, + &name_ptr) ) { std::cout << ", API name: " << name_ptr << std::endl; } else { + std::cout << ", Unknown API name" << std::endl; + } + break; + } + } + std::cout << std::endl; + */ +} + +void CallbackCommon(pti_callback_domain domain, + pti_api_group_id driver_group_id, uint32_t driver_api_id, + [[maybe_unused]] pti_backend_ctx_t backend_context, + [[maybe_unused]] void *cb_data, + [[maybe_unused]] void *user_data) { + + switch (domain) { + case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED: + std::cout << "PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED\n" << std::flush; + break; + case PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED: + std::cout << "PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED\n" << std::flush; + break; + default: { + std::cout << "In " << __func__ << ", domain: " << domain + << ", driver_group_id: " << driver_group_id + << ", driver_api_id: " << driver_api_id << std::endl; + break; + } + } + std::cout << std::endl; +} + zel_tracer_handle_t tracer = nullptr; typedef void (*EnumDeviceUUIDsFunc)(std::vector>); @@ -380,7 +458,7 @@ int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) { } void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { - // xpupti::subscribe(&subscriber, callbackFn, nullptr); + std::cout << "doStart\n" << std::flush; // should be call to shared lib XpuptiProfiler &profiler = threadState.profiler; if (profiler.utils_cache_path != "") { @@ -389,13 +467,13 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { // auto res = ptiViewPushExternalCorrelationId( // pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_1, 42); // std::cout << "res: " << res << "\n" << std::flush; - + /* ze_result_t status = ZE_RESULT_SUCCESS; // status = zeInit(ZE_INIT_FLAG_GPU_ONLY); // assert(status == ZE_RESULT_SUCCESS); zel_tracer_desc_t tracer_desc = {ZEL_STRUCTURE_TYPE_TRACER_DESC, nullptr, - nullptr /* global user data */}; + nullptr}; status = zelTracerCreate(&tracer_desc, &tracer); std::cout << "zelTracerCreate: " << status << "\n" << std::flush; @@ -417,9 +495,14 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { status = zelTracerSetEnabled(tracer, true); assert(status == ZE_RESULT_SUCCESS); + */ xpupti::viewSetCallbacks(allocBuffer, completeBuffer); xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_KERNEL); + xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL); + xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY); + std::cout << "doStart2\n" << std::flush; + xpupti::subscribe(&subscriber, callbackFn, &subscriber); // xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY); // xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL); // xpupti::viewEnable(PTI_VIEW_SYCL_RUNTIME_CALLS); @@ -428,6 +511,10 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { // xpupti::viewEnable(PTI_VIEW_LEVEL_ZERO_CALLS); // setGraphCallbacks(subscriber, /*enable=*/true); // setRuntimeCallbacks(subscriber, /*enable=*/true); + std::cout << "doStart::enableDomain\n" << std::flush; + xpupti::enableDomain(subscriber, + PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1); + std::cout << "doStart::enableDomain after\n" << std::flush; // setDriverCallbacks(subscriber, /*enable=*/true); } @@ -444,13 +531,17 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doFlush() { } void XpuptiProfiler::XpuptiProfilerPimpl::doStop() { + /* ze_result_t status = ZE_RESULT_SUCCESS; status = zelTracerSetEnabled(tracer, false); assert(status == ZE_RESULT_SUCCESS); status = zelTracerDestroy(tracer); assert(status == ZE_RESULT_SUCCESS); + */ xpupti::viewDisable(PTI_VIEW_DEVICE_GPU_KERNEL); + xpupti::viewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL); + xpupti::viewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY); // xpupti::viewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY); // xpupti::viewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL); // xpupti::viewDisable(PTI_VIEW_SYCL_RUNTIME_CALLS); @@ -460,7 +551,11 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStop() { // setGraphCallbacks(subscriber, /*enable=*/false); // setRuntimeCallbacks(subscriber, /*enable=*/false); // setDriverCallbacks(subscriber, /*enable=*/false); - // cupti::unsubscribe(subscriber); + std::cout << "doStop::disableDomain\n" << std::flush; + xpupti::disableDomain(subscriber, + PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED); + std::cout << "doStop::disableDomain after\n" << std::flush; + xpupti::unsubscribe(subscriber); // cupti::finalize(); } diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index f85a4f77d2..15a97dc578 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -258,8 +258,6 @@ def foo(x, size: tl.constexpr, y): def test_hook_with_third_party(tmp_path: pathlib.Path): - if is_xpu(): - pytest.skip("FIXME: enable") third_party_hook_invoked = False def third_party_hook(metadata) -> None: @@ -280,7 +278,7 @@ def foo(x, size: tl.constexpr, y): offs = tl.arange(0, size) tl.store(y + offs, tl.load(x + offs)) - x = torch.tensor([2], device="cuda", dtype=torch.float32) + x = torch.tensor([2], device="xpu", dtype=torch.float32) y = torch.zeros_like(x) temp_file = tmp_path / "test_hook_with_third_party.hatchet" proton.start(str(temp_file.with_suffix("")), hook="triton") @@ -295,8 +293,6 @@ def foo(x, size: tl.constexpr, y): def test_hook_multiple_threads(tmp_path: pathlib.Path): - if is_xpu(): - pytest.skip("FIXME: enable") def metadata_fn_foo(grid: tuple, metadata: NamedTuple, args: dict): return {"name": "foo_test"} @@ -314,9 +310,9 @@ def bar(x, size: tl.constexpr, y): offs = tl.arange(0, size) tl.store(y + offs, tl.load(x + offs)) - x_foo = torch.tensor([2], device="cuda", dtype=torch.float32) + x_foo = torch.tensor([2], device="xpu", dtype=torch.float32) y_foo = torch.zeros_like(x_foo) - x_bar = torch.tensor([2], device="cuda", dtype=torch.float32) + x_bar = torch.tensor([2], device="xpu", dtype=torch.float32) y_bar = torch.zeros_like(x_bar) temp_file = tmp_path / "test_hook.hatchet" @@ -410,10 +406,6 @@ def test_deactivate(tmp_path: pathlib.Path): def test_multiple_sessions(tmp_path: pathlib.Path): - if is_xpu(): - # FIXME: the same correlation id, that's why it's filtered, - # should `_kernel_id` be used instead - pytest.xfail('assert int(data[0]["children"][0]["metrics"]["count"]) == 2') temp_file0 = tmp_path / "test_multiple_sessions0.hatchet" temp_file1 = tmp_path / "test_multiple_sessions1.hatchet" session_id0 = proton.start(str(temp_file0.with_suffix(""))) @@ -439,8 +431,6 @@ def test_multiple_sessions(tmp_path: pathlib.Path): def test_trace(tmp_path: pathlib.Path): - if is_xpu(): - pytest.skip("FIXME: enable") temp_file = tmp_path / "test_trace.chrome_trace" proton.start(str(temp_file.with_suffix("")), data="trace") @@ -450,7 +440,7 @@ def foo(x, y, size: tl.constexpr): tl.store(y + offs, tl.load(x + offs)) with proton.scope("init"): - x = torch.ones((1024, ), device="cuda", dtype=torch.float32) + x = torch.ones((1024, ), device="xpu", dtype=torch.float32) y = torch.zeros_like(x) with proton.scope("test"): @@ -467,8 +457,6 @@ def foo(x, y, size: tl.constexpr): def test_scope_multiple_threads(tmp_path: pathlib.Path): - if is_xpu(): - pytest.skip("FIXME: enable") temp_file = tmp_path / "test_scope_threads.hatchet" proton.start(str(temp_file.with_suffix(""))) @@ -479,7 +467,7 @@ def worker(prefix: str): for i in range(N): name = f"{prefix}_{i}" proton.enter_scope(name) - torch.ones((1, ), device="cuda") + torch.ones((1, ), device="xpu") proton.exit_scope() threads = [threading.Thread(target=worker, args=(tname, )) for tname in thread_names] From 1642cd3b22078190fc995fbd7b1e279e6d2a1f23 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 10 Oct 2025 15:07:48 +0000 Subject: [PATCH 2/9] fix UTs Signed-off-by: Anatoly Myachev --- .github/workflows/build-test-reusable.yml | 12 ++++++++++++ .github/workflows/triton-benchmarks.yml | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 722963a9a1..62003d9f86 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -123,6 +123,17 @@ jobs: run: | echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV + git clone https://github.com/intel/pti-gpu.git + cd pti-gpu + git checkout 15a201d25e5659692613b98ee33513263b689101 + cd sdk + cmake --preset linux-icpx-release + BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release + + PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/" + ls $PTI_LIBS_DIR + echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV + - name: Build Triton uses: ./.github/actions/setup-triton with: @@ -288,6 +299,7 @@ jobs: - name: Run Proton tests if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100' run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd third_party/proton/test # FIXME: enable 'test_record.py' back pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 904d8aef18..a101677bf8 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -144,6 +144,7 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -157,6 +158,7 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -169,6 +171,7 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -181,6 +184,7 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -192,6 +196,7 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -203,6 +208,7 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -212,6 +218,7 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -221,6 +228,7 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -229,6 +237,7 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -237,6 +246,7 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -246,6 +256,7 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -255,6 +266,7 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -265,6 +277,7 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -277,6 +290,7 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -288,6 +302,7 @@ jobs: - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -298,6 +313,7 @@ jobs: - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -308,6 +324,7 @@ jobs: - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -318,6 +335,7 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS @@ -331,6 +349,7 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -339,6 +358,7 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS From 144ab7725d933f85653dafb0a9639a79242dcc7d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 Oct 2025 08:39:24 +0000 Subject: [PATCH 3/9] add TRITON_XPUPTI_LIB_PATH Signed-off-by: Anatoly Myachev --- .github/workflows/build-test-reusable.yml | 1 + third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 62003d9f86..336d4cf42c 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -300,6 +300,7 @@ jobs: if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100' run: | export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR cd third_party/proton/test # FIXME: enable 'test_record.py' back pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v diff --git a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp index bd180533aa..c618661372 100644 --- a/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp +++ b/third_party/proton/csrc/lib/Driver/GPU/XpuptiApi.cpp @@ -10,6 +10,7 @@ struct ExternLibXpupti : public ExternLibBase { using RetType = pti_result; static constexpr const char *name = "libpti_view.so"; static constexpr const char *defaultDir = ""; + static constexpr const char *pathEnv = "TRITON_XPUPTI_LIB_PATH"; static constexpr RetType success = PTI_SUCCESS; static void *lib; }; From 17de806175d352f156f891ac7241917e87e4db46 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 Oct 2025 09:18:18 +0000 Subject: [PATCH 4/9] another commit Signed-off-by: Anatoly Myachev --- .github/workflows/build-test-reusable.yml | 2 +- .github/workflows/triton-benchmarks.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 336d4cf42c..4c5893220a 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -125,7 +125,7 @@ jobs: git clone https://github.com/intel/pti-gpu.git cd pti-gpu - git checkout 15a201d25e5659692613b98ee33513263b689101 + git checkout 998e192cb39994a3e4790a86250ea0c480ae416c cd sdk cmake --preset linux-icpx-release BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index a101677bf8..be054d2f9c 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -121,7 +121,7 @@ jobs: run: | git clone https://github.com/intel/pti-gpu.git cd pti-gpu - git checkout 15a201d25e5659692613b98ee33513263b689101 + git checkout 998e192cb39994a3e4790a86250ea0c480ae416c cd sdk cmake --preset linux-icpx-release BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release From cee025e63e4189182c5dc679e03e6996d060553e Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 Oct 2025 09:55:58 +0000 Subject: [PATCH 5/9] Revert "another commit" This reverts commit 17de806175d352f156f891ac7241917e87e4db46. --- .github/workflows/build-test-reusable.yml | 2 +- .github/workflows/triton-benchmarks.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 4c5893220a..336d4cf42c 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -125,7 +125,7 @@ jobs: git clone https://github.com/intel/pti-gpu.git cd pti-gpu - git checkout 998e192cb39994a3e4790a86250ea0c480ae416c + git checkout 15a201d25e5659692613b98ee33513263b689101 cd sdk cmake --preset linux-icpx-release BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index be054d2f9c..a101677bf8 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -121,7 +121,7 @@ jobs: run: | git clone https://github.com/intel/pti-gpu.git cd pti-gpu - git checkout 998e192cb39994a3e4790a86250ea0c480ae416c + git checkout 15a201d25e5659692613b98ee33513263b689101 cd sdk cmake --preset linux-icpx-release BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release From 44e3dd32bec2f3b91e1a85ce5f14bb7985c80d25 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 Oct 2025 10:12:44 +0000 Subject: [PATCH 6/9] passing PTI libs dir via 'GITHUB_OUTPUT' Signed-off-by: Anatoly Myachev --- .github/workflows/build-test-reusable.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 336d4cf42c..823a97a2bc 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -83,6 +83,7 @@ jobs: shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" outputs: test-triton-command: ${{ steps.test-triton.outputs.command }} + pti_libs_dir: ${{ steps.build-proton.outputs.pti_libs_dir }} steps: - name: Print inputs run: | @@ -120,6 +121,7 @@ jobs: - name: Build Proton with XPU support if: inputs.driver_version == 'rolling' && inputs.device == 'max1100' + id: build-proton run: | echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV @@ -132,7 +134,7 @@ jobs: PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/" ls $PTI_LIBS_DIR - echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV + echo "pti_libs_dir=$PTI_LIBS_DIR" >> $GITHUB_OUTPUT - name: Build Triton uses: ./.github/actions/setup-triton @@ -299,6 +301,7 @@ jobs: - name: Run Proton tests if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100' run: | + export "PTI_LIBS_DIR=${{ needs.build.outputs.pti_libs_dir }}" export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR cd third_party/proton/test From a34823bae1b536b90962d7952a692b3593cddfb6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 Oct 2025 12:29:31 +0000 Subject: [PATCH 7/9] just move PTI build into 'integration-tests' job Signed-off-by: Anatoly Myachev --- .github/workflows/build-test-reusable.yml | 25 +++++++++-------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 823a97a2bc..35bfb47acb 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -83,7 +83,6 @@ jobs: shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" outputs: test-triton-command: ${{ steps.test-triton.outputs.command }} - pti_libs_dir: ${{ steps.build-proton.outputs.pti_libs_dir }} steps: - name: Print inputs run: | @@ -121,21 +120,9 @@ jobs: - name: Build Proton with XPU support if: inputs.driver_version == 'rolling' && inputs.device == 'max1100' - id: build-proton run: | echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV - git clone https://github.com/intel/pti-gpu.git - cd pti-gpu - git checkout 15a201d25e5659692613b98ee33513263b689101 - cd sdk - cmake --preset linux-icpx-release - BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release - - PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/" - ls $PTI_LIBS_DIR - echo "pti_libs_dir=$PTI_LIBS_DIR" >> $GITHUB_OUTPUT - - name: Build Triton uses: ./.github/actions/setup-triton with: @@ -298,10 +285,18 @@ jobs: run: | echo "TRITON_TEST_CMD=${{ needs.build.outputs.test-triton-command }}" | tee -a $GITHUB_ENV - - name: Run Proton tests + - name: Build PTI && Run Proton tests if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100' run: | - export "PTI_LIBS_DIR=${{ needs.build.outputs.pti_libs_dir }}" + git clone https://github.com/intel/pti-gpu.git + cd pti-gpu + git checkout 15a201d25e5659692613b98ee33513263b689101 + cd sdk + cmake --preset linux-icpx-release + BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release + + PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/" + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR cd third_party/proton/test From 9064ee986b063513454a9d554a65ec79ab7e8dfd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 Oct 2025 15:12:13 +0200 Subject: [PATCH 8/9] Apply suggestion from @anmyachev --- .github/workflows/build-test-reusable.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 35bfb47acb..b30835c27f 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -296,6 +296,7 @@ jobs: BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release PTI_LIBS_DIR="$(pwd)/build-linux-icpx-release/lib/" + cd ../.. export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH export TRITON_XPUPTI_LIB_PATH=$PTI_LIBS_DIR From fd37e3630b0ec574b1dfb388e5453d4754ab421c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 Oct 2025 14:41:50 +0000 Subject: [PATCH 9/9] cleanup && add pti.txt Signed-off-by: Anatoly Myachev --- .github/pins/pti.txt | 1 + .github/workflows/build-test-reusable.yml | 3 +- .github/workflows/triton-benchmarks.yml | 4 +-- python/tutorials/09-persistent-matmul.py | 6 ++-- .../lib/Profiler/Xpupti/XpuptiProfiler.cpp | 34 ------------------- 5 files changed, 8 insertions(+), 40 deletions(-) create mode 100644 .github/pins/pti.txt diff --git a/.github/pins/pti.txt b/.github/pins/pti.txt new file mode 100644 index 0000000000..ca98925e9d --- /dev/null +++ b/.github/pins/pti.txt @@ -0,0 +1 @@ +15a201d25e5659692613b98ee33513263b689101 diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index b30835c27f..7fe0ddaa6e 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -288,9 +288,10 @@ jobs: - name: Build PTI && Run Proton tests if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100' run: | + PTI_COMMIT_ID="$(<.github/pins/pti.txt)" git clone https://github.com/intel/pti-gpu.git cd pti-gpu - git checkout 15a201d25e5659692613b98ee33513263b689101 + git checkout $PTI_COMMIT_ID cd sdk cmake --preset linux-icpx-release BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index a101677bf8..f29d6ad4e9 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -117,11 +117,11 @@ jobs: pip install . - name: Build PTI from source - id: build-pti run: | + PTI_COMMIT_ID="$(<.github/pins/pti.txt)" git clone https://github.com/intel/pti-gpu.git cd pti-gpu - git checkout 15a201d25e5659692613b98ee33513263b689101 + git checkout $PTI_COMMIT_ID cd sdk cmake --preset linux-icpx-release BUILD_TESTING=1 PTI_BUILD_SAMPLES=1 cmake --build --preset linux-icpx-release diff --git a/python/tutorials/09-persistent-matmul.py b/python/tutorials/09-persistent-matmul.py index 93cf85241f..f354769615 100644 --- a/python/tutorials/09-persistent-matmul.py +++ b/python/tutorials/09-persistent-matmul.py @@ -673,7 +673,7 @@ def bench_fn(label, reps, warmup_reps, fn, *args): for _ in range(warmup_reps): fn(*args) #FIXME: Enable for XPU once proton support works. - if True or is_cuda(): + if is_cuda(): with proton_context(): for _ in range(reps): fn(*args) @@ -783,11 +783,11 @@ def show_profile(precision, profile_name): validate(32, 32, 32, dtype) validate(8192, 8192, args.K_range[0], dtype) - if True or is_cuda(): + if is_cuda(): proton.start("matmul", hook="triton") proton.deactivate() for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step): bench(K, dtype) - if True or is_cuda(): + if is_cuda(): proton.finalize() show_profile(args.prec, "matmul") diff --git a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp index 749794d3d6..e8651323dd 100644 --- a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp @@ -264,8 +264,6 @@ struct XpuptiProfiler::XpuptiProfilerPimpl static void allocBuffer(uint8_t **buffer, size_t *bufferSize); static void completeBuffer(uint8_t *buffer, size_t size, size_t validSize); - // static void callbackFn(void *userData, CUpti_CallbackDomain domain, - // CUpti_CallbackId cbId, const void *cbData); static void callbackFn(pti_callback_domain domain, pti_api_group_id driver_api_group_id, uint32_t driver_api_id, @@ -354,32 +352,6 @@ void XpuptiProfiler::XpuptiProfilerPimpl::callbackFn( } else { throw std::runtime_error("[PROTON] callbackFn failed"); } - - /* switch (domain) { - case PTI_DOMAIN_DRIVER_GPU_OPERATION_APPEND: - CallbackGPUOperationAppend(domain, driver_api_group_id, driver_api_id, - backend_context, cb_data, user_data); - break; - case PTI_DOMAIN_GPU_OPERATION_COMPLETED: - CallbackGPUOperationCompletion(domain, driver_api_group_id, driver_api_id, - backend_context, cb_data, user_data); - break; - default: { - std::cout << "In " << __func__ << ", domain: " << domain - << ", driver_group_id: " << driver_api_group_id - << ", driver_api_id: " << driver_api_id << std::endl; - - const char* name_ptr = nullptr; - - if (PTI_SUCCESS == ptiViewGetApiIdName(driver_api_group_id, driver_api_id, - &name_ptr) ) { std::cout << ", API name: " << name_ptr << std::endl; } else { - std::cout << ", Unknown API name" << std::endl; - } - break; - } - } - std::cout << std::endl; - */ } void CallbackCommon(pti_callback_domain domain, @@ -458,7 +430,6 @@ int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) { } void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { - std::cout << "doStart\n" << std::flush; // should be call to shared lib XpuptiProfiler &profiler = threadState.profiler; if (profiler.utils_cache_path != "") { @@ -501,7 +472,6 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_KERNEL); xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL); xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY); - std::cout << "doStart2\n" << std::flush; xpupti::subscribe(&subscriber, callbackFn, &subscriber); // xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY); // xpupti::viewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL); @@ -511,10 +481,8 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { // xpupti::viewEnable(PTI_VIEW_LEVEL_ZERO_CALLS); // setGraphCallbacks(subscriber, /*enable=*/true); // setRuntimeCallbacks(subscriber, /*enable=*/true); - std::cout << "doStart::enableDomain\n" << std::flush; xpupti::enableDomain(subscriber, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1); - std::cout << "doStart::enableDomain after\n" << std::flush; // setDriverCallbacks(subscriber, /*enable=*/true); } @@ -551,10 +519,8 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStop() { // setGraphCallbacks(subscriber, /*enable=*/false); // setRuntimeCallbacks(subscriber, /*enable=*/false); // setDriverCallbacks(subscriber, /*enable=*/false); - std::cout << "doStop::disableDomain\n" << std::flush; xpupti::disableDomain(subscriber, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED); - std::cout << "doStop::disableDomain after\n" << std::flush; xpupti::unsubscribe(subscriber); // cupti::finalize(); }