88
99#include < level_zero/ze_api.h>
1010
11+ #include < atomic>
1112#include < cstdlib>
1213#include < iostream>
1314#include < sycl/sycl.hpp>
1617#include " samples_utils.h"
1718
1819/* *
19- * This file implements a sample collector tool that:
20- * - Has two subscribers to PTI Callback Domains related to appending and dispatching GPU
21- * operations.
22- *
23- * - The subscribers trace the workload and report its progress from the callbacks.
24- *
25- * - Of the two subscribers, the second one subscribes only to operation completion.
26- * This demonstrates that multiple subscribers can co-exist.
27- *
28- * - The first subscriber implements measurement of the first GPU kernel duration in GPU cycles.
29- * It does this by appending commands to write global timestamps before and after the kernel,
30- * and reading these timestamps upon kernel completion.
31- *
32- * For simplicity, there is no code for multi-thread synchronization, as well as other tool
33- * aspects are ommitted. The sample workload uses a single thread.
34- * If it is modified to run multiple threads, the synchronization is the first thing
35- * that should be added to the tool.
20+ * This file demonstrates usage of a PTI Callback Subscriber and
21+ * External Correlation called from within Append callbacks.
22+ * The sample workload uses a single thread.
3623 */
3724
38- // / Tool resources
39-
4025namespace {
4126
42- pti_callback_subscriber_handle subscriber1 = nullptr ;
43- pti_callback_subscriber_handle subscriber2 = nullptr ;
44-
45- ze_event_handle_t global_time_stamp_start_event = nullptr ;
46- ze_event_handle_t global_time_stamp_end_event = nullptr ;
47- void *buff_start = nullptr ;
48- void *buff_end = nullptr ;
27+ pti_callback_subscriber_handle subscriber = nullptr ;
4928
50- uint64_t g_profiled_kernel_id = 0 ;
29+ std::atomic< uint64_t > external_correlation_id = 0 ;
5130
5231} // namespace
5332
54- // / Forward declaration of tool functions required for PTI initialization
55-
33+ //
34+ // Forward declaration of tool functions required for PTI initialization
35+ //
5636void CallbackCommon (pti_callback_domain domain, pti_api_group_id driver_group_id,
5737 uint32_t driver_api_id, pti_backend_ctx_t backend_context, void *cb_data,
5838 void *user_data, void **instance_user_data);
@@ -61,151 +41,46 @@ void ProvideBuffer(unsigned char **buf, std::size_t *buf_size);
6141
6242void ParseBuffer (unsigned char *buf, std::size_t buf_size, std::size_t valid_buf_size);
6343
64- // / Start and Stop profiling
65-
44+ //
45+ // Start and Stop profiling
46+ //
6647void StartProfiling () {
67- // At the moment when subscribe for ptiCallback-s - enable at least one ptiView
48+ // When subscribing to ptiCallback-s,
49+ // at least one ptiView for GPU operations needs to be enabled
6850 PTI_CHECK_SUCCESS (ptiViewSetCallbacks (ProvideBuffer, ParseBuffer));
6951 PTI_CHECK_SUCCESS (ptiViewEnable (PTI_VIEW_DEVICE_GPU_KERNEL));
7052
71- // Initializing two pti Subscribers and setting for both of them the same callback function
72- // Note, that as user data we pass to each subscriber its own address
73- PTI_CHECK_SUCCESS (ptiCallbackSubscribe (&subscriber1, CallbackCommon, &subscriber1));
74- std::cout << " Initialized Subscriber: " << subscriber1 << std::endl;
75- PTI_CHECK_SUCCESS (ptiCallbackSubscribe (&subscriber2, CallbackCommon, &subscriber2));
76- std::cout << " Initialized Subscriber: " << subscriber2 << std::endl;
53+ // Demonstrating here how to use External Correlation in Subscriber Callbacks
54+ PTI_CHECK_SUCCESS (ptiViewEnable (PTI_VIEW_DRIVER_API));
55+ PTI_CHECK_SUCCESS (ptiViewEnableDriverApiClass (1 , pti_api_class::PTI_API_CLASS_GPU_OPERATION_CORE,
56+ pti_api_group_id::PTI_API_GROUP_LEVELZERO));
57+ PTI_CHECK_SUCCESS (ptiViewEnable (PTI_VIEW_EXTERNAL_CORRELATION));
7758
78- // Enabling for each subscriber the domains of interest
79- // Subscriber1 will get notifications about GPU Operation Appended and Completed
80- PTI_CHECK_SUCCESS (
81- ptiCallbackEnableDomain (subscriber1, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1 , 1 ));
59+ // Initializing Subscriber and setting the callback function
60+ // As user data, we pass the subscriber the address of its own handle
61+ PTI_CHECK_SUCCESS (ptiCallbackSubscribe (&subscriber, CallbackCommon, &subscriber));
62+ std::cout << " Initialized Subscriber: " << subscriber << std::endl;
63+
64+ // Enabling the domains of interest for the subscriber
8265 PTI_CHECK_SUCCESS (
83- ptiCallbackEnableDomain (subscriber1, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1 , 1 ));
84- // Subscriber2 will get notifications only GPU Operation Completed only
66+ ptiCallbackEnableDomain (subscriber, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1 , 1 ));
8567 PTI_CHECK_SUCCESS (
86- ptiCallbackEnableDomain (subscriber2 , PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1 , 1 ));
68+ ptiCallbackEnableDomain (subscriber , PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1 , 1 ));
8769}
8870
8971void StopProfiling () {
90- PTI_CHECK_SUCCESS (ptiCallbackUnsubscribe (subscriber1));
91- PTI_CHECK_SUCCESS (ptiCallbackUnsubscribe (subscriber2));
72+ PTI_CHECK_SUCCESS (ptiCallbackUnsubscribe (subscriber));
9273
74+ PTI_CHECK_SUCCESS (ptiViewDisable (PTI_VIEW_DRIVER_API));
9375 PTI_CHECK_SUCCESS (ptiViewDisable (PTI_VIEW_DEVICE_GPU_KERNEL));
76+ PTI_CHECK_SUCCESS (ptiViewDisable (PTI_VIEW_EXTERNAL_CORRELATION));
9477
9578 PTI_CHECK_SUCCESS (ptiFlushAllViews ());
9679}
9780
98- // / Tool working functions
99-
100- bool IsToolResourcesInitialized () {
101- return global_time_stamp_start_event != nullptr && global_time_stamp_end_event != nullptr &&
102- buff_start != nullptr && buff_end != nullptr ;
103- }
104-
105- /* *
106- * @brief Initialization is lazy and happens only once
107- */
108- bool InitToolResources (ze_context_handle_t context, ze_device_handle_t device) {
109- static bool ready = false ;
110- ze_event_pool_handle_t event_pool = nullptr ;
111-
112- if (!ready) {
113- // Pool with 2 events - to be appended them (only) at the first kernel append callback pair
114- ze_event_pool_desc_t event_pool_desc = {.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC,
115- .pNext = nullptr ,
116- .flags = ZE_EVENT_POOL_FLAG_IPC |
117- ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP |
118- ZE_EVENT_POOL_FLAG_HOST_VISIBLE,
119- .count = 2 };
120-
121- auto status = zeEventPoolCreate (context, &event_pool_desc, 1 , &device, &event_pool);
122- if (status != ZE_RESULT_SUCCESS) {
123- std::cerr << " zeEventPoolCreate failed with error code: " << status << ' \n ' ;
124- return false ;
125- }
126-
127- // Memory buffers where to write timestamps
128- ze_device_mem_alloc_desc_t device_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr , 0 ,
129- 0 };
130- ze_host_mem_alloc_desc_t host_desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr , 0 };
131-
132- status = zeMemAllocShared (context, &device_desc, &host_desc, 8 , 64 , device, &buff_start);
133- if (status != ZE_RESULT_SUCCESS) {
134- std::cerr << " zeMemAllocShared failed with error code: " << status << ' \n ' ;
135- return false ;
136- }
137- status = zeMemAllocShared (context, &device_desc, &host_desc, 8 , 64 , device, &buff_end);
138- if (status != ZE_RESULT_SUCCESS) {
139- std::cerr << " zeMemAllocShared failed with error code: " << status << ' \n ' ;
140- return false ;
141- }
142-
143- // Create two events from the pool. They would signal that timestamps written
144- ze_event_desc_t event_desc = {.stype = ZE_STRUCTURE_TYPE_EVENT_DESC,
145- .pNext = nullptr ,
146- .index = 0 ,
147- .signal = ZE_EVENT_SCOPE_FLAG_HOST, // Event is signaled on host
148- .wait = ZE_EVENT_SCOPE_FLAG_HOST}; // Event is waited on host
149- status = zeEventCreate (event_pool, &event_desc, &global_time_stamp_start_event);
150- if (status != ZE_RESULT_SUCCESS) {
151- std::cerr << " zeEventCreate failed with error code: " << status << ' \n ' ;
152- return false ;
153- }
154- event_desc.index = 1 ;
155- status = zeEventCreate (event_pool, &event_desc, &global_time_stamp_end_event);
156- if (status != ZE_RESULT_SUCCESS) {
157- std::cerr << " zeEventCreate failed with error code: " << status << ' \n ' ;
158- return false ;
159- }
160- ready = true ;
161- return true ;
162- }
163- return false ;
164- }
165-
166- void GPUKernelAppendOnEnter (ze_context_handle_t context, ze_device_handle_t device,
167- ze_command_list_handle_t command_list, bool &write_appended) {
168- write_appended = false ;
169- std::cout << " Initialize tool resources to Write Global timestamps around the kernel"
170- << std::endl;
171- // Lazy initialization, returns true only once
172- auto res = InitToolResources (context, device);
173- if (res) {
174- std::cout << " -----> Appending Write Global timestamp before the kernel" << std::endl;
175- res = zeCommandListAppendWriteGlobalTimestamp (command_list, static_cast <uint64_t *>(buff_start),
176- global_time_stamp_start_event, 0 , nullptr );
177-
178- if (res == ZE_RESULT_SUCCESS) {
179- std::cout << " Appended Write Global timestamp" << std::endl;
180- write_appended = true ;
181- } else {
182- std::cout << " zeCommandListAppendWriteGlobalTimestamp failed with error code: " << res
183- << std::endl;
184- }
185- } else {
186- std::cout << " Data not (re-)initialized" << std::endl;
187- }
188- }
189-
190- void GPUKernelAppendOnExit (ze_command_list_handle_t command_list, bool &write_appended) {
191- write_appended = false ;
192- if (!IsToolResourcesInitialized ()) {
193- std::cout << " Tool resources not initialized. Cannot append write timestamp after the kernel"
194- << std::endl;
195- return ;
196- }
197- std::cout << " <----- Appending Write Global timestamp after the kernel" << std::endl;
198- auto res = zeCommandListAppendWriteGlobalTimestamp (
199- command_list, static_cast <uint64_t *>(buff_end), global_time_stamp_end_event, 0 , nullptr );
200- if (res == ZE_RESULT_SUCCESS) {
201- std::cout << " Appended Write Global timestamp" << std::endl;
202- write_appended = true ;
203- } else {
204- std::cout << " zeCommandListAppendWriteGlobalTimestamp failed with error code: " << res
205- << std::endl;
206- }
207- }
208-
81+ //
82+ // Functions used in Callbacks
83+ //
20984void CallbackGPUOperationAppend ([[maybe_unused]] pti_callback_domain domain,
21085 pti_api_group_id driver_group_id, uint32_t driver_api_id,
21186 [[maybe_unused]] pti_backend_ctx_t backend_context, void *cb_data,
@@ -229,49 +104,29 @@ void CallbackGPUOperationAppend([[maybe_unused]] pti_callback_domain domain,
229104
230105 if (gpu_op_data->_operation_count != 1 ) {
231106 std::cout << " WARNING: Operation count is not 1, it is: " << gpu_op_data->_operation_count
232- << " . Unexpected for this sample! Will not proceed with appending "
233- << " Global Timestamp write " << std::endl;
107+ << " . Unexpected for this sample! Will not proceed with Push/Pop "
108+ << " of External Correlation " << std::endl;
234109 return ;
235110 }
236111
237- pti_gpu_op_details *gpu_op_details =
238- static_cast <pti_gpu_op_details *>(gpu_op_data->_operation_details );
239-
240- bool is_op_kernel = (gpu_op_details->_operation_kind == PTI_GPU_OPERATION_KIND_KERNEL);
241-
242112 if (gpu_op_data->_phase == PTI_CB_PHASE_API_ENTER) {
243113 *instance_user_data = static_cast <void *>(nullptr );
244- if (is_op_kernel) {
245- bool write_appended = false ;
246- GPUKernelAppendOnEnter (static_cast <ze_context_handle_t >(backend_context),
247- static_cast <ze_device_handle_t >(gpu_op_data->_device_handle ),
248- static_cast <ze_command_list_handle_t >(gpu_op_data->_cmd_list_handle ),
249- write_appended);
250- if (write_appended) {
251- // remember operation_id which duration is being profiled
252- g_profiled_kernel_id = gpu_op_details->_operation_id ;
253- // Store End event - to indicate for the pair EXIT Callback that
254- // it need to append another Write Global Timestamp
255- *instance_user_data = static_cast <void *>(&global_time_stamp_end_event);
256- }
257- }
114+ external_correlation_id.fetch_add (1 );
115+ auto result = ptiViewPushExternalCorrelationId (
116+ pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_0, external_correlation_id);
117+ std::cout << " Pushing External Correlation Id: " << external_correlation_id
118+ << " , Result: " << result << std::endl;
258119 } else if (gpu_op_data->_phase == PTI_CB_PHASE_API_EXIT) {
259- if (is_op_kernel && (*instance_user_data != nullptr )) {
260- bool write_appended = false ;
261- GPUKernelAppendOnExit ( static_cast < ze_command_list_handle_t >(gpu_op_data-> _cmd_list_handle ),
262- write_appended);
263- }
120+ uint64_t local_external_correlation_id = 0ULL ;
121+ auto result = ptiViewPopExternalCorrelationId (
122+ pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_0, &local_external_correlation_id);
123+ std::cout << " Popped External Correlation Id: " << local_external_correlation_id
124+ << " , Result: " << result << std::endl;
264125 } else {
265126 std::cout << " Unexpected phase: " << gpu_op_data->_phase << std::endl;
266127 }
267128}
268129
269- uint64_t ReadTimestamp (void *buff) {
270- // Copy buffer from device to host
271- uint64_t timestamp = *static_cast <uint64_t *>(buff);
272- return timestamp;
273- }
274-
275130void CallbackGPUOperationCompletion ([[maybe_unused]] pti_callback_domain domain,
276131 pti_api_group_id driver_group_id, uint32_t driver_api_id,
277132 [[maybe_unused]] pti_backend_ctx_t backend_context,
@@ -283,45 +138,6 @@ void CallbackGPUOperationCompletion([[maybe_unused]] pti_callback_domain domain,
283138
284139 samples_utils::DumpCallbackData (domain, driver_group_id, driver_api_id, backend_context, cb_data,
285140 user_data, instance_user_data);
286-
287- if (IsToolResourcesInitialized ()) {
288- // Checking the events status first - prior reading timestamps
289- auto result1 = zeEventQueryStatus (global_time_stamp_start_event);
290- auto result2 = zeEventQueryStatus (global_time_stamp_end_event);
291- if (result1 == ZE_RESULT_SUCCESS && result2 == ZE_RESULT_SUCCESS) {
292- std::cout << " Writes of Global Time Stamp signaled." << std::endl;
293- // Reading timestamps
294- auto start_time_stamp = ReadTimestamp (buff_start);
295- auto end_time_stamp = ReadTimestamp (buff_end);
296- std::cout << " Kernel with _kernel_id: " << g_profiled_kernel_id
297- << " , start TS: " << start_time_stamp << " , end TS: " << end_time_stamp
298- << " , duration: " << (end_time_stamp - start_time_stamp) << " cycles \n " ;
299-
300- // Resetting the events and will not use them anymore
301- result1 = zeEventHostReset (global_time_stamp_start_event);
302- if (result1 != ZE_RESULT_SUCCESS) {
303- std::cout << " zeEventHostReset for Start event failed with error code: " << result1
304- << std::endl;
305- }
306- result2 = zeEventHostReset (global_time_stamp_end_event);
307- if (result2 != ZE_RESULT_SUCCESS) {
308- std::cout << " zeEventHostReset for End event failed with error code: " << result2
309- << std::endl;
310- }
311-
312- } else if (result1 == ZE_RESULT_SUCCESS && result2 == ZE_RESULT_NOT_READY) {
313- std::cout << " Global Timestamp End event is NOT READY." ;
314- } else if (result1 == ZE_RESULT_NOT_READY && result2 == ZE_RESULT_NOT_READY) {
315- std::cout << " Global Timestamp Start and End event are NOT READY."
316- " It could be that they already processed and reset. "
317- << std::endl;
318- } else {
319- std::cout << " zeEventQueryStatus for Start event failed with error code: " << result1
320- << std::endl;
321- std::cout << " zeEventQueryStatus for End event failed with error code: " << result2
322- << std::endl;
323- }
324- }
325141}
326142
327143void CallbackCommon (pti_callback_domain domain, pti_api_group_id driver_group_id,
@@ -346,6 +162,9 @@ void CallbackCommon(pti_callback_domain domain, pti_api_group_id driver_group_id
346162 std::cout << std::endl;
347163}
348164
165+ //
166+ // PTI Reports Buffer functions
167+ //
349168void ProvideBuffer (unsigned char **buf, std::size_t *buf_size) {
350169 constexpr auto kRequestedRecordCount = 5'000'000ULL ;
351170 constexpr auto kRequestedBufferSize = kRequestedRecordCount * sizeof (pti_view_record_kernel);
@@ -383,15 +202,33 @@ void ParseBuffer(unsigned char *buf, std::size_t buf_size, std::size_t valid_buf
383202 break ;
384203 }
385204 case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
386- pti_view_record_kernel *rec = reinterpret_cast <pti_view_record_kernel *>(ptr);
205+ auto *rec = reinterpret_cast <pti_view_record_kernel *>(ptr);
387206 std::cout << " ---------------------------------------------------"
388- " -----------------------------"
389- << ' \n ' ;
207+ " -----------------------------\n " ;
390208 std::cout << " Found Kernel Record" << ' \n ' ;
391209 samples_utils::DumpRecord (rec);
392210 std::cout << " ---------------------------------------------------"
393- " -----------------------------"
394- << ' \n ' ;
211+ " -----------------------------\n " ;
212+ break ;
213+ }
214+ case pti_view_kind::PTI_VIEW_DRIVER_API: {
215+ std::cout << " ---------------------------------------------------"
216+ " -----------------------------\n " ;
217+ std::cout << " Found Driver API Record" << ' \n ' ;
218+ auto *rec = reinterpret_cast <pti_view_record_api *>(ptr);
219+ samples_utils::DumpRecord (rec);
220+ std::cout << " ---------------------------------------------------"
221+ " -----------------------------\n " ;
222+ break ;
223+ }
224+ case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
225+ std::cout << " ---------------------------------------------------"
226+ " -----------------------------\n " ;
227+ std::cout << " Found External Correlation Record" << ' \n ' ;
228+ auto *rec = reinterpret_cast <pti_view_record_external_correlation *>(ptr);
229+ samples_utils::DumpRecord (rec);
230+ std::cout << " ---------------------------------------------------"
231+ " -----------------------------\n " ;
395232 break ;
396233 }
397234 default : {
0 commit comments