Skip to content

Commit fe0a858

Browse files
authored
[PTI-LIB] Make External correlation work from within Subscriber callbacks (#719)
* [PTI-LIB] Make external correlation work from Callbacks of subscribers - add new callback test, re-factored existing ones - change callback sample to call External correlation from Callbacks Signed-off-by: jfedorov <julia.fedorova@intel.com>
1 parent e901f77 commit fe0a858

File tree

6 files changed

+689
-497
lines changed

6 files changed

+689
-497
lines changed

sdk/samples/callback/client.cc

Lines changed: 71 additions & 234 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include <level_zero/ze_api.h>
1010

11+
#include <atomic>
1112
#include <cstdlib>
1213
#include <iostream>
1314
#include <sycl/sycl.hpp>
@@ -16,43 +17,22 @@
1617
#include "samples_utils.h"
1718

1819
/**
19-
* This file implements a sample collector tool that:
20-
* - Has two subscribers to PTI Callback Domains related to appending and dispatching GPU
21-
* operations.
22-
*
23-
* - The subscribers trace the workload and report its progress from the callbacks.
24-
*
25-
* - Of the two subscribers, the second one subscribes only to operation completion.
26-
* This demonstrates that multiple subscribers can co-exist.
27-
*
28-
* - The first subscriber implements measurement of the first GPU kernel duration in GPU cycles.
29-
* It does this by appending commands to write global timestamps before and after the kernel,
30-
* and reading these timestamps upon kernel completion.
31-
*
32-
* For simplicity, there is no code for multi-thread synchronization, as well as other tool
33-
* aspects are ommitted. The sample workload uses a single thread.
34-
* If it is modified to run multiple threads, the synchronization is the first thing
35-
* that should be added to the tool.
20+
* This file demonstrates usage of the PTI Callback Subscriber and
21+
* External Correlation called from within Append callbacks.
22+
* The sample workload uses a single thread.
3623
*/
3724

38-
/// Tool resources
39-
4025
namespace {
4126

42-
pti_callback_subscriber_handle subscriber1 = nullptr;
43-
pti_callback_subscriber_handle subscriber2 = nullptr;
44-
45-
ze_event_handle_t global_time_stamp_start_event = nullptr;
46-
ze_event_handle_t global_time_stamp_end_event = nullptr;
47-
void *buff_start = nullptr;
48-
void *buff_end = nullptr;
27+
pti_callback_subscriber_handle subscriber = nullptr;
4928

50-
uint64_t g_profiled_kernel_id = 0;
29+
std::atomic<uint64_t> external_correlation_id = 0;
5130

5231
} // namespace
5332

54-
/// Forward declaration of tool functions required for PTI initialization
55-
33+
//
34+
// Forward declaration of tool functions required for PTI initialization
35+
//
5636
void CallbackCommon(pti_callback_domain domain, pti_api_group_id driver_group_id,
5737
uint32_t driver_api_id, pti_backend_ctx_t backend_context, void *cb_data,
5838
void *user_data, void **instance_user_data);
@@ -61,151 +41,46 @@ void ProvideBuffer(unsigned char **buf, std::size_t *buf_size);
6141

6242
void ParseBuffer(unsigned char *buf, std::size_t buf_size, std::size_t valid_buf_size);
6343

64-
/// Start and Stop profiling
65-
44+
//
45+
// Start and Stop profiling
46+
//
6647
void StartProfiling() {
67-
// At the moment when subscribe for ptiCallback-s - enable at least one ptiView
48+
// Currently, when subscribing to ptiCallback-s,
49+
// at least one ptiView for GPU operations must be enabled
6850
PTI_CHECK_SUCCESS(ptiViewSetCallbacks(ProvideBuffer, ParseBuffer));
6951
PTI_CHECK_SUCCESS(ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL));
7052

71-
// Initializing two pti Subscribers and setting for both of them the same callback function
72-
// Note, that as user data we pass to each subscriber its own address
73-
PTI_CHECK_SUCCESS(ptiCallbackSubscribe(&subscriber1, CallbackCommon, &subscriber1));
74-
std::cout << "Initialized Subscriber: " << subscriber1 << std::endl;
75-
PTI_CHECK_SUCCESS(ptiCallbackSubscribe(&subscriber2, CallbackCommon, &subscriber2));
76-
std::cout << "Initialized Subscriber: " << subscriber2 << std::endl;
53+
// Demonstrating here how to use External Correlation in Subscriber Callbacks
54+
PTI_CHECK_SUCCESS(ptiViewEnable(PTI_VIEW_DRIVER_API));
55+
PTI_CHECK_SUCCESS(ptiViewEnableDriverApiClass(1, pti_api_class::PTI_API_CLASS_GPU_OPERATION_CORE,
56+
pti_api_group_id::PTI_API_GROUP_LEVELZERO));
57+
PTI_CHECK_SUCCESS(ptiViewEnable(PTI_VIEW_EXTERNAL_CORRELATION));
7758

78-
// Enabling for each subscriber the domains of interest
79-
// Subscriber1 will get notifications about GPU Operation Appended and Completed
80-
PTI_CHECK_SUCCESS(
81-
ptiCallbackEnableDomain(subscriber1, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1));
59+
// Initializing Subscriber and setting the callback function
60+
// As user data, we pass the subscriber the address of its own handle
61+
PTI_CHECK_SUCCESS(ptiCallbackSubscribe(&subscriber, CallbackCommon, &subscriber));
62+
std::cout << "Initialized Subscriber: " << subscriber << std::endl;
63+
64+
// Enabling the domains of interest for the subscriber
8265
PTI_CHECK_SUCCESS(
83-
ptiCallbackEnableDomain(subscriber1, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1, 1));
84-
// Subscriber2 will get notifications only GPU Operation Completed only
66+
ptiCallbackEnableDomain(subscriber, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_APPENDED, 1, 1));
8567
PTI_CHECK_SUCCESS(
86-
ptiCallbackEnableDomain(subscriber2, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1, 1));
68+
ptiCallbackEnableDomain(subscriber, PTI_CB_DOMAIN_DRIVER_GPU_OPERATION_COMPLETED, 1, 1));
8769
}
8870

8971
void StopProfiling() {
90-
PTI_CHECK_SUCCESS(ptiCallbackUnsubscribe(subscriber1));
91-
PTI_CHECK_SUCCESS(ptiCallbackUnsubscribe(subscriber2));
72+
PTI_CHECK_SUCCESS(ptiCallbackUnsubscribe(subscriber));
9273

74+
PTI_CHECK_SUCCESS(ptiViewDisable(PTI_VIEW_DRIVER_API));
9375
PTI_CHECK_SUCCESS(ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL));
76+
PTI_CHECK_SUCCESS(ptiViewDisable(PTI_VIEW_EXTERNAL_CORRELATION));
9477

9578
PTI_CHECK_SUCCESS(ptiFlushAllViews());
9679
}
9780

98-
/// Tool working functions
99-
100-
bool IsToolResourcesInitialized() {
101-
return global_time_stamp_start_event != nullptr && global_time_stamp_end_event != nullptr &&
102-
buff_start != nullptr && buff_end != nullptr;
103-
}
104-
105-
/**
106-
* @brief Initialization is lazy and happens only once
107-
*/
108-
bool InitToolResources(ze_context_handle_t context, ze_device_handle_t device) {
109-
static bool ready = false;
110-
ze_event_pool_handle_t event_pool = nullptr;
111-
112-
if (!ready) {
113-
// Pool with 2 events - to be appended them (only) at the first kernel append callback pair
114-
ze_event_pool_desc_t event_pool_desc = {.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC,
115-
.pNext = nullptr,
116-
.flags = ZE_EVENT_POOL_FLAG_IPC |
117-
ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP |
118-
ZE_EVENT_POOL_FLAG_HOST_VISIBLE,
119-
.count = 2};
120-
121-
auto status = zeEventPoolCreate(context, &event_pool_desc, 1, &device, &event_pool);
122-
if (status != ZE_RESULT_SUCCESS) {
123-
std::cerr << "zeEventPoolCreate failed with error code: " << status << '\n';
124-
return false;
125-
}
126-
127-
// Memory buffers where to write timestamps
128-
ze_device_mem_alloc_desc_t device_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0,
129-
0};
130-
ze_host_mem_alloc_desc_t host_desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, 0};
131-
132-
status = zeMemAllocShared(context, &device_desc, &host_desc, 8, 64, device, &buff_start);
133-
if (status != ZE_RESULT_SUCCESS) {
134-
std::cerr << "zeMemAllocShared failed with error code: " << status << '\n';
135-
return false;
136-
}
137-
status = zeMemAllocShared(context, &device_desc, &host_desc, 8, 64, device, &buff_end);
138-
if (status != ZE_RESULT_SUCCESS) {
139-
std::cerr << "zeMemAllocShared failed with error code: " << status << '\n';
140-
return false;
141-
}
142-
143-
// Create two events from the pool. They would signal that timestamps written
144-
ze_event_desc_t event_desc = {.stype = ZE_STRUCTURE_TYPE_EVENT_DESC,
145-
.pNext = nullptr,
146-
.index = 0,
147-
.signal = ZE_EVENT_SCOPE_FLAG_HOST, // Event is signaled on host
148-
.wait = ZE_EVENT_SCOPE_FLAG_HOST}; // Event is waited on host
149-
status = zeEventCreate(event_pool, &event_desc, &global_time_stamp_start_event);
150-
if (status != ZE_RESULT_SUCCESS) {
151-
std::cerr << "zeEventCreate failed with error code: " << status << '\n';
152-
return false;
153-
}
154-
event_desc.index = 1;
155-
status = zeEventCreate(event_pool, &event_desc, &global_time_stamp_end_event);
156-
if (status != ZE_RESULT_SUCCESS) {
157-
std::cerr << "zeEventCreate failed with error code: " << status << '\n';
158-
return false;
159-
}
160-
ready = true;
161-
return true;
162-
}
163-
return false;
164-
}
165-
166-
void GPUKernelAppendOnEnter(ze_context_handle_t context, ze_device_handle_t device,
167-
ze_command_list_handle_t command_list, bool &write_appended) {
168-
write_appended = false;
169-
std::cout << "Initialize tool resources to Write Global timestamps around the kernel"
170-
<< std::endl;
171-
// Lazy initialization, returns true only once
172-
auto res = InitToolResources(context, device);
173-
if (res) {
174-
std::cout << " -----> Appending Write Global timestamp before the kernel" << std::endl;
175-
res = zeCommandListAppendWriteGlobalTimestamp(command_list, static_cast<uint64_t *>(buff_start),
176-
global_time_stamp_start_event, 0, nullptr);
177-
178-
if (res == ZE_RESULT_SUCCESS) {
179-
std::cout << " Appended Write Global timestamp" << std::endl;
180-
write_appended = true;
181-
} else {
182-
std::cout << "zeCommandListAppendWriteGlobalTimestamp failed with error code: " << res
183-
<< std::endl;
184-
}
185-
} else {
186-
std::cout << "Data not (re-)initialized" << std::endl;
187-
}
188-
}
189-
190-
void GPUKernelAppendOnExit(ze_command_list_handle_t command_list, bool &write_appended) {
191-
write_appended = false;
192-
if (!IsToolResourcesInitialized()) {
193-
std::cout << "Tool resources not initialized. Cannot append write timestamp after the kernel"
194-
<< std::endl;
195-
return;
196-
}
197-
std::cout << " <----- Appending Write Global timestamp after the kernel" << std::endl;
198-
auto res = zeCommandListAppendWriteGlobalTimestamp(
199-
command_list, static_cast<uint64_t *>(buff_end), global_time_stamp_end_event, 0, nullptr);
200-
if (res == ZE_RESULT_SUCCESS) {
201-
std::cout << " Appended Write Global timestamp" << std::endl;
202-
write_appended = true;
203-
} else {
204-
std::cout << "zeCommandListAppendWriteGlobalTimestamp failed with error code: " << res
205-
<< std::endl;
206-
}
207-
}
208-
81+
//
82+
// Functions used in Callbacks
83+
//
20984
void CallbackGPUOperationAppend([[maybe_unused]] pti_callback_domain domain,
21085
pti_api_group_id driver_group_id, uint32_t driver_api_id,
21186
[[maybe_unused]] pti_backend_ctx_t backend_context, void *cb_data,
@@ -229,49 +104,29 @@ void CallbackGPUOperationAppend([[maybe_unused]] pti_callback_domain domain,
229104

230105
if (gpu_op_data->_operation_count != 1) {
231106
std::cout << "WARNING: Operation count is not 1, it is: " << gpu_op_data->_operation_count
232-
<< " . Unexpected for this sample! Will not proceed with appending "
233-
<< "Global Timestamp write" << std::endl;
107+
<< " . Unexpected for this sample! Will not proceed with Push/Pop "
108+
<< "of External Correlation " << std::endl;
234109
return;
235110
}
236111

237-
pti_gpu_op_details *gpu_op_details =
238-
static_cast<pti_gpu_op_details *>(gpu_op_data->_operation_details);
239-
240-
bool is_op_kernel = (gpu_op_details->_operation_kind == PTI_GPU_OPERATION_KIND_KERNEL);
241-
242112
if (gpu_op_data->_phase == PTI_CB_PHASE_API_ENTER) {
243113
*instance_user_data = static_cast<void *>(nullptr);
244-
if (is_op_kernel) {
245-
bool write_appended = false;
246-
GPUKernelAppendOnEnter(static_cast<ze_context_handle_t>(backend_context),
247-
static_cast<ze_device_handle_t>(gpu_op_data->_device_handle),
248-
static_cast<ze_command_list_handle_t>(gpu_op_data->_cmd_list_handle),
249-
write_appended);
250-
if (write_appended) {
251-
// remember operation_id which duration is being profiled
252-
g_profiled_kernel_id = gpu_op_details->_operation_id;
253-
// Store End event - to indicate for the pair EXIT Callback that
254-
// it need to append another Write Global Timestamp
255-
*instance_user_data = static_cast<void *>(&global_time_stamp_end_event);
256-
}
257-
}
114+
external_correlation_id.fetch_add(1);
115+
auto result = ptiViewPushExternalCorrelationId(
116+
pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_0, external_correlation_id);
117+
std::cout << "Pushing External Correlation Id: " << external_correlation_id
118+
<< ", Result: " << result << std::endl;
258119
} else if (gpu_op_data->_phase == PTI_CB_PHASE_API_EXIT) {
259-
if (is_op_kernel && (*instance_user_data != nullptr)) {
260-
bool write_appended = false;
261-
GPUKernelAppendOnExit(static_cast<ze_command_list_handle_t>(gpu_op_data->_cmd_list_handle),
262-
write_appended);
263-
}
120+
uint64_t local_external_correlation_id = 0ULL;
121+
auto result = ptiViewPopExternalCorrelationId(
122+
pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_0, &local_external_correlation_id);
123+
std::cout << "Popped External Correlation Id: " << local_external_correlation_id
124+
<< ", Result: " << result << std::endl;
264125
} else {
265126
std::cout << "Unexpected phase: " << gpu_op_data->_phase << std::endl;
266127
}
267128
}
268129

269-
uint64_t ReadTimestamp(void *buff) {
270-
// Copy buffer from device to host
271-
uint64_t timestamp = *static_cast<uint64_t *>(buff);
272-
return timestamp;
273-
}
274-
275130
void CallbackGPUOperationCompletion([[maybe_unused]] pti_callback_domain domain,
276131
pti_api_group_id driver_group_id, uint32_t driver_api_id,
277132
[[maybe_unused]] pti_backend_ctx_t backend_context,
@@ -283,45 +138,6 @@ void CallbackGPUOperationCompletion([[maybe_unused]] pti_callback_domain domain,
283138

284139
samples_utils::DumpCallbackData(domain, driver_group_id, driver_api_id, backend_context, cb_data,
285140
user_data, instance_user_data);
286-
287-
if (IsToolResourcesInitialized()) {
288-
// Checking the events status first - prior reading timestamps
289-
auto result1 = zeEventQueryStatus(global_time_stamp_start_event);
290-
auto result2 = zeEventQueryStatus(global_time_stamp_end_event);
291-
if (result1 == ZE_RESULT_SUCCESS && result2 == ZE_RESULT_SUCCESS) {
292-
std::cout << "Writes of Global Time Stamp signaled." << std::endl;
293-
// Reading timestamps
294-
auto start_time_stamp = ReadTimestamp(buff_start);
295-
auto end_time_stamp = ReadTimestamp(buff_end);
296-
std::cout << "Kernel with _kernel_id: " << g_profiled_kernel_id
297-
<< ", start TS: " << start_time_stamp << ", end TS: " << end_time_stamp
298-
<< ", duration: " << (end_time_stamp - start_time_stamp) << " cycles \n";
299-
300-
// Resetting the events and will not use them anymore
301-
result1 = zeEventHostReset(global_time_stamp_start_event);
302-
if (result1 != ZE_RESULT_SUCCESS) {
303-
std::cout << "zeEventHostReset for Start event failed with error code: " << result1
304-
<< std::endl;
305-
}
306-
result2 = zeEventHostReset(global_time_stamp_end_event);
307-
if (result2 != ZE_RESULT_SUCCESS) {
308-
std::cout << "zeEventHostReset for End event failed with error code: " << result2
309-
<< std::endl;
310-
}
311-
312-
} else if (result1 == ZE_RESULT_SUCCESS && result2 == ZE_RESULT_NOT_READY) {
313-
std::cout << "Global Timestamp End event is NOT READY.";
314-
} else if (result1 == ZE_RESULT_NOT_READY && result2 == ZE_RESULT_NOT_READY) {
315-
std::cout << "Global Timestamp Start and End event are NOT READY."
316-
"It could be that they already processed and reset. "
317-
<< std::endl;
318-
} else {
319-
std::cout << "zeEventQueryStatus for Start event failed with error code: " << result1
320-
<< std::endl;
321-
std::cout << "zeEventQueryStatus for End event failed with error code: " << result2
322-
<< std::endl;
323-
}
324-
}
325141
}
326142

327143
void CallbackCommon(pti_callback_domain domain, pti_api_group_id driver_group_id,
@@ -346,6 +162,9 @@ void CallbackCommon(pti_callback_domain domain, pti_api_group_id driver_group_id
346162
std::cout << std::endl;
347163
}
348164

165+
//
166+
// PTI Reports Buffer functions
167+
//
349168
void ProvideBuffer(unsigned char **buf, std::size_t *buf_size) {
350169
constexpr auto kRequestedRecordCount = 5'000'000ULL;
351170
constexpr auto kRequestedBufferSize = kRequestedRecordCount * sizeof(pti_view_record_kernel);
@@ -383,15 +202,33 @@ void ParseBuffer(unsigned char *buf, std::size_t buf_size, std::size_t valid_buf
383202
break;
384203
}
385204
case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
386-
pti_view_record_kernel *rec = reinterpret_cast<pti_view_record_kernel *>(ptr);
205+
auto *rec = reinterpret_cast<pti_view_record_kernel *>(ptr);
387206
std::cout << "---------------------------------------------------"
388-
"-----------------------------"
389-
<< '\n';
207+
"-----------------------------\n";
390208
std::cout << "Found Kernel Record" << '\n';
391209
samples_utils::DumpRecord(rec);
392210
std::cout << "---------------------------------------------------"
393-
"-----------------------------"
394-
<< '\n';
211+
"-----------------------------\n";
212+
break;
213+
}
214+
case pti_view_kind::PTI_VIEW_DRIVER_API: {
215+
std::cout << "---------------------------------------------------"
216+
"-----------------------------\n";
217+
std::cout << "Found Driver API Record" << '\n';
218+
auto *rec = reinterpret_cast<pti_view_record_api *>(ptr);
219+
samples_utils::DumpRecord(rec);
220+
std::cout << "---------------------------------------------------"
221+
"-----------------------------\n";
222+
break;
223+
}
224+
case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
225+
std::cout << "---------------------------------------------------"
226+
"-----------------------------\n";
227+
std::cout << "Found External Correlation Record" << '\n';
228+
auto *rec = reinterpret_cast<pti_view_record_external_correlation *>(ptr);
229+
samples_utils::DumpRecord(rec);
230+
std::cout << "---------------------------------------------------"
231+
"-----------------------------\n";
395232
break;
396233
}
397234
default: {

0 commit comments

Comments
 (0)