Skip to content

Commit b51157f

Browse files
authored
Support for BMG sampling and other fixes (#474)
* Support for BMG sampling and other fixes
* Fix an unnecessary first write if the first buffer size matches MAX_METRIC_BUFFER
* Address comments from PR 474
* Fix typos
* Turn errors into warnings: do not exit, but stop processing the data file
1 parent f718561 commit b51157f

File tree

1 file changed

+66
-31
lines changed

1 file changed

+66
-31
lines changed

tools/unitrace/src/levelzero/ze_metrics.h

Lines changed: 66 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -33,12 +33,10 @@
3333
#include "pti_assert.h"
3434
#include <inttypes.h>
3535

36-
3736
constexpr static uint64_t min_dummy_instance_id = 1024 * 1024; // min dummy instance id if idle sampling is enabled
38-
constexpr static uint32_t max_metric_size = 512;
39-
static uint32_t max_metric_samples = 32768;
37+
constexpr static uint32_t max_metric_samples = 32768;
4038

41-
#define MAX_METRIC_BUFFER (max_metric_samples * max_metric_size* 2)
39+
#define MAX_METRIC_BUFFER (8ULL * 1024ULL * 1024ULL)
4240

4341
inline void PrintDeviceList() {
4442
ze_result_t status = zeInit(ZE_INIT_FLAG_GPU_ONLY);
@@ -203,7 +201,6 @@ struct ZeDeviceDescriptor {
203201
std::atomic<ZeProfilerState> profiling_state_;
204202
std::string metric_file_name_;
205203
std::ofstream metric_file_stream_;
206-
std::vector<uint8_t> metric_data_;
207204
bool stall_sampling_;
208205
};
209206

@@ -526,7 +523,7 @@ class ZeMetricProfiler {
526523
}
527524

528525
void ComputeMetrics() {
529-
uint8_t *raw_metrics = (uint8_t *)malloc(sizeof(uint8_t) * (MAX_METRIC_BUFFER + 512));
526+
auto *raw_metrics = static_cast<uint8_t*>(malloc(sizeof(uint8_t)*MAX_METRIC_BUFFER));
530527
UniMemory::ExitIfOutOfMemory((void *)raw_metrics);
531528

532529
for (auto it = device_descriptors_.begin(); it != device_descriptors_.end(); ++it) {
@@ -611,8 +608,27 @@ class ZeMetricProfiler {
611608
}
612609

613610
while (!inf.eof()) {
614-
inf.read(reinterpret_cast<char *>(raw_metrics), MAX_METRIC_BUFFER + 512);
611+
// Read metric data in two stages, first actual size (in bytes), followed by actual metrics
612+
uint64_t data_size;
613+
inf.read(reinterpret_cast<char *>(&data_size), sizeof(data_size));
614+
if (inf.eof()) {
615+
// If we reached EOF, we can stop processing
616+
break;
617+
}
618+
if (inf.gcount() != sizeof(data_size)) {
619+
std::cerr << "[WARNING] Intermediate metrics file is invalid. Cannot find the size of the next data segment. Output likely to be incomplete." << std::endl;
620+
break;
621+
}
622+
if (data_size > MAX_METRIC_BUFFER) {
623+
std::cerr << "[WARNING] Intermediate metrics file is invalid. Next chunk cannot be larger than the allocated buffer. Output likely to be incomplete." << std::endl;
624+
break;
625+
}
626+
inf.read(reinterpret_cast<char *>(raw_metrics), data_size);
615627
int raw_size = inf.gcount();
628+
if (raw_size < data_size) {
629+
std::cerr << "[WARNING] Intermediate metrics file is incomplete. Expecting " << data_size << " bytes but only " << raw_size << " bytes were found. Output likely to be incomplete." << std::endl;
630+
break;
631+
}
616632
if (raw_size > 0) {
617633
uint32_t num_samples = 0;
618634
uint32_t num_metrics = 0;
@@ -838,8 +854,27 @@ class ZeMetricProfiler {
838854
uint64_t cur_sampling_ts = 0;
839855
auto kit = kinfo.begin();
840856
while (!inf.eof()) {
841-
inf.read(reinterpret_cast<char *>(raw_metrics), MAX_METRIC_BUFFER + 512);
857+
// Read metric data in two stages, first actual size (in bytes), followed by actual metrics
858+
uint64_t data_size;
859+
inf.read(reinterpret_cast<char *>(&data_size), sizeof(data_size));
860+
if (inf.eof()) {
861+
// If we reached EOF, we can stop processing
862+
break;
863+
}
864+
if (inf.gcount() != sizeof(data_size)) {
865+
std::cerr << "[WARNING] Intermediate metrics file is invalid. Cannot find the size of the next data segment. Output likely to be incomplete." << std::endl;
866+
break;
867+
}
868+
if (data_size > MAX_METRIC_BUFFER) {
869+
std::cerr << "[WARNING] Intermediate metrics file is invalid. Next chunk cannot be larger than the allocated buffer. Output likely to be incomplete." << std::endl;
870+
break;
871+
}
872+
inf.read(reinterpret_cast<char *>(raw_metrics), data_size);
842873
int raw_size = inf.gcount();
874+
if (raw_size < data_size) {
875+
std::cerr << "[WARNING] Intermediate metrics file is incomplete. Expecting " << data_size << " bytes but only " << raw_size << " bytes were found. Output likely to be incomplete." << std::endl;
876+
break;
877+
}
843878
if (raw_size > 0) {
844879
uint32_t num_samples = 0;
845880
uint32_t num_metrics = 0;
@@ -951,7 +986,7 @@ class ZeMetricProfiler {
951986
inf.close();
952987
}
953988
}
954-
free(raw_metrics);
989+
free (raw_metrics);
955990
}
956991

957992
private:
@@ -1054,6 +1089,7 @@ class ZeMetricProfiler {
10541089
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
10551090
}
10561091
else {
1092+
// if (status == ZE_RESULT_NOT_READY)
10571093
return 0;
10581094
}
10591095

@@ -1075,12 +1111,12 @@ class ZeMetricProfiler {
10751111
ze_event_pool_desc_t event_pool_desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, ZE_EVENT_POOL_FLAG_HOST_VISIBLE, 1};
10761112
status = zeEventPoolCreate(context, &event_pool_desc, 1, &device, &event_pool);
10771113
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
1078-
1114+
10791115
ze_event_handle_t event = nullptr;
10801116
ze_event_desc_t event_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, ZE_EVENT_SCOPE_FLAG_HOST, ZE_EVENT_SCOPE_FLAG_HOST};
10811117
status = zeEventCreate(event_pool, &event_desc, &event);
10821118
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
1083-
1119+
10841120
zet_metric_streamer_handle_t streamer = nullptr;
10851121
uint32_t interval = std::stoi(utils::GetEnv("UNITRACE_SamplingInterval")) * 1000; // convert us to ns
10861122

@@ -1102,37 +1138,36 @@ class ZeMetricProfiler {
11021138
desc->profiling_state_.store(PROFILER_ENABLED, std::memory_order_release);
11031139
return;
11041140
}
1105-
1106-
if (streamer_desc.notifyEveryNReports > max_metric_samples) {
1107-
max_metric_samples = streamer_desc.notifyEveryNReports;
1108-
}
1109-
1141+
11101142
std::vector<std::string> metrics_list;
11111143
metrics_list = GetMetricList(group);
11121144
PTI_ASSERT(!metrics_list.empty());
11131145

1114-
uint8_t *raw_metrics = (uint8_t *)malloc(sizeof(uint8_t) * (MAX_METRIC_BUFFER + 512));
1146+
auto *raw_metrics = static_cast<uint8_t*>(malloc(sizeof(uint8_t)*MAX_METRIC_BUFFER));
11151147
UniMemory::ExitIfOutOfMemory((void *)raw_metrics);
11161148

1149+
auto dump_metrics = [](uint8_t *buffer, uint64_t size, std::ofstream *f) {
1150+
// Write metric data in two stages, first actual size (in bytes), followed by actual metrics
1151+
f->write(reinterpret_cast<char*>(&size), sizeof(size));
1152+
f->write(reinterpret_cast<char*>(buffer), size);
1153+
};
1154+
11171155
desc->profiling_state_.store(PROFILER_ENABLED, std::memory_order_release);
11181156
while (desc->profiling_state_.load(std::memory_order_acquire) != PROFILER_DISABLED) {
1119-
uint64_t size = EventBasedReadMetrics(event, streamer, raw_metrics, (MAX_METRIC_BUFFER + 512));
1120-
if (size == 0) {
1121-
if (!desc->metric_data_.empty()) {
1122-
desc->metric_file_stream_.write(reinterpret_cast<char*>(desc->metric_data_.data()), desc->metric_data_.size());
1123-
desc->metric_data_.clear();
1124-
}
1125-
continue;
1157+
auto size = EventBasedReadMetrics(event, streamer, raw_metrics, MAX_METRIC_BUFFER);
1158+
if (size > 0) {
1159+
// If we have data, dump it to the intermediate file
1160+
dump_metrics (raw_metrics, size, &desc->metric_file_stream_);
11261161
}
1127-
desc->metric_data_.insert(desc->metric_data_.end(), raw_metrics, raw_metrics + size);
11281162
}
1129-
auto size = ReadMetrics(streamer, raw_metrics, (MAX_METRIC_BUFFER + 512));
1130-
desc->metric_data_.insert(desc->metric_data_.end(), raw_metrics, raw_metrics + size);
1131-
if (!desc->metric_data_.empty()) {
1132-
desc->metric_file_stream_.write(reinterpret_cast<char*>(desc->metric_data_.data()), desc->metric_data_.size());
1133-
desc->metric_data_.clear();
1163+
1164+
// Flush the remaining metrics after the profiler has stopped
1165+
auto size = ReadMetrics(streamer, raw_metrics, MAX_METRIC_BUFFER);
1166+
while (size > 0) {
1167+
dump_metrics (raw_metrics, size, &desc->metric_file_stream_);
1168+
size = ReadMetrics(streamer, raw_metrics, MAX_METRIC_BUFFER);
11341169
}
1135-
free(raw_metrics);
1170+
free (raw_metrics);
11361171

11371172
status = zetMetricStreamerClose(streamer);
11381173
PTI_ASSERT(status == ZE_RESULT_SUCCESS);

0 commit comments

Comments
 (0)