Skip to content

Commit ba16d76

Browse files
committed
monitor all threads of the process
1 parent f698fc8 commit ba16d76

File tree

5 files changed

+143
-35
lines changed

5 files changed

+143
-35
lines changed

src/cpucounters.cpp

Lines changed: 136 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
6666
#include <sys/time.h>
6767
#ifdef __linux__
6868
#include <sys/mman.h>
69+
#include <dirent.h>
6970
#endif
7071
#endif
7172

@@ -426,6 +427,12 @@ bool PCM::isFixedCounterSupported(unsigned c)
426427

427428
bool PCM::isHWTMAL1Supported() const
428429
{
430+
#ifdef PCM_USE_PERF
431+
if (perfEventTaskHandle.empty() == false)
432+
{
433+
return false; // per PID/task perf collection does not support HW TMA L1
434+
}
435+
#endif
429436
static int supported = -1;
430437
if (supported < 0)
431438
{
@@ -2144,8 +2151,7 @@ PCM::PCM() :
21442151

21452152
#ifdef PCM_USE_PERF
21462153
canUsePerf = true;
2147-
std::vector<int> dummy(PERF_MAX_COUNTERS, -1);
2148-
perfEventHandle.resize(num_cores, dummy);
2154+
perfEventHandle.resize(num_cores, std::vector<int>(PERF_MAX_COUNTERS, -1));
21492155
#endif
21502156

21512157
for (int32 i = 0; i < num_cores; ++i)
@@ -2442,7 +2448,7 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter
24422448
if (!silent) std::cerr << "Can not use Linux perf because OffcoreResponse usage is not supported. Falling-back to direct PMU programming.\n";
24432449
}
24442450
}
2445-
if (isHWTMAL1Supported() == true && perfSupportsTopDown() == false)
2451+
if (isHWTMAL1Supported() == true && perfSupportsTopDown() == false && pid == -1)
24462452
{
24472453
canUsePerf = false;
24482454
if (!silent) std::cerr << "Installed Linux kernel perf does not support hardware top-down level-1 counters. Using direct PMU programming instead.\n";
@@ -2695,6 +2701,45 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter
26952701
return PCM::UnknownError;
26962702
}
26972703

2704+
std::vector<int> tids{};
2705+
#ifdef PCM_USE_PERF
2706+
if (pid != -1)
2707+
{
2708+
const auto strDir = std::string("/proc/") + std::to_string(pid) + "/task/";
2709+
DIR * tidDir = opendir(strDir.c_str());
2710+
if (tidDir)
2711+
{
2712+
struct dirent * entry{nullptr};
2713+
while ((entry = readdir(tidDir)) != nullptr)
2714+
{
2715+
assert(entry->d_name);
2716+
const auto tid = atoi(entry->d_name);
2717+
if (tid)
2718+
{
2719+
tids.push_back(tid);
2720+
// std::cerr << "Detected task " << tids.back() << "\n";
2721+
}
2722+
}
2723+
closedir(tidDir);
2724+
}
2725+
else
2726+
{
2727+
std::cerr << "ERROR: Can't open " << strDir << "\n";
2728+
return PCM::UnknownError;
2729+
}
2730+
}
2731+
if (tids.empty() == false)
2732+
{
2733+
if (isHWTMAL1Supported())
2734+
{
2735+
if (!silent) std::cerr << "INFO: TMA L1 metrics are not supported in PID collection mode\n";
2736+
}
2737+
if (!silent) std::cerr << "INFO: collecting core metrics for " << tids.size() << " threads in process " << pid << "\n";
2738+
PerfEventHandleContainer _1(num_cores, std::vector<int>(PERF_MAX_COUNTERS, -1));
2739+
perfEventTaskHandle.resize(tids.size(), _1);
2740+
}
2741+
#endif
2742+
26982743
programmed_pmu = true;
26992744

27002745
lastProgrammedCustomCounters.clear();
@@ -2708,11 +2753,11 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter
27082753
{
27092754
if (isCoreOnline(i) == false) continue;
27102755

2711-
std::packaged_task<void()> task([this, i, mode_, pExtDesc, &programmingStatuses, &pid]() -> void
2756+
std::packaged_task<void()> task([this, i, mode_, pExtDesc, &programmingStatuses, &tids]() -> void
27122757
{
27132758
TemporalThreadAffinity tempThreadAffinity(i, false); // speedup trick for Linux
27142759

2715-
programmingStatuses[i] = programCoreCounters(i, mode_, pExtDesc, lastProgrammedCustomCounters[i], pid);
2760+
programmingStatuses[i] = programCoreCounters(i, mode_, pExtDesc, lastProgrammedCustomCounters[i], tids);
27162761
});
27172762
asyncCoreResults.push_back(task.get_future());
27182763
coreTaskQueues[i]->push(task);
@@ -2813,9 +2858,9 @@ PCM::ErrorCode PCM::programCoreCounters(const int i /* core */,
28132858
const PCM::ProgramMode mode_,
28142859
const ExtendedCustomCoreEventDescription * pExtDesc,
28152860
std::vector<EventSelectRegister> & result,
2816-
const int pid)
2861+
const std::vector<int> & tids)
28172862
{
2818-
(void) pid; // to silence uused param warning on non Linux OS
2863+
(void) tids; // to silence unused param warning on non-Linux OS
28192864
// program core counters
28202865

28212866
result.clear();
@@ -2847,27 +2892,51 @@ PCM::ErrorCode PCM::programCoreCounters(const int i /* core */,
28472892
};
28482893
#ifdef PCM_USE_PERF
28492894
int leader_counter = -1;
2850-
auto programPerfEvent = [this, &leader_counter, &i, &pid](perf_event_attr & e, const int eventPos, const std::string & eventName) -> bool
2851-
{
2852-
// if (i == 0) std::cerr << "DEBUG: programming event "<< std::hex << e.config << std::dec << "\n";
2853-
if ((perfEventHandle[i][eventPos] = syscall(SYS_perf_event_open, &e, pid,
2854-
i /* core id */, leader_counter /* group leader */, 0)) <= 0)
2855-
{
2856-
std::lock_guard<std::mutex> _(printErrorMutex);
2857-
std::cerr << "Linux Perf: Error when programming " << eventName << ", error: " << strerror(errno) <<
2858-
" with config 0x" << std::hex << e.config <<
2859-
" config1 0x" << e.config1 << std::dec << "\n";
2860-
if (24 == errno)
2895+
auto programPerfEvent = [this, &leader_counter, &i, &tids](perf_event_attr e, const int eventPos, const std::string & eventName) -> bool
2896+
{
2897+
auto programPerfEventHelper = [&i]( PerfEventHandleContainer & perfEventHandle,
2898+
perf_event_attr & e,
2899+
const int eventPos,
2900+
const std::string & eventName,
2901+
const int leader_counter,
2902+
const int tid) -> bool
2903+
{
2904+
// if (i == 0) std::cerr << "DEBUG: programming event "<< std::hex << e.config << std::dec << "\n";
2905+
if ((perfEventHandle[i][eventPos] = syscall(SYS_perf_event_open, &e, tid,
2906+
i /* core id */, leader_counter /* group leader */, 0)) <= 0)
28612907
{
2862-
std::cerr << "try executing 'ulimit -n 20000' to increase the limit on the number of open files.\n";
2908+
std::lock_guard<std::mutex> _(printErrorMutex);
2909+
std::cerr << "Linux Perf: Error when programming " << eventName << ", error: " << strerror(errno) <<
2910+
" with config 0x" << std::hex << e.config <<
2911+
" config1 0x" << e.config1 << std::dec << " for tid " << tid << " leader " << leader_counter << "\n";
2912+
if (24 == errno)
2913+
{
2914+
std::cerr << PCM_ULIMIT_RECOMMENDATION;
2915+
}
2916+
else
2917+
{
2918+
std::cerr << "try running with environment variable PCM_NO_PERF=1\n";
2919+
}
2920+
return false;
28632921
}
2864-
else
2922+
return true;
2923+
};
2924+
if (tids.empty() == false)
2925+
{
2926+
e.inherit = 1;
2927+
e.read_format = 0; // 'inherit' does not work for combinations of read format (e.g. PERF_FORMAT_GROUP)
2928+
auto handleIt = perfEventTaskHandle.begin();
2929+
for (const auto & tid: tids)
28652930
{
2866-
std::cerr << "try running with environment variable PCM_NO_PERF=1\n";
2931+
if (programPerfEventHelper(*handleIt, e, eventPos, eventName, -1, tid) == false)
2932+
{
2933+
return false;
2934+
}
2935+
++handleIt;
28672936
}
2868-
return false;
2937+
return true;
28692938
}
2870-
return true;
2939+
return programPerfEventHelper(perfEventHandle, e, eventPos, eventName, leader_counter, -1);
28712940
};
28722941
if (canUsePerf)
28732942
{
@@ -3736,16 +3805,27 @@ void PCM::cleanupPMU(const bool silent)
37363805
#ifdef PCM_USE_PERF
37373806
if (canUsePerf)
37383807
{
3739-
for (int i = 0; i < num_cores; ++i)
3740-
for(int c = 0; c < PERF_MAX_COUNTERS; ++c)
3808+
auto cleanOne = [this](PerfEventHandleContainer & cont)
3809+
{
3810+
for (int i = 0; i < num_cores; ++i)
3811+
{
3812+
for(int c = 0; c < PERF_MAX_COUNTERS; ++c)
3813+
{
3814+
auto & h = cont[i][c];
3815+
if (h != -1) ::close(h);
3816+
h = -1;
3817+
}
3818+
}
3819+
};
3820+
cleanOne(perfEventHandle);
3821+
for (auto & cont : perfEventTaskHandle)
37413822
{
3742-
auto & h = perfEventHandle[i][c];
3743-
if (h != -1) ::close(h);
3744-
h = -1;
3823+
cleanOne(cont);
37453824
}
3825+
perfEventTaskHandle.clear();
37463826

3747-
if (!silent) std::cerr << " Closed perf event handles\n";
3748-
return;
3827+
if (!silent) std::cerr << " Closed perf event handles\n";
3828+
return;
37493829
}
37503830
#endif
37513831

@@ -4055,6 +4135,31 @@ CoreCounterState getCoreCounterState(uint32 core)
40554135
#ifdef PCM_USE_PERF
40564136
void PCM::readPerfData(uint32 core, std::vector<uint64> & outData)
40574137
{
4138+
if (perfEventTaskHandle.empty() == false)
4139+
{
4140+
std::fill(outData.begin(), outData.end(), 0);
4141+
for (const auto & handleArray : perfEventTaskHandle)
4142+
{
4143+
for (size_t ctr = 0; ctr < PERF_MAX_COUNTERS; ++ctr)
4144+
{
4145+
const int fd = handleArray[core][ctr];
4146+
if (fd != -1)
4147+
{
4148+
uint64 result{0ULL};
4149+
const int status = ::read(fd, &result, sizeof(result));
4150+
if (status != sizeof(result))
4151+
{
4152+
std::cerr << "PCM Error: failed to read from Linux perf handle " << fd << "\n";
4153+
}
4154+
else
4155+
{
4156+
outData[ctr] += result;
4157+
}
4158+
}
4159+
}
4160+
}
4161+
return;
4162+
}
40584163
auto readPerfDataHelper = [this](const uint32 core, std::vector<uint64>& outData, const uint32 leader, const uint32 num_counters)
40594164
{
40604165
if (perfEventHandle[core][leader] < 0)
@@ -6330,7 +6435,7 @@ class PerfVirtualControlRegister : public HWRegister
63306435
{
63316436
std::cerr << "Linux Perf: Error on programming PMU " << pmuID << ": " << strerror(errno) << "\n";
63326437
std::cerr << "config: 0x" << std::hex << event.config << " config1: 0x" << event.config1 << " config2: 0x" << event.config2 << std::dec << "\n";
6333-
if (errno == 24) std::cerr << "try executing 'ulimit -n 20000' to increase the limit on the number of open files.\n";
6438+
if (errno == 24) std::cerr << PCM_ULIMIT_RECOMMENDATION;
63346439
return;
63356440
}
63366441
}

src/cpucounters.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,9 @@ class PCM_API PCM
859859

860860
bool canUsePerf;
861861
#ifdef PCM_USE_PERF
862-
std::vector<std::vector<int> > perfEventHandle;
862+
// Table of Linux perf event file descriptors, indexed as [core][counter].
// Slots are initialised to -1, which marks "no handle open" for that counter.
using PerfEventHandleContainer = std::vector<std::vector<int>>;
863+
PerfEventHandleContainer perfEventHandle;
864+
std::vector<PerfEventHandleContainer> perfEventTaskHandle;
863865
void readPerfData(uint32 core, std::vector<uint64> & data);
864866

865867
enum {
@@ -894,7 +896,7 @@ class PCM_API PCM
894896
std::vector<std::vector<EventSelectRegister> > lastProgrammedCustomCounters;
895897
uint32 checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr);
896898
ErrorCode programCoreCounters(int core, const PCM::ProgramMode mode, const ExtendedCustomCoreEventDescription * pExtDesc,
897-
std::vector<EventSelectRegister> & programmedCustomCounters, const int pid);
899+
std::vector<EventSelectRegister> & programmedCustomCounters, const std::vector<int> & tids);
898900

899901
bool PMUinUse();
900902
void cleanupPMU(const bool silent = false);

src/pci.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,7 @@ int openHandle(uint32 groupnr_, uint32 bus, uint32 device, uint32 function)
384384
int handle = ::open(path.str().c_str(), O_RDWR);
385385
if (handle < 0)
386386
{
387-
if (errno == 24) std::cerr << "ERROR: try executing 'ulimit -n 20000' to increase the limit on the number of open files.\n";
387+
if (errno == 24) std::cerr << "ERROR: " << PCM_ULIMIT_RECOMMENDATION;
388388
handle = ::open((std::string("/pcm") + path.str()).c_str(), O_RDWR);
389389
}
390390
return handle;

src/resctrl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ namespace pcm
123123
std::cerr << "Error reading " << f << ". Error: " << strerror(errno) << "\n";
124124
if (errno == 24)
125125
{
126-
std::cerr << "try executing 'ulimit -n 20000' to increase the limit on the number of open files.\n";
126+
std::cerr << PCM_ULIMIT_RECOMMENDATION;
127127
}
128128
}
129129
}

src/types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ typedef signed long long int64;
4040
typedef unsigned int uint32;
4141
typedef signed int int32;
4242

43+
// Shared hint text streamed to std::cerr by the error paths that check for errno 24
// after a failed open/perf_event_open, so the suggested 'ulimit -n' value is defined once.
// The expansion is parenthesised, so it must be inserted with operator<< and cannot be
// joined to a neighbouring string literal by adjacent-literal concatenation.
#define PCM_ULIMIT_RECOMMENDATION ("try executing 'ulimit -n 1000000' to increase the limit on the number of open files.\n")
4344

4445
/*
4546
MSR addresses from

0 commit comments

Comments
 (0)