Skip to content

Commit 53ed2e8

Browse files
committed
implement PCM_KEEP_NMI_WATCHDOG mode
Change-Id: Icb88d7ab2a62619913ae49d25a27a4e1ffbcbad2
1 parent 20f1915 commit 53ed2e8

File tree

3 files changed

+46
-2
lines changed

3 files changed

+46
-2
lines changed

doc/ENVVAR_README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@
77
`PCM_USE_RESCTRL=1` : use Linux resctrl driver for RDT metrics
88

99
`PCM_PRINT_TOPOLOGY=1` : print detailed CPU topology
10+
11+
`PCM_KEEP_NMI_WATCHDOG=1` : keep the NMI watchdog enabled instead of disabling it (the watchdog consumes one hardware PMU counter, so the set of core metrics is reduced)

src/cpucounters.cpp

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,11 @@ void pcm_cpuid(const unsigned leaf, const unsigned subleaf, PCM_CPUID_INFO & inf
352352
#endif
353353
}
354354

355+
#ifdef __linux__
356+
bool isNMIWatchdogEnabled(const bool silent);
357+
bool keepNMIWatchdogEnabled();
358+
#endif
359+
355360
void PCM::readCoreCounterConfig(const bool complainAboutMSR)
356361
{
357362
if (max_cpuid >= 0xa)
@@ -412,6 +417,11 @@ void PCM::readCoreCounterConfig(const bool complainAboutMSR)
412417
std::cerr << "INFO: Reducing the number of programmable counters to 3 to workaround the fixed cycle counter virtualization issue on AWS.\n";
413418
std::cerr << " You can disable the workaround by setting PCM_NO_AWS_WORKAROUND=1 environment variable\n";
414419
}
420+
if (isNMIWatchdogEnabled(true) && keepNMIWatchdogEnabled())
421+
{
422+
--core_gen_counter_num_max;
423+
std::cerr << "INFO: Reducing the number of programmable counters to " << core_gen_counter_num_max << " because NMI watchdog is enabled.\n";
424+
}
415425
#endif
416426
}
417427
}
@@ -1980,6 +1990,18 @@ void PCM::initUncorePMUsPerf()
19801990

19811991
#ifdef __linux__
19821992

1993+
const char * keepNMIWatchdogEnabledEnvStr = "PCM_KEEP_NMI_WATCHDOG";
1994+
1995+
bool keepNMIWatchdogEnabled()
1996+
{
1997+
static int keep = -1;
1998+
if (keep < 0)
1999+
{
2000+
keep = (safe_getenv(keepNMIWatchdogEnabledEnvStr) == std::string("1")) ? 1 : 0;
2001+
}
2002+
return keep == 1;
2003+
}
2004+
19832005
#define PCM_NMI_WATCHDOG_PATH "/proc/sys/kernel/nmi_watchdog"
19842006

19852007
bool isNMIWatchdogEnabled(const bool silent)
@@ -1995,7 +2017,11 @@ bool isNMIWatchdogEnabled(const bool silent)
19952017

19962018
void disableNMIWatchdog(const bool silent)
19972019
{
1998-
if (!silent) std::cerr << "Disabling NMI watchdog since it consumes one hw-PMU counter.\n";
2020+
if (!silent)
2021+
{
2022+
std::cerr << "Disabling NMI watchdog since it consumes one hw-PMU counter. To keep NMU watchdog set environment variable "
2023+
<< keepNMIWatchdogEnabledEnvStr << "=1 (this reduces the core metrics set)\n";
2024+
}
19992025
writeSysFS(PCM_NMI_WATCHDOG_PATH, "0");
20002026
}
20012027

@@ -2423,7 +2449,7 @@ perf_event_attr PCM_init_perf_event_attr(bool group = true)
24232449
PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter_, const bool silent, const int pid)
24242450
{
24252451
#ifdef __linux__
2426-
if (isNMIWatchdogEnabled(silent))
2452+
if (isNMIWatchdogEnabled(silent) && (keepNMIWatchdogEnabled() == false))
24272453
{
24282454
disableNMIWatchdog(silent);
24292455
needToRestoreNMIWatchdog = true;
@@ -2725,6 +2751,13 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter
27252751
std::cerr << "PCM ERROR: pid monitoring is only supported with Linux perf_event driver\n";
27262752
return PCM::UnknownError;
27272753
}
2754+
#ifdef __linux__
2755+
if (isNMIWatchdogEnabled(silent) && (canUsePerf == false))
2756+
{
2757+
std::cerr << "PCM ERROR: Unsupported mode. NMI watchdog is enabled and Linux perf_event driver is not used\n";
2758+
return PCM::UnknownError;
2759+
}
2760+
#endif
27282761

27292762
std::vector<int> tids{};
27302763
#ifdef PCM_USE_PERF

tests/test.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ export BIN_DIR="build/bin"
44

55
pushd $BIN_DIR
66

7+
# enable NMI watchdog
8+
echo 1 > /proc/sys/kernel/nmi_watchdog
9+
710
PCM_NO_PERF=1 ./pcm -r -- sleep 1
811
if [ "$?" -ne "0" ]; then
912
echo "Error in pcm"
@@ -32,6 +35,12 @@ if [ "$?" -ne "0" ]; then
3235
fi
3336
kill $test_pid
3437

38+
PCM_KEEP_NMI_WATCHDOG=1 ./pcm -r -- sleep 1
39+
if [ "$?" -ne "0" ]; then
40+
echo "Error in pcm"
41+
exit 1
42+
fi
43+
3544
./pcm -r 0.1 -csv=pcm.csv -- sleep 5
3645
if [ "$?" -ne "0" ]; then
3746
echo "Error in pcm"

0 commit comments

Comments
 (0)