Skip to content

Commit 8b70eaa

Browse files
authored
Merge pull request #404 from opcm/push-2022-05-10
Push 2022 05 10
2 parents d1fc36d + 05614e7 commit 8b70eaa

File tree

3 files changed

+87
-8
lines changed

3 files changed

+87
-8
lines changed

doc/ENVVAR_README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@
77
`PCM_USE_RESCTRL=1` : use Linux resctrl driver for RDT metrics
88

99
`PCM_PRINT_TOPOLOGY=1` : print detailed CPU topology
10+
11+
`PCM_KEEP_NMI_WATCHDOG=1` : don't disable NMI watchdog (reducing the core metrics set)

src/cpucounters.cpp

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,11 @@ void pcm_cpuid(const unsigned leaf, const unsigned subleaf, PCM_CPUID_INFO & inf
352352
#endif
353353
}
354354

355+
#ifdef __linux__
356+
bool isNMIWatchdogEnabled(const bool silent);
357+
bool keepNMIWatchdogEnabled();
358+
#endif
359+
355360
void PCM::readCoreCounterConfig(const bool complainAboutMSR)
356361
{
357362
if (max_cpuid >= 0xa)
@@ -412,6 +417,11 @@ void PCM::readCoreCounterConfig(const bool complainAboutMSR)
412417
std::cerr << "INFO: Reducing the number of programmable counters to 3 to workaround the fixed cycle counter virtualization issue on AWS.\n";
413418
std::cerr << " You can disable the workaround by setting PCM_NO_AWS_WORKAROUND=1 environment variable\n";
414419
}
420+
if (isNMIWatchdogEnabled(true) && keepNMIWatchdogEnabled())
421+
{
422+
--core_gen_counter_num_max;
423+
std::cerr << "INFO: Reducing the number of programmable counters to " << core_gen_counter_num_max << " because NMI watchdog is enabled.\n";
424+
}
415425
#endif
416426
}
417427
}
@@ -1980,6 +1990,18 @@ void PCM::initUncorePMUsPerf()
19801990

19811991
#ifdef __linux__
19821992

1993+
const char * keepNMIWatchdogEnabledEnvStr = "PCM_KEEP_NMI_WATCHDOG";
1994+
1995+
bool keepNMIWatchdogEnabled()
1996+
{
1997+
static int keep = -1;
1998+
if (keep < 0)
1999+
{
2000+
keep = (safe_getenv(keepNMIWatchdogEnabledEnvStr) == std::string("1")) ? 1 : 0;
2001+
}
2002+
return keep == 1;
2003+
}
2004+
19832005
#define PCM_NMI_WATCHDOG_PATH "/proc/sys/kernel/nmi_watchdog"
19842006

19852007
bool isNMIWatchdogEnabled(const bool silent)
@@ -1995,7 +2017,11 @@ bool isNMIWatchdogEnabled(const bool silent)
19952017

19962018
void disableNMIWatchdog(const bool silent)
19972019
{
1998-
if (!silent) std::cerr << "Disabling NMI watchdog since it consumes one hw-PMU counter.\n";
2020+
if (!silent)
2021+
{
2022+
std::cerr << " Disabling NMI watchdog since it consumes one hw-PMU counter. To keep NMU watchdog set environment variable "
2023+
<< keepNMIWatchdogEnabledEnvStr << "=1 (this reduces the core metrics set)\n";
2024+
}
19992025
writeSysFS(PCM_NMI_WATCHDOG_PATH, "0");
20002026
}
20012027

@@ -2423,7 +2449,7 @@ perf_event_attr PCM_init_perf_event_attr(bool group = true)
24232449
PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter_, const bool silent, const int pid)
24242450
{
24252451
#ifdef __linux__
2426-
if (isNMIWatchdogEnabled(silent))
2452+
if (isNMIWatchdogEnabled(silent) && (keepNMIWatchdogEnabled() == false))
24272453
{
24282454
disableNMIWatchdog(silent);
24292455
needToRestoreNMIWatchdog = true;
@@ -2590,7 +2616,16 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter
25902616
coreEventDesc[2].umask_value = SKL_MEM_LOAD_RETIRED_L2_MISS_UMASK;
25912617
coreEventDesc[3].event_number = SKL_MEM_LOAD_RETIRED_L2_HIT_EVTNR;
25922618
coreEventDesc[3].umask_value = SKL_MEM_LOAD_RETIRED_L2_HIT_UMASK;
2593-
if (core_gen_counter_num_max == 3)
2619+
if (core_gen_counter_num_max == 2)
2620+
{
2621+
L3CacheHitRatioAvailable = true;
2622+
L3CacheMissesAvailable = true;
2623+
L3CacheHitsSnoopAvailable = true;
2624+
L3CacheHitsAvailable = true;
2625+
core_gen_counter_num_used = 2;
2626+
break;
2627+
}
2628+
else if (core_gen_counter_num_max == 3)
25942629
{
25952630
L3CacheHitRatioAvailable = true;
25962631
L3CacheMissesAvailable = true;
@@ -2725,6 +2760,13 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter
27252760
std::cerr << "PCM ERROR: pid monitoring is only supported with Linux perf_event driver\n";
27262761
return PCM::UnknownError;
27272762
}
2763+
#ifdef __linux__
2764+
if (isNMIWatchdogEnabled(silent) && (canUsePerf == false))
2765+
{
2766+
std::cerr << "PCM ERROR: Unsupported mode. NMI watchdog is enabled and Linux perf_event driver is not used\n";
2767+
return PCM::UnknownError;
2768+
}
2769+
#endif
27282770

27292771
std::vector<int> tids{};
27302772
#ifdef PCM_USE_PERF

tests/test.sh

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,31 @@ export BIN_DIR="build/bin"
44

55
pushd $BIN_DIR
66

7+
echo Enable NMI watchdog
8+
echo 1 > /proc/sys/kernel/nmi_watchdog
9+
10+
echo Testing pcm with PCM_NO_PERF=1
711
PCM_NO_PERF=1 ./pcm -r -- sleep 1
812
if [ "$?" -ne "0" ]; then
913
echo "Error in pcm"
1014
exit 1
1115
fi
1216

17+
echo Testing pcm with PCM_USE_UNCORE_PERF=1
1318
PCM_USE_UNCORE_PERF=1 ./pcm -r -- sleep 1
1419
if [ "$?" -ne "0" ]; then
1520
echo "Error in pcm"
1621
exit 1
1722
fi
1823

24+
echo Testing pcm w/o env vars
1925
./pcm -r -- sleep 1
2026
if [ "$?" -ne "0" ]; then
2127
echo "Error in pcm"
2228
exit 1
2329
fi
2430

31+
echo Testing pcm with -pid
2532
perl -e ' do {} until (0)' &
2633
test_pid="$!"
2734
./pcm -pid $test_pid -- sleep 1
@@ -32,102 +39,126 @@ if [ "$?" -ne "0" ]; then
3239
fi
3340
kill $test_pid
3441

42+
echo Testing pcm with PCM_KEEP_NMI_WATCHDOG=1
43+
PCM_KEEP_NMI_WATCHDOG=1 ./pcm -r -- sleep 1
44+
if [ "$?" -ne "0" ]; then
45+
echo "Error in pcm"
46+
exit 1
47+
fi
48+
49+
echo Testing pcm with -csv
3550
./pcm -r 0.1 -csv=pcm.csv -- sleep 5
3651
if [ "$?" -ne "0" ]; then
3752
echo "Error in pcm"
3853
exit 1
3954
fi
4055

56+
echo Testing pcm-memory
4157
./pcm-memory -- sleep 1
4258
if [ "$?" -ne "0" ]; then
4359
echo "Error in pcm-memory"
4460
exit 1
4561
fi
4662

63+
echo Testing pcm-memory with -rank
4764
./pcm-memory -rank=1 -- sleep 1
4865
if [ "$?" -ne "0" ]; then
4966
echo "Error in pcm-memory"
5067
exit 1
5168
fi
5269

70+
echo Testing pcm-memory with -rank and -csv
5371
./pcm-memory -rank=1 -csv -- sleep 1
5472
if [ "$?" -ne "0" ]; then
5573
echo "Error in pcm-memory"
5674
exit 1
5775
fi
5876

77+
echo Testing pcm-iio
5978
./pcm-iio -i=1
6079
if [ "$?" -ne "0" ]; then
6180
echo "Error in pcm-iio"
6281
exit 1
6382
fi
6483

84+
echo Testing pcm-raw
6585
./pcm-raw -e core/config=0x30203,name=LD_BLOCKS.STORE_FORWARD/ -e cha/config=0,name=UNC_CHA_CLOCKTICKS/ -e imc/fixed,name=DRAM_CLOCKS -e thread_msr/config=0x10,config1=1 -e thread_msr/config=0x19c,config1=0 -- sleep 1
6686
if [ "$?" -ne "0" ]; then
6787
echo "Error in pcm-raw"
6888
exit 1
6989
fi
7090

91+
echo Testing pcm-mmio
7192
./pcm-mmio 0x0
7293
if [ "$?" -ne "0" ]; then
7394
echo "Error in pcm-mmio"
7495
exit 1
7596
fi
7697

98+
echo Testing pcm-pcicfg
7799
./pcm-pcicfg 0 0 0 0 0
78100
if [ "$?" -ne "0" ]; then
79101
echo "Error in pcm-pcicfg"
80102
exit 1
81103
fi
82104

105+
echo Testing pcm-numa
83106
./pcm-numa -- sleep 1
84107
if [ "$?" -ne "0" ]; then
85108
echo "Error in pcm-numa"
86109
exit 1
87110
fi
88111

112+
echo Testing pcm-core
89113
./pcm-core -e cpu/umask=0x01,event=0x0e,name=UOPS_ISSUED.STALL_CYCLES/ -- sleep 1
90114
if [ "$?" -ne "0" ]; then
91115
echo "Error in pcm-core"
92116
exit 1
93117
fi
94118

119+
echo Testing c_example
95120
./examples/c_example
96121
if [ "$?" -ne "0" ]; then
97122
echo "Error in c_example"
98123
exit 1
99124
fi
100125

126+
echo Testing c_example_shlib
101127
./examples/c_example_shlib
102128
if [ "$?" -ne "0" ]; then
103129
echo "Error in c_example_shlib"
104130
exit 1
105131
fi
106132

133+
echo Testing pcm-msr \(read only\)
107134
./pcm-msr -a 0x30A
108135
if [ "$?" -ne "0" ]; then
109136
echo "Error in pcm-msr"
110137
exit 1
111138
fi
112139

140+
echo Testing pcm-power
113141
./pcm-power -- sleep 1
114142
if [ "$?" -ne "0" ]; then
115143
echo "Error in pcm-power"
116144
exit 1
117145
fi
118146

147+
echo Testing pcm-pcie
119148
./pcm-pcie -- sleep 1
120149
if [ "$?" -ne "0" ]; then
121150
echo "Error in pcm-pcie"
122151
exit 1
123152
fi
124153

154+
echo Testing pcm-latency
125155
./pcm-latency -i=1
126156
if [ "$?" -ne "0" ]; then
127157
echo "Error in pcm-latency"
128158
exit 1
129159
fi
130160

161+
echo Testing pcm-tsx
131162
./pcm-tsx -- sleep 1
132163
if [ "$?" -ne "0" ]; then
133164
echo "Error in pcm-tsx"
@@ -137,15 +168,16 @@ fi
137168
# TODO add more tests
138169
# e.g for ./pcm-sensor-server, ./pcm-sensor, ...
139170

171+
echo Testing urltest
./tests/urltest
# Save the exit status immediately: every later command, including the
# `[` test below, overwrites $? and the error message would otherwise
# report the status of the test instead of the status of urltest.
urltest_rc="$?"
# We have 2 expected errors, anything else is a bug
if [ "$urltest_rc" != 2 ]; then
    echo "Error in urltest, 2 expected errors but found $urltest_rc!"
    exit 1
fi
146178

147-
### Check pcm-raw with event files
148-
# Download necessary files
179+
echo Testing pcm-raw with event files
180+
echo Download necessary files
149181
if [ ! -f "mapfile.csv" ]; then
150182
echo "Downloading https://download.01.org/perfmon/mapfile.csv"
151183
wget -q --timeout=10 https://download.01.org/perfmon/mapfile.csv
@@ -193,15 +225,15 @@ do
193225
fi
194226
done
195227

196-
# Now check pcm-raw with JSON files from mapFile.csv
228+
echo Now check pcm-raw with JSON files from mapFile.csv
197229
./pcm-raw -r -e LD_BLOCKS.STORE_FORWARD -e CPU_CLK_UNHALTED.THREAD_ANY -e INST_RETIRED.ANY -e UNC_CHA_CLOCKTICKS -- sleep 1
198230

199231
if [ "$?" -ne "0" ]; then
200232
echo "Error in pcm-raw"
201233
exit 1
202234
fi
203235

204-
# Now get corresponding TSV files and replace JSON files in mapFile.csv with them
236+
echo Now get corresponding TSV files and replace JSON files in mapFile.csv with them
205237
cp "mapfile.csv" "mapfile.csv_orig"
206238
for FILE in $FILES
207239
do
@@ -220,7 +252,7 @@ do
220252
done
221253

222254

223-
# Check pcm-raw with TSV files
255+
# echo Test pcm-raw with TSV files
224256
#./pcm-raw -r -e LD_BLOCKS.STORE_FORWARD -e CPU_CLK_UNHALTED.THREAD_ANY -e INST_RETIRED.ANY -e UNC_CHA_CLOCKTICKS -- sleep 1
225257

226258
#if [ "$?" -ne "0" ]; then
@@ -274,18 +306,21 @@ EOF
274306

275307
fi
276308

309+
echo Testing pcm-raw with -el event_file_test.txt -tr -csv
277310
./pcm-raw -el event_file_test.txt -tr -csv=raw_tr_wo_ext.csv -i=4 0.25
278311
if [ "$?" -ne "0" ]; then
279312
echo "Error in pcm-raw"
280313
exit 1
281314
fi
282315

316+
echo Testing pcm-raw with -el event_file_test.txt -tr -ext -csv
283317
./pcm-raw -el event_file_test.txt -tr -ext -csv=raw_tr_wi_ext.csv -i=4 0.25
284318
if [ "$?" -ne "0" ]; then
285319
echo "Error in pcm-raw"
286320
exit 1
287321
fi
288322

323+
echo Testing pcm-raw with -el event_file_test.txt -tr -ext -single-header -csv
289324
./pcm-raw -el event_file_test.txt -tr -ext -single-header -csv=raw_tr_wi_ext_single_header.csv -i=4 0.25
290325
if [ "$?" -ne "0" ]; then
291326
echo "Error in pcm-raw"

0 commit comments

Comments
 (0)