Skip to content

Commit 9ab13e2

Browse files
committed
Merged PR 7872479: Fix DSA selftest perf. Add perf measurements to selftest execution in unit tests.
`SymCryptDsaSelftest` was passing the `SYMCRYPT_DLGROUP_FIPS_LATEST` flag when calling `SymCryptDlgroupSetValue`, which causes that function to regenerate the primes P and Q and perform Rabin-Miller primality tests on them. This is very computationally expensive, to the point that running the DSA selftest caused significant performance problems in some scenarios. The fix is to instead use `SYMCRYPT_DLGROUP_FIPS_NONE`; since we're using a hardcoded, known-good key, we do not need to perform the additional validation on it. Also added error injection to selftests that were missing it, and added basic performance measurement to the selftests when they are run as part of the unit test, so that we can catch selftest performance issues more easily.
1 parent bc66c79 commit 9ab13e2

File tree

6 files changed

+147
-138
lines changed

6 files changed

+147
-138
lines changed

lib/fips_selftest.c

Lines changed: 25 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,6 @@ typedef struct _SYMCRYPT_SELFTEST_RSAKEY_2048
5454
// DL groups and keys for DH secret agreement selftest
5555
//
5656

57-
// Generator for the DL group used with the two following DH keys
58-
const BYTE dhGenerator = 2;
59-
6057
// Keys generated from Oakley group 14 (IKE group mod p2048 from RFC 3526)
6158
// aka SymCryptDlgroupDhSafePrimeParamsModp2048
6259
const SYMCRYPT_SELFTEST_DLKEY_2048 dhKey1 =
@@ -499,27 +496,16 @@ SymCryptDhSecretAgreementSelftest()
499496
PSYMCRYPT_DLKEY pkKey1 = NULL;
500497
PSYMCRYPT_DLKEY pkKey2 = NULL;
501498

502-
BYTE rgbSecret[sizeof(dhKey1.public)];
499+
BYTE rgbSecret[ sizeof(rgbDhKnownSecret) ];
503500

504501
pDlgroup = SymCryptDlgroupAllocate(
505502
SymCryptDlgroupDhSafePrimeParamsModp2048->nBitsOfP,
506503
0 );
507504
SYMCRYPT_FIPS_ASSERT( pDlgroup != NULL );
508505

509-
scError = SymCryptDlgroupSetValue(
510-
SymCryptDlgroupDhSafePrimeParamsModp2048->pcbPrimeP,
511-
SymCryptDlgroupDhSafePrimeParamsModp2048->nBitsOfP / 8,
512-
NULL, // pbPrimeQ
513-
0, // cbPrimeQ
514-
(PBYTE) &dhGenerator,
515-
sizeof(dhGenerator),
516-
SYMCRYPT_NUMBER_FORMAT_MSB_FIRST,
517-
NULL, // pHashAlgorithm
518-
NULL, // pbSeed
519-
0, // cbSeed
520-
0, // genCounter
521-
SYMCRYPT_DLGROUP_FIPS_NONE,
522-
pDlgroup);
506+
scError = SymCryptDlgroupSetValueSafePrime(
507+
SYMCRYPT_DLGROUP_DH_SAFEPRIMETYPE_IKE_3526,
508+
pDlgroup );
523509
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
524510

525511
pkKey1 = SymCryptDlkeyAllocate( pDlgroup );
@@ -531,7 +517,7 @@ SymCryptDhSecretAgreementSelftest()
531517
dhKey1.public,
532518
sizeof(dhKey1.public),
533519
SYMCRYPT_NUMBER_FORMAT_MSB_FIRST,
534-
SYMCRYPT_FLAG_DLKEY_DH | SYMCRYPT_FLAG_KEY_NO_FIPS, // flags
520+
SYMCRYPT_FLAG_DLKEY_DH | SYMCRYPT_FLAG_KEY_NO_FIPS,
535521
pkKey1 );
536522
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
537523

@@ -544,7 +530,7 @@ SymCryptDhSecretAgreementSelftest()
544530
dhKey2.public,
545531
sizeof(dhKey2.public),
546532
SYMCRYPT_NUMBER_FORMAT_MSB_FIRST,
547-
SYMCRYPT_FLAG_DLKEY_DH | SYMCRYPT_FLAG_KEY_NO_FIPS, // flags
533+
SYMCRYPT_FLAG_DLKEY_DH | SYMCRYPT_FLAG_KEY_NO_FIPS,
548534
pkKey2 );
549535
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
550536

@@ -558,8 +544,8 @@ SymCryptDhSecretAgreementSelftest()
558544
sizeof(rgbSecret) );
559545
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
560546

561-
#pragma warning( suppress: 4127 ) // conditional expression is constant
562-
SYMCRYPT_FIPS_ASSERT( sizeof(rgbSecret) == sizeof(rgbDhKnownSecret) );
547+
SymCryptInjectError( rgbSecret, sizeof(rgbSecret) );
548+
563549
SYMCRYPT_FIPS_ASSERT( memcmp( rgbSecret, rgbDhKnownSecret, sizeof(rgbDhKnownSecret) ) == 0 );
564550

565551
SymCryptDlkeyFree( pkKey2 );
@@ -592,7 +578,7 @@ SymCryptEcDhSecretAgreementSelftest( )
592578
sizeof(eckey1.Qxy),
593579
SYMCRYPT_NUMBER_FORMAT_MSB_FIRST,
594580
SYMCRYPT_ECPOINT_FORMAT_XY,
595-
SYMCRYPT_FLAG_ECKEY_ECDH | SYMCRYPT_FLAG_KEY_NO_FIPS, // flags
581+
SYMCRYPT_FLAG_ECKEY_ECDH | SYMCRYPT_FLAG_KEY_NO_FIPS,
596582
pkKey1);
597583
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
598584

@@ -606,7 +592,7 @@ SymCryptEcDhSecretAgreementSelftest( )
606592
sizeof(eckey2.Qxy),
607593
SYMCRYPT_NUMBER_FORMAT_MSB_FIRST,
608594
SYMCRYPT_ECPOINT_FORMAT_XY,
609-
SYMCRYPT_FLAG_ECKEY_ECDH | SYMCRYPT_FLAG_KEY_NO_FIPS, // flags
595+
SYMCRYPT_FLAG_ECKEY_ECDH | SYMCRYPT_FLAG_KEY_NO_FIPS,
610596
pkKey2);
611597
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
612598

@@ -620,7 +606,9 @@ SymCryptEcDhSecretAgreementSelftest( )
620606
sizeof(rgbSecret));
621607
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
622608

623-
SYMCRYPT_FIPS_ASSERT( memcmp(rgbSecret, rgbEcdhKnownSecret, sizeof(rgbSecret)) == 0);
609+
SymCryptInjectError( rgbSecret, sizeof(rgbSecret) );
610+
611+
SYMCRYPT_FIPS_ASSERT( memcmp(rgbSecret, rgbEcdhKnownSecret, sizeof(rgbSecret)) == 0 );
624612

625613
SymCryptEckeyFree( pkKey2 );
626614
SymCryptEckeyFree( pkKey1 );
@@ -647,6 +635,8 @@ SymCryptDsaSignVerifyTest( PCSYMCRYPT_DLKEY pkDlkey )
647635
cbSignature );
648636
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
649637

638+
SymCryptInjectError( pbSignature, cbSignature );
639+
650640
scError = SymCryptDsaVerify(
651641
pkDlkey,
652642
rgbSha256Hash,
@@ -672,8 +662,8 @@ SymCryptDsaSelftest( )
672662

673663
pDlgroup = SymCryptDlgroupAllocate(
674664
sizeof(dsaDlgroup.primeP) * 8,
675-
sizeof(dsaDlgroup.primeQ) * 8);
676-
SYMCRYPT_FIPS_ASSERT(pDlgroup != NULL);
665+
sizeof(dsaDlgroup.primeQ) * 8 );
666+
SYMCRYPT_FIPS_ASSERT( pDlgroup != NULL );
677667

678668
scError = SymCryptDlgroupSetValue(
679669
dsaDlgroup.primeP,
@@ -687,8 +677,8 @@ SymCryptDsaSelftest( )
687677
dsaDlgroup.seed,
688678
sizeof(dsaDlgroup.seed),
689679
dsaDlgroup.counter,
690-
SYMCRYPT_DLGROUP_FIPS_LATEST,
691-
pDlgroup);
680+
SYMCRYPT_DLGROUP_FIPS_NONE,
681+
pDlgroup );
692682
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
693683

694684
pkDlkey = SymCryptDlkeyAllocate( pDlgroup );
@@ -700,7 +690,7 @@ SymCryptDsaSelftest( )
700690
dsaKey.public,
701691
sizeof(dsaKey.public),
702692
SYMCRYPT_NUMBER_FORMAT_MSB_FIRST,
703-
SYMCRYPT_FLAG_DLKEY_DSA | SYMCRYPT_FLAG_KEY_NO_FIPS, // flags
693+
SYMCRYPT_FLAG_DLKEY_DSA | SYMCRYPT_FLAG_KEY_NO_FIPS,
704694
pkDlkey );
705695
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
706696

@@ -732,6 +722,8 @@ SymCryptEcDsaSignVerifyTest( PCSYMCRYPT_ECKEY pkEckey )
732722
cbSignature );
733723
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
734724

725+
SymCryptInjectError( pbSignature, cbSignature );
726+
735727
scError = SymCryptEcDsaVerify(
736728
pkEckey,
737729
rgbSha256Hash,
@@ -768,7 +760,7 @@ SymCryptEcDsaSelftest( )
768760
sizeof(eckey1.Qxy),
769761
SYMCRYPT_NUMBER_FORMAT_MSB_FIRST,
770762
SYMCRYPT_ECPOINT_FORMAT_XY,
771-
SYMCRYPT_FLAG_ECKEY_ECDSA | SYMCRYPT_FLAG_KEY_NO_FIPS, // flags
763+
SYMCRYPT_FLAG_ECKEY_ECDSA | SYMCRYPT_FLAG_KEY_NO_FIPS,
772764
pkEckey);
773765
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
774766

@@ -801,6 +793,8 @@ SymCryptRsaSignVerifyTest( PCSYMCRYPT_RSAKEY pkRsakey )
801793
&cbSignature );
802794
SYMCRYPT_FIPS_ASSERT( scError == SYMCRYPT_NO_ERROR );
803795

796+
SymCryptInjectError( pbSignature, cbSignature );
797+
804798
scError = SymCryptRsaPkcs1Verify(
805799
pkRsakey,
806800
rgbSha256Hash,

unittest/inc/perf.h

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,89 @@
55
//
66

77

8+
//
9+
// The performance infrastructure has some flexibility to use different clocks.
10+
//
11+
// On Windows we use HW counters when the target architecture is known, and QueryPerformanceCounter when it is not.
12+
// On Linux we use rdtscp directly for x86/AMD64, and clock_gettime otherwise.
13+
//
14+
15+
// Perf clock scaling uses a fixed function with cycle latency known at compile time as a measuring stick
16+
// to scale an arbitrary wall clock into a cycle clock (detecting CPU frequency changes and retaking measurements
17+
// as appropriate)
18+
// We currently only do not use this for ARM and ARM64 on Windows, where we can guarantee to access a raw CPU cycle counter
19+
#define ENABLE_PERF_CLOCK_SCALING ((BOOLEAN) TRUE)
20+
#define FIXED_TIME_LOOP() fixedTimeLoopPerfFunction( NULL, NULL, NULL, 0 )
21+
22+
#if (SYMCRYPT_MS_VC || SYMCRYPT_GNUC) && (SYMCRYPT_CPU_AMD64 || SYMCRYPT_CPU_X86)
23+
// Windows or Linux, x86 or AMD64
24+
#if SYMCRYPT_MS_VC
25+
#include <intrin.h>
26+
#else
27+
#include <x86intrin.h>
28+
#endif
29+
30+
FORCEINLINE
31+
ULONGLONG
32+
GET_PERF_CLOCK()
33+
{
34+
// Use rdtscp instead of rdtsc as it ensures earlier (non-store) instructions complete before it executes
35+
// This does not have the performance impact of serializing with cpuid
36+
unsigned int ui;
37+
ULONGLONG timestamp = __rdtscp(&ui);
38+
// Use lfence to prevent speculative execution of instructions _after_ the rdtscp (assumes lfence is serializing which is true after spectre mitigations)
39+
_mm_lfence();
40+
return timestamp;
41+
}
42+
43+
#define PERF_UNIT "cycles"
44+
45+
#elif SYMCRYPT_MS_VC && (SYMCRYPT_CPU_ARM || SYMCRYPT_CPU_ARM64)
46+
// Windows, Arm or Arm64
47+
#undef ENABLE_PERF_CLOCK_SCALING
48+
#define ENABLE_PERF_CLOCK_SCALING ((BOOLEAN) FALSE)
49+
#undef FIXED_TIME_LOOP
50+
51+
#define FIXED_TIME_LOOP() nullPerfFunction( NULL, NULL, NULL, 0 )
52+
53+
#if SYMCRYPT_CPU_ARM
54+
// Windows, Arm
55+
#define GET_PERF_CLOCK() __rdpmccntr64()
56+
#elif SYMCRYPT_CPU_ARM64
57+
// Windows, Arm64
58+
#define GET_PERF_CLOCK() _ReadStatusReg(ARM64_PMCCNTR_EL0)
59+
#endif
60+
#define PERF_UNIT "cycles"
61+
62+
#elif SYMCRYPT_MS_VC
63+
// Windows, Generic (no architecture specified at compile time)
64+
FORCEINLINE
65+
ULONGLONG
66+
GET_PERF_CLOCK()
67+
{
68+
LARGE_INTEGER t;
69+
QueryPerformanceCounter( &t );
70+
return (ULONGLONG) t.QuadPart;
71+
}
72+
73+
// We rely on performance scaling logic to convert the raw nanoseconds readings into cycles
74+
#define PERF_UNIT "cycles"
75+
76+
#elif SYMCRYPT_GNUC
77+
// Linux, not x86 or AMD64
78+
FORCEINLINE
79+
ULONGLONG
80+
GET_PERF_CLOCK()
81+
{
82+
struct timespec time;
83+
clock_gettime(CLOCK_MONOTONIC, &time);
84+
return time.tv_nsec;
85+
}
86+
87+
// We rely on performance scaling logic to convert the raw nanoseconds readings into cycles
88+
#define PERF_UNIT "cycles"
89+
#endif
90+
891
//
992
// The perf measuring is complicated. The P4 has all kind of alignment issues;
1093
// The L1 cache can't hold two cache lines that are aliased mod 64 kB. This leads

unittest/inc/test_lib.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <math.h>
3232
#include <unistd.h>
3333

34+
#include <chrono>
3435
#include <vector>
3536
#include <string>
3637
#include <memory>
@@ -51,6 +52,7 @@
5152
#include <math.h>
5253
#include <unistd.h>
5354

55+
#include <chrono>
5456
#include <vector>
5557
#include <string>
5658
#include <memory>
@@ -159,6 +161,7 @@
159161

160162
#include <powrprof.h>
161163

164+
#include <chrono>
162165
#include <vector>
163166
#include <string>
164167
#include <memory>
@@ -1377,6 +1380,8 @@ VOID measurePerfOfWipe();
13771380

13781381
VOID initPerfSystem();
13791382

1383+
VOID testSelftestPerf();
1384+
13801385
VOID testSelftest();
13811386

13821387
CHAR charToLower( CHAR c );

unittest/lib/perf.cpp

Lines changed: 0 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -7,97 +7,11 @@
77

88
#include "precomp.h"
99

10-
#include <chrono>
11-
1210
ULONGLONG g_minMeasurementClockTime = 0;
1311
ULONGLONG g_largeMeasurementClockTime = (ULONGLONG)-1;
1412
double g_perfScaleFactor;
1513
double g_tscFreq;
1614

17-
18-
//
19-
// The performance infrastructure has some flexibility to use different clocks.
20-
//
21-
// On Windows we use HW counters when the target architecture is known, and QueryPerformanceCounter when it is not.
22-
// On Linux we use rdtscp directly for x86/AMD64, and clock_gettime otherwise.
23-
//
24-
25-
// Perf clock scaling uses a fixed function with cycle latency known at compile time as a measuring stick
26-
// to scale an arbitrary wall clock into a cycle clock (detecting CPU frequency changes and retaking measurements
27-
// as appropriate)
28-
// We currently only do not use this for ARM and ARM64 on Windows, where we can guarantee to access a raw CPU cycle counter
29-
#define ENABLE_PERF_CLOCK_SCALING ((BOOLEAN) TRUE)
30-
#define FIXED_TIME_LOOP() fixedTimeLoopPerfFunction( NULL, NULL, NULL, 0 )
31-
32-
#if (SYMCRYPT_MS_VC || SYMCRYPT_GNUC) && (SYMCRYPT_CPU_AMD64 || SYMCRYPT_CPU_X86)
33-
// Windows or Linux, x86 or AMD64
34-
#if SYMCRYPT_MS_VC
35-
#include <intrin.h>
36-
#else
37-
#include <x86intrin.h>
38-
#endif
39-
40-
FORCEINLINE
41-
ULONGLONG
42-
GET_PERF_CLOCK()
43-
{
44-
// Use rdtscp instead of rdtsc as it ensures earlier (non-store) instructions complete before it executes
45-
// This does not have the performance impact of serializing with cpuid
46-
unsigned int ui;
47-
ULONGLONG timestamp = __rdtscp(&ui);
48-
// Use lfence to prevent speculative execution of instructions _after_ the rdtscp (assumes lfence is serializing which is true after spectre mitigations)
49-
_mm_lfence();
50-
return timestamp;
51-
}
52-
53-
#define PERF_UNIT "cycles"
54-
55-
#elif SYMCRYPT_MS_VC && (SYMCRYPT_CPU_ARM || SYMCRYPT_CPU_ARM64)
56-
// Windows, Arm or Arm64
57-
#undef ENABLE_PERF_CLOCK_SCALING
58-
#define ENABLE_PERF_CLOCK_SCALING ((BOOLEAN) FALSE)
59-
#undef FIXED_TIME_LOOP
60-
61-
#define FIXED_TIME_LOOP() nullPerfFunction( NULL, NULL, NULL, 0 )
62-
63-
#if SYMCRYPT_CPU_ARM
64-
// Windows, Arm
65-
#define GET_PERF_CLOCK() __rdpmccntr64()
66-
#elif SYMCRYPT_CPU_ARM64
67-
// Windows, Arm64
68-
#define GET_PERF_CLOCK() _ReadStatusReg(ARM64_PMCCNTR_EL0)
69-
#endif
70-
#define PERF_UNIT "cycles"
71-
72-
#elif SYMCRYPT_MS_VC
73-
// Windows, Generic (no architecture specified at compile time)
74-
FORCEINLINE
75-
ULONGLONG
76-
GET_PERF_CLOCK()
77-
{
78-
LARGE_INTEGER t;
79-
QueryPerformanceCounter( &t );
80-
return (ULONGLONG) t.QuadPart;
81-
}
82-
83-
// We rely on performance scaling logic to convert the raw nanoseconds readings into cycles
84-
#define PERF_UNIT "cycles"
85-
86-
#elif SYMCRYPT_GNUC
87-
// Linux, not x86 or AMD64
88-
FORCEINLINE
89-
ULONGLONG
90-
GET_PERF_CLOCK()
91-
{
92-
struct timespec time;
93-
clock_gettime(CLOCK_MONOTONIC, &time);
94-
return time.tv_nsec;
95-
}
96-
97-
// We rely on performance scaling logic to convert the raw nanoseconds readings into cycles
98-
#define PERF_UNIT "cycles"
99-
#endif
100-
10115
BOOLEAN g_perfClockScaling = ENABLE_PERF_CLOCK_SCALING;
10216

10317
#if SYMCRYPT_MS_VC
@@ -1197,9 +1111,6 @@ measurePerfOfAlgorithms()
11971111
i != g_algorithmImplementation.end();
11981112
i++ )
11991113
{
1200-
1201-
//iprint( "Performance testing %s/%s\n", (*i)->m_implementationName.c_str(), (*i)->m_algorithmName.c_str() );
1202-
//Sleep( 10 );
12031114
measurePerfOneAlg( *i );
12041115
}
12051116

0 commit comments

Comments
 (0)