Skip to content

Commit ddc015e

Browse files
authored
Implement verification cache (#3801)
This PR introduces a cache that allows the scanner to avoid emitting multiple requests to verify the same credential. In practice, it doesn't seem to reduce scan time at all, but it does seem to reduce the number of calls to FromData rather drastically. The cache is implemented as an opt-out feature: it is enabled by default and can be disabled with a new CLI flag (--no-verification-cache). If we don't like this default, we can change it. The metrics collection is hopefully not over-engineered; I wanted to create something useful here that could also accommodate future Prometheus configuration without needlessly complicating the implementation.
1 parent eeb5c04 commit ddc015e

File tree

10 files changed

+546
-36
lines changed

10 files changed

+546
-36
lines changed

main.go

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ import (
2020
"github.com/go-logr/logr"
2121
"github.com/jpillora/overseer"
2222
"github.com/mattn/go-isatty"
23+
"github.com/trufflesecurity/trufflehog/v3/pkg/cache/simple"
24+
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
25+
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache"
2326
"go.uber.org/automaxprocs/maxprocs"
2427

2528
"github.com/trufflesecurity/trufflehog/v3/pkg/analyzer"
@@ -76,6 +79,8 @@ var (
7679
excludeDetectors = cli.Flag("exclude-detectors", "Comma separated list of detector types to exclude. Protobuf name or IDs may be used, as well as ranges. IDs defined here take precedence over the include list.").String()
7780
jobReportFile = cli.Flag("output-report", "Write a scan report to the provided path.").Hidden().OpenFile(os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
7881

82+
noVerificationCache = cli.Flag("no-verification-cache", "Disable verification caching").Bool()
83+
7984
// Add feature flags
8085
forceSkipBinaries = cli.Flag("force-skip-binaries", "Force skipping binaries.").Bool()
8186
forceSkipArchives = cli.Flag("force-skip-archives", "Force skipping archives.").Bool()
@@ -480,25 +485,32 @@ func run(state overseer.State) {
480485
logFatal(err, "failed to configure results flag")
481486
}
482487

488+
verificationCacheMetrics := verificationcache.InMemoryMetrics{}
489+
483490
engConf := engine.Config{
484491
Concurrency: *concurrency,
485492
// The engine must always be configured with the list of
486493
// default detectors, which can be further filtered by the
487494
// user. The filters are applied by the engine and are only
488495
// subtractive.
489-
Detectors: append(defaults.DefaultDetectors(), conf.Detectors...),
490-
Verify: !*noVerification,
491-
IncludeDetectors: *includeDetectors,
492-
ExcludeDetectors: *excludeDetectors,
493-
CustomVerifiersOnly: *customVerifiersOnly,
494-
VerifierEndpoints: *verifiers,
495-
Dispatcher: engine.NewPrinterDispatcher(printer),
496-
FilterUnverified: *filterUnverified,
497-
FilterEntropy: *filterEntropy,
498-
VerificationOverlap: *allowVerificationOverlap,
499-
Results: parsedResults,
500-
PrintAvgDetectorTime: *printAvgDetectorTime,
501-
ShouldScanEntireChunk: *scanEntireChunk,
496+
Detectors: append(defaults.DefaultDetectors(), conf.Detectors...),
497+
Verify: !*noVerification,
498+
IncludeDetectors: *includeDetectors,
499+
ExcludeDetectors: *excludeDetectors,
500+
CustomVerifiersOnly: *customVerifiersOnly,
501+
VerifierEndpoints: *verifiers,
502+
Dispatcher: engine.NewPrinterDispatcher(printer),
503+
FilterUnverified: *filterUnverified,
504+
FilterEntropy: *filterEntropy,
505+
VerificationOverlap: *allowVerificationOverlap,
506+
Results: parsedResults,
507+
PrintAvgDetectorTime: *printAvgDetectorTime,
508+
ShouldScanEntireChunk: *scanEntireChunk,
509+
VerificationCacheMetrics: &verificationCacheMetrics,
510+
}
511+
512+
if !*noVerificationCache {
513+
engConf.VerificationResultCache = simple.NewCache[detectors.Result]()
502514
}
503515

504516
if *compareDetectionStrategies {
@@ -518,6 +530,20 @@ func run(state overseer.State) {
518530
logFatal(err, "error running scan")
519531
}
520532

533+
verificationCacheMetrics := struct {
534+
Hits int32
535+
Misses int32
536+
HitsWasted int32
537+
AttemptsSaved int32
538+
VerificationTimeSpentMS int64
539+
}{
540+
Hits: verificationCacheMetrics.ResultCacheHits.Load(),
541+
Misses: verificationCacheMetrics.ResultCacheMisses.Load(),
542+
HitsWasted: verificationCacheMetrics.ResultCacheHitsWasted.Load(),
543+
AttemptsSaved: verificationCacheMetrics.CredentialVerificationsSaved.Load(),
544+
VerificationTimeSpentMS: verificationCacheMetrics.FromDataVerifyTimeSpentMS.Load(),
545+
}
546+
521547
// Print results.
522548
logger.Info("finished scanning",
523549
"chunks", metrics.ChunksScanned,
@@ -526,6 +552,7 @@ func run(state overseer.State) {
526552
"unverified_secrets", metrics.UnverifiedSecretsFound,
527553
"scan_duration", metrics.ScanDuration.String(),
528554
"trufflehog_version", version.BuildVersion,
555+
"verification_caching", verificationCacheMetrics,
529556
)
530557

531558
if metrics.hasFoundResults && *fail {

pkg/detectors/detectors.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ type Result struct {
9090
// DetectorName is the name of the Detector. Used for custom detectors.
9191
DetectorName string
9292
Verified bool
93+
// VerificationFromCache indicates whether this result's verification result came from the verification cache rather
94+
// than an actual remote request.
95+
VerificationFromCache bool
9396
// Raw contains the raw secret identifier data. Prefer IDs over secrets since it is used for deduping after hashing.
9497
Raw []byte
9598
// RawV2 contains the raw secret identifier that is a combination of both the ID and the secret.
@@ -111,7 +114,15 @@ type Result struct {
111114
AnalysisInfo map[string]string
112115
}
113116

114-
// SetVerificationError is the only way to set a verification error. Any sensitive values should be passed-in as secrets to be redacted.
117+
// CopyVerificationInfo clones verification info (status and error) from another Result struct. This is used when
118+
// loading verification info from a verification cache. (A method is necessary because verification errors are not
119+
// exported, to prevent the accidental storage of sensitive information in them.)
//
// Only the Verified flag and the unexported verification error are copied. VerificationFromCache is not touched by
// this method, so a caller that wants to mark the result as cache-derived has to set that field itself. Both r and
// from are dereferenced unconditionally, so passing a nil pointer for either panics.
120+
func (r *Result) CopyVerificationInfo(from *Result) {
121+
r.Verified = from.Verified
122+
// The error value is copied as-is (no re-redaction is performed here).
r.verificationError = from.verificationError
123+
}
124+
125+
// SetVerificationError is the only way to set a new verification error. Any sensitive values should be passed-in as secrets to be redacted.
115126
func (r *Result) SetVerificationError(err error, secrets ...string) {
116127
if err != nil {
117128
r.verificationError = redactSecrets(err, secrets...)

pkg/engine/engine.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/adrg/strutil"
1414
"github.com/adrg/strutil/metrics"
1515
lru "github.com/hashicorp/golang-lru/v2"
16+
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache"
1617
"google.golang.org/protobuf/proto"
1718

1819
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
@@ -145,6 +146,9 @@ type Config struct {
145146

146147
// VerificationOverlapWorkerMultiplier is used to determine the number of verification overlap workers to spawn.
147148
VerificationOverlapWorkerMultiplier int
149+
150+
VerificationResultCache verificationcache.ResultCache
151+
VerificationCacheMetrics verificationcache.MetricsReporter
148152
}
149153

150154
// Engine represents the core scanning engine responsible for detecting secrets in input data.
@@ -153,9 +157,10 @@ type Config struct {
153157
// customization through various options and configurations.
154158
type Engine struct {
155159
// CLI flags.
156-
concurrency int
157-
decoders []decoders.Decoder
158-
detectors []detectors.Detector
160+
concurrency int
161+
decoders []decoders.Decoder
162+
detectors []detectors.Detector
163+
verificationCache *verificationcache.VerificationCache
159164
// Any detectors configured to override sources' verification flags
160165
detectorVerificationOverrides map[config.DetectorID]bool
161166

@@ -216,10 +221,13 @@ type Engine struct {
216221

217222
// NewEngine creates a new Engine instance with the provided configuration.
218223
func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
224+
verificationCache := verificationcache.New(cfg.VerificationResultCache, cfg.VerificationCacheMetrics)
225+
219226
engine := &Engine{
220227
concurrency: cfg.Concurrency,
221228
decoders: cfg.Decoders,
222229
detectors: cfg.Detectors,
230+
verificationCache: verificationCache,
223231
dispatcher: cfg.Dispatcher,
224232
verify: cfg.Verify,
225233
filterUnverified: cfg.FilterUnverified,
@@ -1056,7 +1064,12 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
10561064
t := time.AfterFunc(detectionTimeout+1*time.Second, func() {
10571065
ctx.Logger().Error(nil, "a detector ignored the context timeout")
10581066
})
1059-
results, err := data.detector.Detector.FromData(ctx, data.chunk.Verify, matchBytes)
1067+
results, err := e.verificationCache.FromData(
1068+
ctx,
1069+
data.detector.Detector,
1070+
data.chunk.Verify,
1071+
data.chunk.SecretID != 0,
1072+
matchBytes)
10601073
t.Stop()
10611074
cancel()
10621075
if err != nil {

pkg/output/json.go

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ func (p *JSONPrinter) Print(_ context.Context, r *detectors.ResultWithMetadata)
4040
// DetectorDescription is the description of the Detector.
4141
DetectorDescription string
4242
// DecoderName is the string name of the DecoderType.
43-
DecoderName string
44-
Verified bool
45-
VerificationError string `json:",omitempty"`
43+
DecoderName string
44+
Verified bool
45+
VerificationError string `json:",omitempty"`
46+
VerificationFromCache bool
4647
// Raw contains the raw secret data.
4748
Raw string
4849
// RawV2 contains the raw secret identifier that is a combination of both the ID and the secret.
@@ -54,21 +55,22 @@ func (p *JSONPrinter) Print(_ context.Context, r *detectors.ResultWithMetadata)
5455
ExtraData map[string]string
5556
StructuredData *detectorspb.StructuredData
5657
}{
57-
SourceMetadata: r.SourceMetadata,
58-
SourceID: r.SourceID,
59-
SourceType: r.SourceType,
60-
SourceName: r.SourceName,
61-
DetectorType: r.DetectorType,
62-
DetectorName: r.DetectorType.String(),
63-
DetectorDescription: r.DetectorDescription,
64-
DecoderName: r.DecoderType.String(),
65-
Verified: r.Verified,
66-
VerificationError: verificationErr,
67-
Raw: string(r.Raw),
68-
RawV2: string(r.RawV2),
69-
Redacted: r.Redacted,
70-
ExtraData: r.ExtraData,
71-
StructuredData: r.StructuredData,
58+
SourceMetadata: r.SourceMetadata,
59+
SourceID: r.SourceID,
60+
SourceType: r.SourceType,
61+
SourceName: r.SourceName,
62+
DetectorType: r.DetectorType,
63+
DetectorName: r.DetectorType.String(),
64+
DetectorDescription: r.DetectorDescription,
65+
DecoderName: r.DecoderType.String(),
66+
Verified: r.Verified,
67+
VerificationError: verificationErr,
68+
VerificationFromCache: r.VerificationFromCache,
69+
Raw: string(r.Raw),
70+
RawV2: string(r.RawV2),
71+
Redacted: r.Redacted,
72+
ExtraData: r.ExtraData,
73+
StructuredData: r.StructuredData,
7274
}
7375
out, err := json.Marshal(v)
7476
if err != nil {

pkg/output/plain.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ var (
2222
boldGreenPrinter = color.New(color.Bold, color.FgHiGreen)
2323
whitePrinter = color.New(color.FgWhite)
2424
boldWhitePrinter = color.New(color.Bold, color.FgWhite)
25+
cyanPrinter = color.New(color.FgCyan)
2526
)
2627

2728
// PlainPrinter is a printer that prints results in plain text format.
@@ -56,6 +57,9 @@ func (p *PlainPrinter) Print(_ context.Context, r *detectors.ResultWithMetadata)
5657
yellowPrinter.Printf("Verification issue: %s\n", out.VerificationError)
5758
}
5859
}
60+
if r.VerificationFromCache {
61+
cyanPrinter.Print("(Verification info cached)\n")
62+
}
5963
printer.Printf("Detector Type: %s\n", out.DetectorType)
6064
printer.Printf("Decoder Type: %s\n", out.DecoderType)
6165
printer.Printf("Raw result: %s\n", whitePrinter.Sprint(out.Raw))
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package verificationcache
2+
3+
import (
4+
"sync/atomic"
5+
"time"
6+
)
7+
8+
// InMemoryMetrics is a MetricsReporter that stores reported metrics in memory for retrieval at the end of a scan.
//
// Every field is a typed atomic counter, so the zero value is ready to use and the totals can be read with Load
// while other goroutines are still reporting. Because the fields are sync/atomic types, an InMemoryMetrics value
// should not be copied after first use; share it by pointer (all methods use pointer receivers).
9+
type InMemoryMetrics struct {
10+
// CredentialVerificationsSaved is the running total reported through AddCredentialVerificationsSaved.
CredentialVerificationsSaved atomic.Int32
11+
// FromDataVerifyTimeSpentMS is the running total, in whole milliseconds, reported through
// AddFromDataVerifyTimeSpent.
FromDataVerifyTimeSpentMS atomic.Int64
12+
// ResultCacheHits is the running total reported through AddResultCacheHits.
ResultCacheHits atomic.Int32
13+
// ResultCacheHitsWasted is the running total reported through AddResultCacheHitsWasted.
ResultCacheHitsWasted atomic.Int32
14+
// ResultCacheMisses is the running total reported through AddResultCacheMisses.
ResultCacheMisses atomic.Int32
15+
}
16+
17+
var _ MetricsReporter = (*InMemoryMetrics)(nil)
18+
19+
// AddCredentialVerificationsSaved atomically adds count to the CredentialVerificationsSaved counter.
// count is converted to int32 before being added, so values outside the int32 range are truncated.
func (m *InMemoryMetrics) AddCredentialVerificationsSaved(count int) {
20+
m.CredentialVerificationsSaved.Add(int32(count))
21+
}
22+
23+
// AddFromDataVerifyTimeSpent atomically adds wallTime, expressed in whole milliseconds, to the
// FromDataVerifyTimeSpentMS counter. The conversion happens per call, so any sub-millisecond remainder of each
// reported duration is dropped rather than accumulated.
func (m *InMemoryMetrics) AddFromDataVerifyTimeSpent(wallTime time.Duration) {
24+
m.FromDataVerifyTimeSpentMS.Add(wallTime.Milliseconds())
25+
}
26+
27+
// AddResultCacheHits atomically adds count to the ResultCacheHits counter.
// count is converted to int32 before being added, so values outside the int32 range are truncated.
func (m *InMemoryMetrics) AddResultCacheHits(count int) {
28+
m.ResultCacheHits.Add(int32(count))
29+
}
30+
31+
// AddResultCacheMisses atomically adds count to the ResultCacheMisses counter.
// count is converted to int32 before being added, so values outside the int32 range are truncated.
func (m *InMemoryMetrics) AddResultCacheMisses(count int) {
32+
m.ResultCacheMisses.Add(int32(count))
33+
}
34+
35+
// AddResultCacheHitsWasted atomically adds count to the ResultCacheHitsWasted counter.
// count is converted to int32 before being added, so values outside the int32 range are truncated.
func (m *InMemoryMetrics) AddResultCacheHitsWasted(count int) {
36+
m.ResultCacheHitsWasted.Add(int32(count))
37+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package verificationcache
2+
3+
import "time"
4+
5+
// MetricsReporter is an interface used by a verification cache to report various metrics related to its operation.
6+
// Implementations must be thread-safe.
//
// InMemoryMetrics in this package is one implementation; it treats every argument as an increment to an atomic
// running total.
7+
type MetricsReporter interface {
8+
// AddCredentialVerificationsSaved records "saved" verification attempts, which is when credential verification
9+
// status is loaded from the cache instead of retrieved from a remote verification endpoint. This number might be
10+
// smaller than the cache hit count due to cache hit "wasting"; see AddResultCacheHitsWasted for more information.
11+
AddCredentialVerificationsSaved(count int)
12+
13+
// AddFromDataVerifyTimeSpent records wall time spent in calls to detector.FromData with verify=true.
14+
AddFromDataVerifyTimeSpent(wallTime time.Duration)
15+
16+
// AddResultCacheHits records result cache hits. Not all cache hits result in elided remote verification requests
17+
// due to cache hit "wasting"; see AddResultCacheHitsWasted for more information.
18+
AddResultCacheHits(count int)
19+
20+
// AddResultCacheMisses records result cache misses.
21+
AddResultCacheMisses(count int)
22+
23+
// AddResultCacheHitsWasted records "wasted" result cache hits. A "wasted" result cache hit is a result cache hit
24+
// that does not elide a remote verification request because there are other secret findings in the relevant chunk
25+
// that are not cached. When this happens, the detector's FromData method must be called anyway, so the cache hit
26+
// doesn't save any remote requests.
27+
AddResultCacheHitsWasted(count int)
28+
}

pkg/verificationcache/result_cache.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package verificationcache
2+
3+
import (
4+
"github.com/trufflesecurity/trufflehog/v3/pkg/cache"
5+
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
6+
)
7+
8+
// ResultCache is a cache that holds individual detector results. It serves as a component of a VerificationCache.
// It is declared as a named type over cache.Cache instantiated with detectors.Result.
// NOTE(review): the engine config appears to leave this unset when caching is disabled; confirm how
// VerificationCache handles a nil ResultCache.
9+
type ResultCache cache.Cache[detectors.Result]

0 commit comments

Comments
 (0)