Skip to content

Commit 22032f7

Browse files
authored
[refactor] - detectorKeywordMatcher initialization (#3687)
* Move logic to build keyword matcher into apk file and use singleton
* remove mutex
* inline
1 parent 01ad208 commit 22032f7

File tree

2 files changed

+78
-65
lines changed

2 files changed

+78
-65
lines changed

pkg/engine/defaults/defaults.go

Lines changed: 0 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
11
package defaults
22

33
import (
4-
"bytes"
5-
"strings"
6-
"sync"
7-
8-
ahocorasick "github.com/BobuSumisu/aho-corasick"
9-
104
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
115
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/abbysale"
126
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/abuseipdb"
@@ -1699,60 +1693,3 @@ func DefaultDetectorTypesImplementing[T any]() map[detectorspb.DetectorType]stru
16991693
}
17001694
return out
17011695
}
1702-
1703-
func defaultDetectorKeywords() []string {
1704-
allDetectors := buildDetectorList()
1705-
1706-
// Remove keywords that cause lots of false positives.
1707-
var exclusions = []string{
1708-
"AKIA", "SG.", "pat", "token", "gh", "github", "sql", "database", "http", "key", "api-", "sdk-", "float", "-us", "gh", "pat", "token", "sid", "http", "private", "key", "segment", "close", "protocols", "verifier", "box", "privacy", "dm", "sl.", "vf", "flat",
1709-
}
1710-
1711-
var keywords []string
1712-
exclusionSet := make(map[string]struct{})
1713-
for _, excl := range exclusions {
1714-
exclusionSet[strings.ToLower(excl)] = struct{}{}
1715-
}
1716-
1717-
// Aggregate all keywords from detectors.
1718-
for _, detector := range allDetectors {
1719-
for _, kw := range detector.Keywords() {
1720-
kwLower := strings.ToLower(kw)
1721-
if _, excluded := exclusionSet[kwLower]; !excluded {
1722-
keywords = append(keywords, kwLower)
1723-
}
1724-
}
1725-
}
1726-
return keywords
1727-
}
1728-
1729-
// DefaultDetectorKeywordMatcher encapsulates the Aho-Corasick trie for keyword matching.
1730-
type DefaultDetectorKeywordMatcher struct {
1731-
mu sync.RWMutex
1732-
trie *ahocorasick.Trie
1733-
}
1734-
1735-
// NewDefaultDetectorKeywordMatcher creates a new DefaultDetectorKeywordMatcher.
1736-
func NewDefaultDetectorKeywordMatcher() *DefaultDetectorKeywordMatcher {
1737-
keywords := defaultDetectorKeywords()
1738-
return &DefaultDetectorKeywordMatcher{trie: ahocorasick.NewTrieBuilder().AddStrings(keywords).Build()}
1739-
}
1740-
1741-
// FindKeywords scans the input text and returns a slice of matched keywords.
1742-
func (km *DefaultDetectorKeywordMatcher) FindKeywords(text []byte) []string {
1743-
km.mu.RLock()
1744-
defer km.mu.RUnlock()
1745-
1746-
matches := km.trie.Match(bytes.ToLower(text))
1747-
found := make([]string, 0, len(matches))
1748-
seen := make(map[string]struct{}) // To avoid duplicate entries
1749-
1750-
for _, match := range matches {
1751-
keyword := match.MatchString()
1752-
if _, exists := seen[keyword]; !exists {
1753-
found = append(found, keyword)
1754-
seen[keyword] = struct{}{}
1755-
}
1756-
}
1757-
return found
1758-
}

pkg/handlers/apk.go

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ import (
1010
"path/filepath"
1111
"regexp"
1212
"strings"
13+
"sync"
1314
"time"
1415

16+
ahocorasick "github.com/BobuSumisu/aho-corasick"
1517
"github.com/avast/apkparser"
1618
dextk "github.com/csnewman/dextk"
1719

@@ -29,6 +31,80 @@ import (
2931
// ToDo: Scan nested APKs (aka XAPK files). ATM the archive.go file will skip over them.
3032
// ToDo: Provide file location information to secret output.
3133

34+
var (
35+
keywordMatcherOnce sync.Once
36+
keywordMatcher *detectorKeywordMatcher
37+
)
38+
39+
func defaultDetectorKeywords() []string {
40+
allDetectors := defaults.DefaultDetectors()
41+
42+
// Remove keywords that cause lots of false positives.
43+
var exclusions = []string{
44+
"AKIA", "SG.", "pat", "token", "gh", "github", "sql", "database", "http", "key", "api-", "sdk-", "float", "-us", "gh", "pat", "token", "sid", "http", "private", "key", "segment", "close", "protocols", "verifier", "box", "privacy", "dm", "sl.", "vf", "flat",
45+
}
46+
47+
var keywords []string
48+
exclusionSet := make(map[string]struct{})
49+
for _, excl := range exclusions {
50+
exclusionSet[strings.ToLower(excl)] = struct{}{}
51+
}
52+
53+
// Aggregate all keywords from detectors.
54+
for _, detector := range allDetectors {
55+
for _, kw := range detector.Keywords() {
56+
kwLower := strings.ToLower(kw)
57+
if _, excluded := exclusionSet[kwLower]; !excluded {
58+
keywords = append(keywords, kwLower)
59+
}
60+
}
61+
}
62+
return keywords
63+
}
64+
65+
// detectorKeywordMatcher encapsulates the Aho-Corasick trie for efficient keyword matching.
66+
// It is used to scan APK file contents for keywords associated with our credential detectors.
67+
// By only processing files/sections that contain these keywords, we can efficiently filter
68+
// out irrelevant data and focus on content that is more likely to contain credentials.
69+
// The Aho-Corasick algorithm provides fast, simultaneous matching of multiple patterns in
70+
// a single pass through the text, which is crucial for performance when scanning large APK files.
71+
type detectorKeywordMatcher struct{ trie *ahocorasick.Trie }
72+
73+
// getDefaultDetectorKeywordMatcher creates or returns the singleton detectorKeywordMatcher.
74+
// This is implemented as a singleton for several important reasons:
75+
// 1. Building the Aho-Corasick trie is computationally expensive and should only be done once.
76+
// 2. The trie is immutable after construction and can be safely shared across goroutines.
77+
// 3. The keyword list from the detectors is static for a given program execution.
78+
// 4. Memory efficiency - we avoid duplicating the trie structure for each handler instance.
79+
func getDefaultDetectorKeywordMatcher() *detectorKeywordMatcher {
80+
keywordMatcherOnce.Do(func() {
81+
keywords := defaultDetectorKeywords()
82+
keywordMatcher = &detectorKeywordMatcher{
83+
trie: ahocorasick.NewTrieBuilder().AddStrings(keywords).Build(),
84+
}
85+
})
86+
return keywordMatcher
87+
}
88+
89+
// FindKeywords scans the input text and returns a slice of matched keywords.
90+
// The method is safe for concurrent use and takes no lock, since the trie is immutable after construction.
91+
// It returns unique matches only, eliminating duplicates that may occur when
92+
// the same keyword appears multiple times in the input text.
93+
func (km *detectorKeywordMatcher) FindKeywords(text []byte) []string {
94+
matches := km.trie.Match(bytes.ToLower(text))
95+
found := make([]string, 0, len(matches))
96+
seen := make(map[string]struct{}) // To avoid duplicate entries
97+
98+
for _, match := range matches {
99+
keyword := match.MatchString()
100+
if _, exists := seen[keyword]; !exists {
101+
found = append(found, keyword)
102+
seen[keyword] = struct{}{}
103+
}
104+
}
105+
return found
106+
}
107+
32108
var (
33109
stringInstructionType = "const-string"
34110
targetInstructionTypes = []string{stringInstructionType, "iput-object", "sput-object", "const-class", "invoke-virtual", "invoke-super", "invoke-direct", "invoke-static", "invoke-interface"}
@@ -53,15 +129,15 @@ var (
53129

54130
// apkHandler handles apk archive formats.
55131
type apkHandler struct {
56-
keywordMatcher *defaults.DefaultDetectorKeywordMatcher
132+
keywordMatcher *detectorKeywordMatcher
57133
*defaultHandler
58134
}
59135

60136
// newAPKHandler creates an apkHandler.
61137
func newAPKHandler() *apkHandler {
62138
return &apkHandler{
63139
defaultHandler: newDefaultHandler(apkHandlerType),
64-
keywordMatcher: defaults.NewDefaultDetectorKeywordMatcher(),
140+
keywordMatcher: getDefaultDetectorKeywordMatcher(),
65141
}
66142
}
67143

0 commit comments

Comments
 (0)