Skip to content

Commit e40bcbd

Browse files
authored
perf(local): only load advisories that are about the packages being scanned (#2241)
This updates the local matcher to have it skip advisories that don't have at least one affected entry with a package name matching one of the packages being scanned in the current run, which can greatly reduce the peak memory usage for databases like Ubuntu (going from something like 10gb down to 1gb). Since we cache databases based on their ecosystem only, this does mean subsequent calls to `LocalMatcher#MatchVulnerabilities` will not give any results for packages that were not present in the first call - while this shouldn't be a problem currently since we handle creating the `VulnerabilityMatcher` as part of scanning, I've added a basic guard that returns an error if the function is called with any "partial" database cached to catch this (be it on purpose or because of a bug) This should not impact guided remediation since it explicitly loads the database before doing any work meaning this change won't help it but should also not hurt it Resolves #2217 (again)
1 parent df179f4 commit e40bcbd

File tree

3 files changed

+191
-21
lines changed

3 files changed

+191
-21
lines changed

internal/clients/clientimpl/localmatcher/localmatcher.go

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,15 @@ func NewLocalMatcher(localDBPath string, userAgent string, downloadDB bool) (*Lo
4949
func (matcher *LocalMatcher) MatchVulnerabilities(ctx context.Context, invs []*extractor.Package) ([][]*osvschema.Vulnerability, error) {
5050
results := make([][]*osvschema.Vulnerability, 0, len(invs))
5151

52+
// ensure all databases loaded so far have been fully loaded; this is just a
53+
// basic safeguard since we don't actually currently attempt to reuse matchers
54+
// across scans, and its possible we never will, so we don't need to be smart
55+
for _, db := range matcher.dbs {
56+
if db.Partial {
57+
return nil, errors.New("local matcher cannot be (re)used with a partially loaded database")
58+
}
59+
}
60+
5261
for _, inv := range invs {
5362
if ctx.Err() != nil {
5463
return nil, ctx.Err()
@@ -77,7 +86,7 @@ func (matcher *LocalMatcher) MatchVulnerabilities(ctx context.Context, invs []*e
7786
eco = "GIT"
7887
}
7988

80-
db, err := matcher.loadDBFromCache(ctx, eco)
89+
db, err := matcher.loadDBFromCache(ctx, eco, invs)
8190

8291
if err != nil {
8392
// no logging here as the loader will have already done that
@@ -94,13 +103,15 @@ func (matcher *LocalMatcher) MatchVulnerabilities(ctx context.Context, invs []*e
94103

95104
// LoadEcosystem tries to preload the ecosystem into the cache, and returns an error if the ecosystem
96105
// cannot be loaded.
106+
//
107+
// Preloaded databases include every advisory, so can be reused.
97108
func (matcher *LocalMatcher) LoadEcosystem(ctx context.Context, eco osvecosystem.Parsed) error {
98-
_, err := matcher.loadDBFromCache(ctx, eco.Ecosystem)
109+
_, err := matcher.loadDBFromCache(ctx, eco.Ecosystem, nil)
99110

100111
return err
101112
}
102113

103-
func (matcher *LocalMatcher) loadDBFromCache(ctx context.Context, eco osvschema.Ecosystem) (*ZipDB, error) {
114+
func (matcher *LocalMatcher) loadDBFromCache(ctx context.Context, eco osvschema.Ecosystem, invs []*extractor.Package) (*ZipDB, error) {
104115
if db, ok := matcher.dbs[eco]; ok {
105116
return db, nil
106117
}
@@ -109,7 +120,15 @@ func (matcher *LocalMatcher) loadDBFromCache(ctx context.Context, eco osvschema.
109120
return nil, matcher.failedDBs[eco]
110121
}
111122

112-
db, err := NewZippedDB(ctx, matcher.dbBasePath, string(eco), fmt.Sprintf("%s/%s/all.zip", zippedDBRemoteHost, eco), matcher.userAgent, !matcher.downloadDB)
123+
db, err := NewZippedDB(
124+
ctx,
125+
matcher.dbBasePath,
126+
string(eco),
127+
fmt.Sprintf("%s/%s/all.zip", zippedDBRemoteHost, eco),
128+
matcher.userAgent,
129+
!matcher.downloadDB,
130+
invs,
131+
)
113132

114133
if err != nil {
115134
matcher.failedDBs[eco] = err

internal/clients/clientimpl/localmatcher/zip.go

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"path"
1717
"strings"
1818

19+
"github.com/google/osv-scalibr/extractor"
1920
"github.com/google/osv-scanner/v2/internal/cmdlogger"
2021
"github.com/google/osv-scanner/v2/internal/imodels"
2122
"github.com/google/osv-scanner/v2/internal/utility/vulns"
@@ -35,6 +36,10 @@ type ZipDB struct {
3536
Vulnerabilities []osvschema.Vulnerability
3637
// User agent to query with
3738
UserAgent string
39+
40+
// whether this database only has some of the advisories
41+
// loaded from the underlying zip file
42+
Partial bool
3843
}
3944

4045
var ErrOfflineDatabaseNotFound = errors.New("no offline version of the OSV database is available")
@@ -143,9 +148,28 @@ func (db *ZipDB) fetchZip(ctx context.Context) ([]byte, error) {
143148
return body, nil
144149
}
145150

151+
func mightAffectPackages(v osvschema.Vulnerability, names []string) bool {
152+
for _, affected := range v.Affected {
153+
for _, name := range names {
154+
if affected.Package.Name == name {
155+
return true
156+
}
157+
158+
// "name" will be the git repository in the case of the GIT ecosystem
159+
for _, ran := range affected.Ranges {
160+
if ran.Repo == name {
161+
return true
162+
}
163+
}
164+
}
165+
}
166+
167+
return false
168+
}
169+
146170
// Loads the given zip file into the database as an OSV.
147171
// It is assumed that the file is JSON and in the working directory of the db
148-
func (db *ZipDB) loadZipFile(zipFile *zip.File) {
172+
func (db *ZipDB) loadZipFile(zipFile *zip.File, names []string) {
149173
file, err := zipFile.Open()
150174
if err != nil {
151175
cmdlogger.Warnf("Could not read %s: %v", zipFile.Name, err)
@@ -169,16 +193,23 @@ func (db *ZipDB) loadZipFile(zipFile *zip.File) {
169193
return
170194
}
171195

172-
db.Vulnerabilities = append(db.Vulnerabilities, vulnerability)
196+
// if we have been provided a list of package names, only load advisories
197+
// that might actually affect those packages, rather than all advisories
198+
if len(names) == 0 || mightAffectPackages(vulnerability, names) {
199+
db.Vulnerabilities = append(db.Vulnerabilities, vulnerability)
200+
}
173201
}
174202

175203
// load fetches a zip archive of the OSV database and loads known vulnerabilities
176204
// from it (which are assumed to be in json files following the OSV spec).
177205
//
206+
// If a list of package names is provided, then only advisories with at least
207+
// one affected entry for a listed package will be loaded.
208+
//
178209
// Internally, the archive is cached along with the date that it was fetched
179210
// so that a new version of the archive is only downloaded if it has been
180211
// modified, per HTTP caching standards.
181-
func (db *ZipDB) load(ctx context.Context) error {
212+
func (db *ZipDB) load(ctx context.Context, names []string) error {
182213
db.Vulnerabilities = []osvschema.Vulnerability{}
183214

184215
body, err := db.fetchZip(ctx)
@@ -198,21 +229,33 @@ func (db *ZipDB) load(ctx context.Context) error {
198229
continue
199230
}
200231

201-
db.loadZipFile(zipFile)
232+
db.loadZipFile(zipFile, names)
202233
}
203234

204235
return nil
205236
}
206237

207-
func NewZippedDB(ctx context.Context, dbBasePath, name, url, userAgent string, offline bool) (*ZipDB, error) {
238+
func NewZippedDB(ctx context.Context, dbBasePath, name, url, userAgent string, offline bool, invs []*extractor.Package) (*ZipDB, error) {
208239
db := &ZipDB{
209240
Name: name,
210241
ArchiveURL: url,
211242
Offline: offline,
212243
StoredAt: path.Join(dbBasePath, name, "all.zip"),
213244
UserAgent: userAgent,
245+
246+
// we only fully load the database if we're not provided a list of packages
247+
Partial: len(invs) != 0,
214248
}
215-
if err := db.load(ctx); err != nil {
249+
names := make([]string, 0, len(invs))
250+
251+
// map the packages to their names ahead of loading,
252+
// to make things simpler and reduce double working
253+
for _, inv := range invs {
254+
in := imodels.FromInventory(inv)
255+
names = append(names, in.Name())
256+
}
257+
258+
if err := db.load(ctx, names); err != nil {
216259
return nil, fmt.Errorf("unable to fetch OSV database: %w", err)
217260
}
218261

0 commit comments

Comments
 (0)