Skip to content

Commit ccc06b7

Browse files
tonistiigi authored and jsternberg committed
bboltcachestorage: mitigate corrupt boltdb cache after panic
There are some reports that the nosync configuration of the boltdb can cause panics on restarts due to corruption of the database. Mitigate by panic recovery until there is a better solution. Co-authored-by: Tonis Tiigi <[email protected]> Signed-off-by: Jonathan A. Sternberg <[email protected]>
1 parent 1c55173 commit ccc06b7

File tree

1 file changed

+55
-2
lines changed

1 file changed

+55
-2
lines changed

solver/bboltcachestorage/storage.go

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@ import (
44
"bytes"
55
"encoding/json"
66
"fmt"
7+
"os"
78

9+
"github.com/moby/buildkit/identity"
810
"github.com/moby/buildkit/solver"
11+
"github.com/moby/buildkit/util/bklog"
912
digest "github.com/opencontainers/go-digest"
1013
"github.com/pkg/errors"
1114
bolt "go.etcd.io/bbolt"
@@ -23,10 +26,12 @@ type Store struct {
2326
}
2427

2528
func NewStore(dbPath string) (*Store, error) {
26-
db, err := bolt.Open(dbPath, 0600, nil)
29+
db, err := safeOpenDB(dbPath)
2730
if err != nil {
28-
return nil, errors.Wrapf(err, "failed to open database file %s", dbPath)
31+
return nil, err
2932
}
33+
34+
// Initialize the database with the needed buckets if they do not exist.
3035
if err := db.Update(func(tx *bolt.Tx) error {
3136
for _, b := range []string{resultBucket, linksBucket, byResultBucket, backlinksBucket} {
3237
if _, err := tx.CreateBucketIfNotExists([]byte(b)); err != nil {
@@ -455,3 +460,51 @@ func isEmptyBucket(b *bolt.Bucket) bool {
455460
k, _ := b.Cursor().First()
456461
return k == nil
457462
}
463+
464+
// safeOpenDB opens a bolt database and recovers from panic that
465+
// can be caused by a corrupted database file.
466+
func safeOpenDB(dbPath string) (db *bolt.DB, err error) {
467+
defer func() {
468+
if r := recover(); r != nil {
469+
err = errors.Errorf("%v", r)
470+
}
471+
472+
// If we get an error when opening the database, but we have
473+
// access to the file and the file looks like it has content,
474+
// then fallback to resetting the database since the database
475+
// may be corrupt.
476+
if err != nil && fileHasContent(dbPath) {
477+
db, err = fallbackOpenDB(dbPath, err)
478+
}
479+
}()
480+
return openDB(dbPath)
481+
}
482+
483+
// fallbackOpenDB performs database recovery and opens the new database
484+
// file when the database fails to open. Called after the first database
485+
// open fails.
486+
func fallbackOpenDB(dbPath string, openErr error) (*bolt.DB, error) {
487+
backupPath := dbPath + "." + identity.NewID() + ".bak"
488+
bklog.L.Errorf("failed to open database file %s, resetting to empty. Old database is backed up to %s. "+
489+
"This error signifies that buildkitd likely crashed or was sigkilled abrubtly, leaving the database corrupted. "+
490+
"If you see logs from a previous panic then please report in the issue tracker at https://github.com/moby/buildkit . %+v", dbPath, backupPath, openErr)
491+
if err := os.Rename(dbPath, backupPath); err != nil {
492+
return nil, errors.Wrapf(err, "failed to rename database file %s to %s", dbPath, backupPath)
493+
}
494+
495+
// Attempt to open the database again. This should be a new database.
496+
// If this fails, it is a permanent error.
497+
return openDB(dbPath)
498+
}
499+
500+
// openDB opens a bolt database in user-only read/write mode.
501+
func openDB(dbPath string) (*bolt.DB, error) {
502+
return bolt.Open(dbPath, 0600, nil)
503+
}
504+
505+
// fileHasContent reports whether dbPath can be stat'ed and has a non-zero
// size. Any os.Stat failure (missing file, inaccessible path, ...) yields
// false.
func fileHasContent(dbPath string) bool {
	info, statErr := os.Stat(dbPath)
	if statErr != nil {
		return false
	}
	return info.Size() > 0
}

0 commit comments

Comments
 (0)