Skip to content

Commit 51ddcea

Browse files
corylanou and claude
committed
fix(restore): improve diagnostics and error handling for GCS restore failures
- Add detailed restore plan logging showing each file's level, TXID range, and size
- Fix compaction error capture (was being swallowed due to variable scoping)
- Improve decode error message to include first file info for debugging EOF errors
- Add warning when restore plan starts from non-TXID-1 (seeded replica detection)
- Add GCS-specific fail-fast for truncated files (< header size)
- Add debug logging when opening LTX files from GCS

These changes help diagnose issue #858 where GCS restores fail with "decode database: decode header: EOF" by providing better visibility into the restore plan and catching truncated files early.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
1 parent fc6a020 commit 51ddcea

File tree

2 files changed

+52
-4
lines changed

2 files changed

+52
-4
lines changed

gs/replica_client.go

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -188,8 +188,27 @@ func (c *ReplicaClient) OpenLTXFile(ctx context.Context, level int, minTXID, max
188188
return nil, err
189189
}
190190

191+
// Log the actual size from GCS for debugging
192+
actualSize := r.Attrs.Size
193+
c.logger.Debug("opened ltx file from GCS",
194+
"key", key,
195+
"level", level,
196+
"minTXID", minTXID,
197+
"maxTXID", maxTXID,
198+
"actualSize", actualSize,
199+
"requestedOffset", offset,
200+
"requestedLength", length)
201+
202+
// Fail fast if file is smaller than LTX header - it can never decode successfully.
203+
// This catches truncated uploads or corrupted files early with a clear error.
204+
if actualSize < ltx.HeaderSize {
205+
_ = r.Close()
206+
return nil, fmt.Errorf("gs: ltx file %q is truncated: size %d bytes (minimum %d for header)",
207+
key, actualSize, ltx.HeaderSize)
208+
}
209+
191210
internal.OperationTotalCounterVec.WithLabelValues(ReplicaClientType, "GET").Inc()
192-
internal.OperationBytesCounterVec.WithLabelValues(ReplicaClientType, "GET").Add(float64(r.Attrs.Size))
211+
internal.OperationBytesCounterVec.WithLabelValues(ReplicaClientType, "GET").Add(float64(actualSize))
193212

194213
return r, nil
195214
}

replica.go

Lines changed: 32 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -425,6 +425,16 @@ func (r *Replica) Restore(ctx context.Context, opt RestoreOptions) (err error) {
425425

426426
r.Logger().Debug("restore plan", "n", len(infos), "txid", infos[len(infos)-1].MaxTXID, "timestamp", infos[len(infos)-1].CreatedAt)
427427

428+
// Log detailed restore plan for debugging
429+
for i, info := range infos {
430+
r.Logger().Debug("restore plan file",
431+
"index", i,
432+
"level", info.Level,
433+
"min", info.MinTXID,
434+
"max", info.MaxTXID,
435+
"size", info.Size)
436+
}
437+
428438
rdrs := make([]io.Reader, 0, len(infos))
429439
defer func() {
430440
for _, rd := range rdrs {
@@ -441,7 +451,7 @@ func (r *Replica) Restore(ctx context.Context, opt RestoreOptions) (err error) {
441451
info.Level, info.MinTXID, info.MaxTXID, info.Size, ltx.HeaderSize)
442452
}
443453

444-
r.Logger().Debug("opening ltx file for restore", "level", info.Level, "min", info.MinTXID, "max", info.MaxTXID)
454+
r.Logger().Debug("opening ltx file for restore", "level", info.Level, "min", info.MinTXID, "max", info.MaxTXID, "size", info.Size)
445455

446456
// Add file to be compacted.
447457
f, err := r.Client.OpenLTXFile(ctx, info.Level, info.MinTXID, info.MaxTXID, 0, 0)
@@ -479,16 +489,24 @@ func (r *Replica) Restore(ctx context.Context, opt RestoreOptions) (err error) {
479489
go func() {
480490
c, err := ltx.NewCompactor(pw, rdrs)
481491
if err != nil {
492+
r.Logger().Error("compactor init failed", "error", err, "n_readers", len(rdrs))
482493
pw.CloseWithError(fmt.Errorf("new ltx compactor: %w", err))
483494
return
484495
}
485496
c.HeaderFlags = ltx.HeaderFlagNoChecksum
486-
_ = pw.CloseWithError(c.Compact(ctx))
497+
compactErr := c.Compact(ctx)
498+
if compactErr != nil {
499+
r.Logger().Error("compaction failed", "error", compactErr)
500+
}
501+
_ = pw.CloseWithError(compactErr)
487502
}()
488503

489504
dec := ltx.NewDecoder(pr)
490505
if err := dec.DecodeDatabaseTo(f); err != nil {
491-
return fmt.Errorf("decode database: %w", err)
506+
// Include first file info for debugging EOF errors
507+
firstFile := infos[0]
508+
return fmt.Errorf("decode database (first file: level=%d min=%s max=%s size=%d): %w",
509+
firstFile.Level, firstFile.MinTXID, firstFile.MaxTXID, firstFile.Size, err)
492510
}
493511

494512
if err := f.Sync(); err != nil {
@@ -574,5 +592,16 @@ func CalcRestorePlan(ctx context.Context, client ReplicaClient, txID ltx.TXID, t
574592
return nil, ErrTxNotAvailable
575593
}
576594

595+
// Warn if the first file doesn't start at TXID 1 - this may indicate missing
596+
// base snapshot, but could also be a legitimate seeded replica starting from
597+
// a later snapshot. Log for diagnostics but allow the restore to proceed.
598+
firstFile := infos[0]
599+
if firstFile.MinTXID != 1 {
600+
logger.Warn("restore plan starts from non-initial TXID (may be seeded replica or missing base snapshot)",
601+
"first_file_level", firstFile.Level,
602+
"first_file_min", firstFile.MinTXID,
603+
"first_file_max", firstFile.MaxTXID)
604+
}
605+
577606
return infos, nil
578607
}

0 commit comments

Comments
 (0)