Skip to content

Commit d4046ec

Browse files
committed
fixes
1 parent dd27946 commit d4046ec

File tree

6 files changed

+415
-26
lines changed

6 files changed

+415
-26
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ scripts/build/pchaind
77
coverage.out
88
coverage.html
99
*.out
10+
CLAUDE.md

cmd/push-validator/cmd_start.go

Lines changed: 151 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"os"
77
"path/filepath"
8+
"strconv"
89
"strings"
910
"time"
1011

@@ -200,6 +201,56 @@ var startCmd = &cobra.Command{
200201
})
201202
return err
202203
}
204+
205+
// Verify the node process actually survived startup and is serving RPC.
206+
// PID checks are racy for detached processes — the node may crash seconds after
207+
// launch (e.g. corrupt DB). Polling RPC for up to 10s is more reliable.
208+
nodeAlive := false
209+
for i := 0; i < 10; i++ {
210+
time.Sleep(1 * time.Second)
211+
if !sup.IsRunning() {
212+
break // process already exited
213+
}
214+
if process.IsRPCListening("127.0.0.1:26657", 800*time.Millisecond) {
215+
nodeAlive = true
216+
break
217+
}
218+
}
219+
if !nodeAlive {
220+
logPath := sup.LogPath()
221+
// Read last few lines from log for diagnostics
222+
logTail := ""
223+
if b, err := os.ReadFile(logPath); err == nil {
224+
lines := strings.Split(strings.TrimSpace(string(b)), "\n")
225+
start := len(lines) - 5
226+
if start < 0 {
227+
start = 0
228+
}
229+
logTail = strings.Join(lines[start:], "\n")
230+
}
231+
ui.PrintError(ui.ErrorMessage{
232+
Problem: "Node is not running",
233+
Causes: []string{
234+
"Node crashed on startup (corrupt or incomplete database)",
235+
"Incompatible binary version for existing data",
236+
"Missing required files in data directory",
237+
},
238+
Actions: []string{
239+
"Check logs: cat " + logPath,
240+
"Try resetting: push-validator reset && push-validator start",
241+
"If the issue persists, re-download the snapshot",
242+
},
243+
})
244+
if logTail != "" {
245+
fmt.Println()
246+
fmt.Println(" Last log lines:")
247+
for _, line := range strings.Split(logTail, "\n") {
248+
fmt.Println(" " + line)
249+
}
250+
}
251+
return fmt.Errorf("node process is not running")
252+
}
253+
203254
if flagOutput == "json" {
204255
p.JSON(map[string]any{"ok": true, "action": "start", "already_running": isAlreadyRunning, "cosmovisor": true})
205256
} else {
@@ -234,6 +285,20 @@ func init() {
234285
rootCmd.AddCommand(startCmd)
235286
}
236287

288+
// defaultSnapshotSyncThreshold is the number of blocks behind the chain tip
// at which the CLI will proactively download a fresh snapshot rather than
// syncing block-by-block. Override via PUSH_SNAPSHOT_THRESHOLD env var.
const defaultSnapshotSyncThreshold int64 = 25000

// snapshotSyncThreshold returns the snapshot trigger threshold in blocks.
// A positive integer in PUSH_SNAPSHOT_THRESHOLD overrides the default;
// an unset, malformed, or non-positive value falls back to the constant.
func snapshotSyncThreshold() int64 {
	raw := os.Getenv("PUSH_SNAPSHOT_THRESHOLD")
	if raw == "" {
		return defaultSnapshotSyncThreshold
	}
	n, err := strconv.ParseInt(raw, 10, 64)
	if err != nil || n <= 0 {
		return defaultSnapshotSyncThreshold
	}
	return n
}
301+
237302
// handlePostStartFlow manages the post-start flow based on validator status.
238303
// Returns false if an error occurred (non-fatal), true if flow completed successfully.
239304
func handlePostStartFlow(cfg config.Config, p *ui.Printer) bool {
@@ -263,6 +328,87 @@ func handlePostStartFlow(cfg config.Config, p *ui.Printer) bool {
263328
// Node is still syncing - wait for sync to complete before validator checks
264329
fmt.Println(p.Colors.Info(" ▸ Node is syncing with the network..."))
265330
fmt.Println(p.Colors.Apply(p.Colors.Theme.Description, " Waiting for sync to complete...\n"))
331+
332+
// Proactive snapshot: if far behind, downloading a snapshot is faster than block-by-block sync
333+
blockDiff := snap.Chain.RemoteHeight - snap.Chain.LocalHeight
334+
if snap.Chain.RemoteHeight > 0 && snap.Chain.LocalHeight > 0 && blockDiff > snapshotSyncThreshold() {
335+
fmt.Println(p.Colors.Info("▸ Accelerating Sync"))
336+
fmt.Printf(" Node is %d blocks behind the chain tip.\n", blockDiff)
337+
estimatedHours := float64(blockDiff) / 15.0 / 3600.0
338+
if estimatedHours >= 1.0 {
339+
fmt.Printf(" Downloading a fresh snapshot (saves ~%.0fh of block-by-block syncing)...\n\n", estimatedHours)
340+
} else {
341+
fmt.Printf(" Downloading a fresh snapshot to speed up sync...\n\n")
342+
}
343+
344+
snapshotErr := func() error {
345+
sup := newSupervisor(cfg.HomeDir)
346+
347+
fmt.Println(p.Colors.Info(" Stopping node..."))
348+
if err := sup.Stop(); err != nil {
349+
// Ignore stop errors - node might not be running
350+
}
351+
time.Sleep(2 * time.Second)
352+
353+
fmt.Println(p.Colors.Info(" Clearing blockchain data..."))
354+
if err := admin.Reset(admin.ResetOptions{
355+
HomeDir: cfg.HomeDir,
356+
BinPath: findPchaind(),
357+
KeepAddrBook: true,
358+
}); err != nil {
359+
return fmt.Errorf("reset failed: %w", err)
360+
}
361+
362+
fmt.Println(p.Colors.Info(" Downloading snapshot..."))
363+
snapshotSvc := snapshot.New()
364+
if err := snapshotSvc.Download(context.Background(), snapshot.Options{
365+
SnapshotURL: cfg.SnapshotURL,
366+
HomeDir: cfg.HomeDir,
367+
Progress: createSnapshotProgressCallback(flagOutput),
368+
}); err != nil {
369+
return fmt.Errorf("snapshot download failed: %w", err)
370+
}
371+
372+
fmt.Println(p.Colors.Info(" Extracting snapshot..."))
373+
if err := snapshotSvc.Extract(context.Background(), snapshot.ExtractOptions{
374+
HomeDir: cfg.HomeDir,
375+
TargetDir: filepath.Join(cfg.HomeDir, "data"),
376+
Progress: createSnapshotProgressCallback(flagOutput),
377+
}); err != nil {
378+
return fmt.Errorf("snapshot extract failed: %w", err)
379+
}
380+
381+
// Ensure priv_validator_state.json exists after extraction
382+
pvsPath := filepath.Join(cfg.HomeDir, "data", "priv_validator_state.json")
383+
if _, err := os.Stat(pvsPath); os.IsNotExist(err) {
384+
_ = os.WriteFile(pvsPath, []byte(`{"height":"0","round":0,"step":0}`+"\n"), 0o644)
385+
}
386+
387+
fmt.Println(p.Colors.Info(" Restarting node..."))
388+
_, err := sup.Start(process.StartOpts{
389+
HomeDir: cfg.HomeDir,
390+
Moniker: os.Getenv("MONIKER"),
391+
BinPath: findPchaind(),
392+
})
393+
if err != nil {
394+
return fmt.Errorf("restart failed: %w", err)
395+
}
396+
time.Sleep(5 * time.Second)
397+
return nil
398+
}()
399+
400+
if snapshotErr != nil {
401+
fmt.Println()
402+
fmt.Println(p.Colors.Warning(" " + p.Colors.Emoji("!") + " Snapshot optimization failed: " + snapshotErr.Error()))
403+
fmt.Println(p.Colors.Apply(p.Colors.Theme.Description, " Falling back to block-by-block sync..."))
404+
fmt.Println()
405+
} else {
406+
fmt.Println()
407+
fmt.Println(p.Colors.Success(" " + p.Colors.Emoji("✓") + " Snapshot restored — syncing remaining blocks..."))
408+
fmt.Println()
409+
}
410+
}
411+
266412
fmt.Println(p.Colors.Info("▸ Monitoring Sync Progress"))
267413

268414
// Wait for sync to complete using sync monitor
@@ -316,7 +462,7 @@ func handlePostStartFlow(cfg config.Config, p *ui.Printer) bool {
316462
return nil
317463
}
318464

319-
if err := syncmon.RunWithRetry(context.Background(), syncmon.RetryOptions{
465+
syncErr := syncmon.RunWithRetry(context.Background(), syncmon.RetryOptions{
320466
Options: syncmon.Options{
321467
LocalRPC: "http://127.0.0.1:26657",
322468
RemoteRPC: remoteURL,
@@ -331,10 +477,11 @@ func handlePostStartFlow(cfg config.Config, p *ui.Printer) bool {
331477
},
332478
MaxRetries: 3,
333479
ResetFunc: resetFunc,
334-
}); err != nil {
335-
// Sync failed after retries - show warning and dashboard
480+
})
481+
if syncErr != nil {
482+
// Sync failed - show warning and dashboard
336483
fmt.Println()
337-
fmt.Println(p.Colors.Warning(" " + p.Colors.Emoji("⚠") + " Sync failed after retries"))
484+
fmt.Println(p.Colors.Warning(" " + p.Colors.Emoji("⚠") + " Sync failed: " + syncErr.Error()))
338485
fmt.Println(p.Colors.Apply(p.Colors.Theme.Description, " Try: push-validator reset && push-validator start"))
339486
showDashboardPrompt(cfg, p)
340487
return false

internal/node/client.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ func (c *httpClient) RemoteStatus(ctx context.Context, baseURL string) (Status,
7676
resp, err := c.http.Do(req)
7777
if err != nil { return Status{}, err }
7878
defer func() { _ = resp.Body.Close() }()
79+
if resp.StatusCode != http.StatusOK {
80+
return Status{}, fmt.Errorf("remote RPC returned HTTP %d", resp.StatusCode)
81+
}
7982
var payload struct {
8083
Result struct {
8184
NodeInfo struct{
@@ -105,6 +108,9 @@ func (c *httpClient) Peers(ctx context.Context) ([]Peer, error) {
105108
resp, err := c.http.Do(req)
106109
if err != nil { return nil, err }
107110
defer func() { _ = resp.Body.Close() }()
111+
if resp.StatusCode != http.StatusOK {
112+
return nil, fmt.Errorf("remote RPC returned HTTP %d", resp.StatusCode)
113+
}
108114
var payload struct {
109115
Result struct {
110116
Peers []struct {

internal/snapshot/extractor.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,21 @@ func extractTarLz4(archivePath, destDir string, progress ExtractProgressFunc) er
8080
}
8181

8282
// Copy contents
83-
if _, err := io.Copy(outFile, tarReader); err != nil {
83+
written, copyErr := io.Copy(outFile, tarReader)
84+
if copyErr != nil {
8485
outFile.Close()
85-
return fmt.Errorf("write file %s: %w", cleanName, err)
86+
return fmt.Errorf("write file %s: %w", cleanName, copyErr)
87+
}
88+
89+
// Verify all bytes were written
90+
if header.Size > 0 && written != header.Size {
91+
outFile.Close()
92+
return fmt.Errorf("incomplete extraction of %s: wrote %d of %d bytes (disk full?)", cleanName, written, header.Size)
93+
}
94+
95+
if err := outFile.Close(); err != nil {
96+
return fmt.Errorf("close file %s: %w", cleanName, err)
8697
}
87-
outFile.Close()
8898

8999
case tar.TypeSymlink:
90100
// Security: validate symlink target

internal/snapshot/snapshot.go

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,7 @@ func isCacheValid(homeDir, remoteChecksum string) bool {
258258

259259
// copyDir recursively copies a directory from src to dst.
260260
// Used as fallback when os.Rename fails (cross-device move).
261+
// Returns an error if any file fails to copy completely (e.g. disk full).
261262
func copyDir(src, dst string) error {
262263
return filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
263264
if err != nil {
@@ -286,11 +287,43 @@ func copyDir(src, dst string) error {
286287
if err != nil {
287288
return err
288289
}
289-
defer dstFile.Close()
290290

291-
_, err = io.Copy(dstFile, srcFile)
292-
return err
291+
written, err := io.Copy(dstFile, srcFile)
292+
if err != nil {
293+
dstFile.Close()
294+
return fmt.Errorf("write %s: %w", relPath, err)
295+
}
296+
297+
// Verify all bytes were written
298+
if written != info.Size() {
299+
dstFile.Close()
300+
return fmt.Errorf("incomplete write for %s: wrote %d of %d bytes (disk full?)", relPath, written, info.Size())
301+
}
302+
303+
// Flush to disk to ensure data is persisted
304+
if err := dstFile.Sync(); err != nil {
305+
dstFile.Close()
306+
return fmt.Errorf("sync %s: %w", relPath, err)
307+
}
308+
309+
return dstFile.Close()
310+
})
311+
}
312+
313+
// countDirFiles counts regular files and total size in a directory tree.
314+
func countDirFiles(dir string) (int64, int64, error) {
315+
var count, totalSize int64
316+
err := filepath.Walk(dir, func(_ string, info os.FileInfo, err error) error {
317+
if err != nil {
318+
return err
319+
}
320+
if !info.IsDir() {
321+
count++
322+
totalSize += info.Size()
323+
}
324+
return nil
293325
})
326+
return count, totalSize, err
294327
}
295328

296329
// Download fetches and caches a snapshot tarball (does not extract).
@@ -468,10 +501,13 @@ func (s *svc) Extract(ctx context.Context, opts ExtractOptions) error {
468501
}
469502

470503
// Disk space pre-check for extraction
504+
// We need space for both the temp extraction AND the final copy, so use 2x the
505+
// estimated extraction size as the requirement.
471506
progress(PhaseExtract, 0, -1, "Checking disk space...")
472507
if tarballInfo, err := os.Stat(cachedTarball); err == nil {
473508
// lz4 typical compression ratio ~3-4x for blockchain data
474-
estimatedSize := tarballInfo.Size() * 4
509+
// Need space for temp dir + final target dir simultaneously during copy
510+
estimatedSize := tarballInfo.Size() * 8
475511
if err := checkDiskSpace(opts.TargetDir, estimatedSize); err != nil {
476512
return fmt.Errorf("extraction disk space check: %w", err)
477513
}
@@ -506,14 +542,34 @@ func (s *svc) Extract(ctx context.Context, opts ExtractOptions) error {
506542
return fmt.Errorf("extracted snapshot missing data/ directory")
507543
}
508544

545+
// Count files in extracted source for post-copy verification
546+
srcFileCount, srcTotalSize, err := countDirFiles(extractedDataDir)
547+
if err != nil {
548+
return fmt.Errorf("count extracted files: %w", err)
549+
}
550+
509551
// Prepare target directory (clear existing contents except priv_validator_state.json)
510552
if err := prepareDataDir(opts.TargetDir); err != nil {
511553
return fmt.Errorf("prepare target dir: %w", err)
512554
}
513555

514556
// Copy extracted data to target
515557
if err := copyDir(extractedDataDir, opts.TargetDir); err != nil {
516-
return fmt.Errorf("copy to target: %w", err)
558+
// Copy failed (likely disk full) — clean up partial data to avoid corrupt state
559+
_ = prepareDataDir(opts.TargetDir)
560+
return fmt.Errorf("copy to target (disk may be full): %w", err)
561+
}
562+
563+
// Post-copy verification: ensure all files were copied completely
564+
dstFileCount, dstTotalSize, err := countDirFiles(opts.TargetDir)
565+
if err != nil {
566+
_ = prepareDataDir(opts.TargetDir)
567+
return fmt.Errorf("verify extracted files: %w", err)
568+
}
569+
if dstFileCount < srcFileCount || dstTotalSize < srcTotalSize {
570+
_ = prepareDataDir(opts.TargetDir)
571+
return fmt.Errorf("extraction incomplete: expected %d files (%s) but got %d files (%s) — disk may be full",
572+
srcFileCount, formatBytesHuman(srcTotalSize), dstFileCount, formatBytesHuman(dstTotalSize))
517573
}
518574

519575
// Restore priv_validator_state.json if it was backed up

0 commit comments

Comments
 (0)