55 "fmt"
66 "os"
77 "path/filepath"
8+ "strconv"
89 "strings"
910 "time"
1011
@@ -200,6 +201,56 @@ var startCmd = &cobra.Command{
200201 })
201202 return err
202203 }
204+
205+ // Verify the node process actually survived startup and is serving RPC.
206+ // PID checks are racy for detached processes — the node may crash seconds after
207+ // launch (e.g. corrupt DB). Polling RPC for up to 10s is more reliable.
208+ nodeAlive := false
209+ for i := 0 ; i < 10 ; i ++ {
210+ time .Sleep (1 * time .Second )
211+ if ! sup .IsRunning () {
212+ break // process already exited
213+ }
214+ if process .IsRPCListening ("127.0.0.1:26657" , 800 * time .Millisecond ) {
215+ nodeAlive = true
216+ break
217+ }
218+ }
219+ if ! nodeAlive {
220+ logPath := sup .LogPath ()
221+ // Read last few lines from log for diagnostics
222+ logTail := ""
223+ if b , err := os .ReadFile (logPath ); err == nil {
224+ lines := strings .Split (strings .TrimSpace (string (b )), "\n " )
225+ start := len (lines ) - 5
226+ if start < 0 {
227+ start = 0
228+ }
229+ logTail = strings .Join (lines [start :], "\n " )
230+ }
231+ ui .PrintError (ui.ErrorMessage {
232+ Problem : "Node is not running" ,
233+ Causes : []string {
234+ "Node crashed on startup (corrupt or incomplete database)" ,
235+ "Incompatible binary version for existing data" ,
236+ "Missing required files in data directory" ,
237+ },
238+ Actions : []string {
239+ "Check logs: cat " + logPath ,
240+ "Try resetting: push-validator reset && push-validator start" ,
241+ "If the issue persists, re-download the snapshot" ,
242+ },
243+ })
244+ if logTail != "" {
245+ fmt .Println ()
246+ fmt .Println (" Last log lines:" )
247+ for _ , line := range strings .Split (logTail , "\n " ) {
248+ fmt .Println (" " + line )
249+ }
250+ }
251+ return fmt .Errorf ("node process is not running" )
252+ }
253+
203254 if flagOutput == "json" {
204255 p .JSON (map [string ]any {"ok" : true , "action" : "start" , "already_running" : isAlreadyRunning , "cosmovisor" : true })
205256 } else {
@@ -234,6 +285,20 @@ func init() {
234285 rootCmd .AddCommand (startCmd )
235286}
236287
// defaultSnapshotSyncThreshold is the number of blocks behind the chain tip
// at which the CLI will proactively download a fresh snapshot rather than
// syncing block-by-block. Override via PUSH_SNAPSHOT_THRESHOLD env var.
const defaultSnapshotSyncThreshold int64 = 25000

// snapshotSyncThreshold returns the block-lag threshold used to decide
// whether a snapshot download should replace block-by-block syncing.
//
// The PUSH_SNAPSHOT_THRESHOLD environment variable overrides the default when
// it holds a positive base-10 integer. Surrounding whitespace is ignored so
// that values such as " 50000" (e.g. from a quoted export or an env file with
// trailing blanks) are honored instead of silently falling back. Unset, empty,
// unparsable, zero, or negative values yield defaultSnapshotSyncThreshold.
func snapshotSyncThreshold() int64 {
	raw := strings.TrimSpace(os.Getenv("PUSH_SNAPSHOT_THRESHOLD"))
	if raw == "" {
		return defaultSnapshotSyncThreshold
	}

	// strconv.ParseInt rejects any non-digit characters (including spaces),
	// which is why the value is trimmed above.
	n, err := strconv.ParseInt(raw, 10, 64)
	if err != nil || n <= 0 {
		return defaultSnapshotSyncThreshold
	}
	return n
}
301+
237302// handlePostStartFlow manages the post-start flow based on validator status.
238303// Returns false if an error occurred (non-fatal), true if flow completed successfully.
239304func handlePostStartFlow (cfg config.Config , p * ui.Printer ) bool {
@@ -263,6 +328,87 @@ func handlePostStartFlow(cfg config.Config, p *ui.Printer) bool {
263328 // Node is still syncing - wait for sync to complete before validator checks
264329 fmt .Println (p .Colors .Info (" ▸ Node is syncing with the network..." ))
265330 fmt .Println (p .Colors .Apply (p .Colors .Theme .Description , " Waiting for sync to complete...\n " ))
331+
332+ // Proactive snapshot: if far behind, downloading a snapshot is faster than block-by-block sync
333+ blockDiff := snap .Chain .RemoteHeight - snap .Chain .LocalHeight
334+ if snap .Chain .RemoteHeight > 0 && snap .Chain .LocalHeight > 0 && blockDiff > snapshotSyncThreshold () {
335+ fmt .Println (p .Colors .Info ("▸ Accelerating Sync" ))
336+ fmt .Printf (" Node is %d blocks behind the chain tip.\n " , blockDiff )
337+ estimatedHours := float64 (blockDiff ) / 15.0 / 3600.0
338+ if estimatedHours >= 1.0 {
339+ fmt .Printf (" Downloading a fresh snapshot (saves ~%.0fh of block-by-block syncing)...\n \n " , estimatedHours )
340+ } else {
341+ fmt .Printf (" Downloading a fresh snapshot to speed up sync...\n \n " )
342+ }
343+
344+ snapshotErr := func () error {
345+ sup := newSupervisor (cfg .HomeDir )
346+
347+ fmt .Println (p .Colors .Info (" Stopping node..." ))
348+ if err := sup .Stop (); err != nil {
349+ // Ignore stop errors - node might not be running
350+ }
351+ time .Sleep (2 * time .Second )
352+
353+ fmt .Println (p .Colors .Info (" Clearing blockchain data..." ))
354+ if err := admin .Reset (admin.ResetOptions {
355+ HomeDir : cfg .HomeDir ,
356+ BinPath : findPchaind (),
357+ KeepAddrBook : true ,
358+ }); err != nil {
359+ return fmt .Errorf ("reset failed: %w" , err )
360+ }
361+
362+ fmt .Println (p .Colors .Info (" Downloading snapshot..." ))
363+ snapshotSvc := snapshot .New ()
364+ if err := snapshotSvc .Download (context .Background (), snapshot.Options {
365+ SnapshotURL : cfg .SnapshotURL ,
366+ HomeDir : cfg .HomeDir ,
367+ Progress : createSnapshotProgressCallback (flagOutput ),
368+ }); err != nil {
369+ return fmt .Errorf ("snapshot download failed: %w" , err )
370+ }
371+
372+ fmt .Println (p .Colors .Info (" Extracting snapshot..." ))
373+ if err := snapshotSvc .Extract (context .Background (), snapshot.ExtractOptions {
374+ HomeDir : cfg .HomeDir ,
375+ TargetDir : filepath .Join (cfg .HomeDir , "data" ),
376+ Progress : createSnapshotProgressCallback (flagOutput ),
377+ }); err != nil {
378+ return fmt .Errorf ("snapshot extract failed: %w" , err )
379+ }
380+
381+ // Ensure priv_validator_state.json exists after extraction
382+ pvsPath := filepath .Join (cfg .HomeDir , "data" , "priv_validator_state.json" )
383+ if _ , err := os .Stat (pvsPath ); os .IsNotExist (err ) {
384+ _ = os .WriteFile (pvsPath , []byte (`{"height":"0","round":0,"step":0}` + "\n " ), 0o644 )
385+ }
386+
387+ fmt .Println (p .Colors .Info (" Restarting node..." ))
388+ _ , err := sup .Start (process.StartOpts {
389+ HomeDir : cfg .HomeDir ,
390+ Moniker : os .Getenv ("MONIKER" ),
391+ BinPath : findPchaind (),
392+ })
393+ if err != nil {
394+ return fmt .Errorf ("restart failed: %w" , err )
395+ }
396+ time .Sleep (5 * time .Second )
397+ return nil
398+ }()
399+
400+ if snapshotErr != nil {
401+ fmt .Println ()
402+ fmt .Println (p .Colors .Warning (" " + p .Colors .Emoji ("!" ) + " Snapshot optimization failed: " + snapshotErr .Error ()))
403+ fmt .Println (p .Colors .Apply (p .Colors .Theme .Description , " Falling back to block-by-block sync..." ))
404+ fmt .Println ()
405+ } else {
406+ fmt .Println ()
407+ fmt .Println (p .Colors .Success (" " + p .Colors .Emoji ("✓" ) + " Snapshot restored — syncing remaining blocks..." ))
408+ fmt .Println ()
409+ }
410+ }
411+
266412 fmt .Println (p .Colors .Info ("▸ Monitoring Sync Progress" ))
267413
268414 // Wait for sync to complete using sync monitor
@@ -316,7 +462,7 @@ func handlePostStartFlow(cfg config.Config, p *ui.Printer) bool {
316462 return nil
317463 }
318464
319- if err := syncmon .RunWithRetry (context .Background (), syncmon.RetryOptions {
465+ syncErr := syncmon .RunWithRetry (context .Background (), syncmon.RetryOptions {
320466 Options : syncmon.Options {
321467 LocalRPC : "http://127.0.0.1:26657" ,
322468 RemoteRPC : remoteURL ,
@@ -331,10 +477,11 @@ func handlePostStartFlow(cfg config.Config, p *ui.Printer) bool {
331477 },
332478 MaxRetries : 3 ,
333479 ResetFunc : resetFunc ,
334- }); err != nil {
335- // Sync failed after retries - show warning and dashboard
480+ })
481+ if syncErr != nil {
482+ // Sync failed - show warning and dashboard
336483 fmt .Println ()
337- fmt .Println (p .Colors .Warning (" " + p .Colors .Emoji ("⚠" ) + " Sync failed after retries" ))
484+ fmt .Println (p .Colors .Warning (" " + p .Colors .Emoji ("⚠" ) + " Sync failed: " + syncErr . Error () ))
338485 fmt .Println (p .Colors .Apply (p .Colors .Theme .Description , " Try: push-validator reset && push-validator start" ))
339486 showDashboardPrompt (cfg , p )
340487 return false
0 commit comments