-
Notifications
You must be signed in to change notification settings - Fork 382
feat: bootup time metrics #5162
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
39ca779
e6c660d
219cda0
b96d82a
2ac05c9
f5fe403
3d05785
98c19ca
72fa9a8
e26cfac
511394d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -206,6 +206,10 @@ func NewBee( | |
| session accesscontrol.Session, | ||
| o *Options, | ||
| ) (b *Bee, err error) { | ||
| // start time for node warmup duration measurement | ||
| warmupStartTime := time.Now() | ||
| var pullSyncStartTime time.Time | ||
|
|
||
| tracer, tracerCloser, err := tracing.NewTracer(&tracing.Options{ | ||
| Enabled: o.TracingEnabled, | ||
| Endpoint: o.TracingEndpoint, | ||
|
|
@@ -595,9 +599,28 @@ func NewBee( | |
| logger.Info("node warmup check initiated. monitoring activity rate to determine readiness.", "startTime", t) | ||
| } | ||
|
|
||
| detector.OnStabilized = func(t time.Time, totalCount int) { | ||
| logger.Info("node warmup complete. system is considered stable and ready.", "stabilizationTime", t, "totalMonitoredEvents", totalCount) | ||
| nodeWarmupDuration := prometheus.NewHistogram( | ||
| prometheus.HistogramOpts{ | ||
| Namespace: metrics.Namespace, | ||
| Subsystem: "init", | ||
| Name: "warmup_duration_seconds", | ||
| Help: "Duration in seconds for node warmup to complete", | ||
| }, | ||
| ) | ||
| prometheus.MustRegister(nodeWarmupDuration) | ||
|
|
||
| warmupMeasurement := func(t time.Time, totalCount int) { | ||
| warmupDuration := t.Sub(warmupStartTime).Seconds() | ||
| logger.Info("node warmup complete. system is considered stable and ready.", | ||
| "stabilizationTime", t, | ||
| "totalMonitoredEvents", totalCount, | ||
| "warmupDurationSeconds", warmupDuration) | ||
|
|
||
| // Record the warmup duration in the prometheus metric | ||
| nodeWarmupDuration.Observe(warmupDuration) | ||
| pullSyncStartTime = t | ||
| } | ||
| detector.OnStabilized = warmupMeasurement | ||
|
|
||
| detector.OnPeriodComplete = func(t time.Time, periodCount int, stDev float64) { | ||
| logger.Debug("node warmup check: period complete.", "periodEndTime", t, "eventsInPeriod", periodCount, "rateStdDev", stDev) | ||
|
|
@@ -1130,6 +1153,45 @@ func NewBee( | |
| localStore.StartReserveWorker(ctx, pullerService, waitNetworkRFunc) | ||
| nodeStatus.SetSync(pullerService) | ||
|
|
||
| // measure full sync duration | ||
| detector.OnStabilized = func(t time.Time, totalCount int) { | ||
| warmupMeasurement(t, totalCount) | ||
| fullSyncDuration := prometheus.NewHistogram( | ||
| prometheus.HistogramOpts{ | ||
| Namespace: metrics.Namespace, | ||
| Subsystem: "init", | ||
| Name: "full_sync_duration_seconds", | ||
| Help: "Duration in seconds for node warmup to complete", | ||
| }, | ||
| ) | ||
| prometheus.MustRegister(fullSyncDuration) | ||
|
|
||
| reserveTreshold := reserveCapacity >> 1 | ||
| isFullySynced := func() bool { | ||
| return pullerService.SyncRate() == 0 && saludService.IsHealthy() && localStore.ReserveSize() >= reserveTreshold | ||
| } | ||
|
|
||
| syncCheckTicker := time.NewTicker(time.Second) | ||
| go func() { | ||
| defer syncCheckTicker.Stop() | ||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| return | ||
| case <-syncCheckTicker.C: | ||
| synced := isFullySynced() | ||
| logger.Debug("sync status check", "synced", synced, "reserveSize", localStore.ReserveSize(), "threshold", reserveTreshold, "syncRate", pullerService.SyncRate()) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe change the log level to Trace, because it will spam every second until ReserveSize reaches the threshold? Or we could even increase the check interval to 2 seconds?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I increased the check interval to 2 seconds, because Debug is the most verbose log level. |
||
| if synced { | ||
| fullSyncTime := pullSyncStartTime.Sub(t) | ||
| fullSyncDuration.Observe(fullSyncTime.Seconds()) | ||
| syncCheckTicker.Stop() | ||
| return | ||
| } | ||
| } | ||
| } | ||
| }() | ||
| } | ||
|
|
||
| if o.EnableStorageIncentives { | ||
|
|
||
| redistributionContractAddress := chainCfg.RedistributionAddress | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.