
Commit 7ae63e2

ref(project-cache): Add even more detailed fetch metrics (#4719)
Breaks down the update latency by the delay since the previous fetch: a high latency combined with a high delay is not a sign of an issue, it is just a sign of a low-volume project. Depending on how often this happens, we may want to address this case too. Also tracks the 'end to end' duration of a fetch through the entire system, from initiating the fetch to completing it. Long fetches are always a sign of a problem in the system, but they are mostly dominated by upstreams not being able to provide the project config.

Refs: INGEST-180
1 parent 249971e commit 7ae63e2
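
To make the new `delay` tag concrete, here is a small, self-contained sketch of the bucketing that the commit applies in `complete_fetch` (see the state.rs diff below). The helper `delay_bucket` and the `main` demo are illustrative only and not part of the commit, but the thresholds and tag values mirror the added `match`.

```rust
use std::time::Duration;

/// Illustrative helper (not in the commit): maps the time since the previous
/// successful fetch to the `delay` tag values emitted with `project_cache.latency`.
fn delay_bucket(delay: Option<Duration>) -> &'static str {
    match delay {
        Some(d) if d.as_secs() <= 15 => "lte15s",
        Some(d) if d.as_secs() <= 30 => "lte30s",
        Some(d) if d.as_secs() <= 60 => "lte60s",
        Some(d) if d.as_secs() <= 120 => "lte120", // the commit's tag value lacks the trailing "s"
        Some(d) if d.as_secs() <= 300 => "lte300s",
        Some(d) if d.as_secs() <= 600 => "lte600s",
        Some(d) if d.as_secs() <= 1800 => "lte1800s",
        Some(d) if d.as_secs() <= 3600 => "lte3600s",
        Some(_) => "gt3600s",
        None => "none", // first fetch for this project: nothing to compare against
    }
}

fn main() {
    assert_eq!(delay_bucket(Some(Duration::from_secs(45))), "lte60s");
    assert_eq!(delay_bucket(Some(Duration::from_secs(7200))), "gt3600s");
    assert_eq!(delay_bucket(None), "none");
}
```

A high-delay bucket therefore marks exactly the low-volume projects called out in the commit message, so their long latencies can be filtered out when alerting.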

File tree

3 files changed (+105, -19 lines)


relay-server/src/services/projects/cache/service.rs

Lines changed: 0 additions & 5 deletions
```diff
@@ -150,7 +150,6 @@ impl ProjectCacheService {
 
     fn handle_completed_fetch(&mut self, fetch: CompletedFetch) {
         let project_key = fetch.project_key();
-        let latency = fetch.latency();
 
         if let Some(fetch) = self.store.complete_fetch(fetch, &self.config) {
             relay_log::trace!(
@@ -165,10 +164,6 @@ impl ProjectCacheService {
             .project_events_tx
             .send(ProjectChange::Ready(project_key));
 
-        if let Some(latency) = latency {
-            metric!(timer(RelayTimers::ProjectCacheUpdateLatency) = latency);
-        }
-
         metric!(
             gauge(RelayGauges::ProjectCacheNotificationChannel) =
                 self.project_events_tx.len() as u64
```

relay-server/src/services/projects/cache/state.rs

Lines changed: 99 additions & 14 deletions
```diff
@@ -13,7 +13,7 @@ use relay_statsd::metric;
 
 use crate::services::projects::project::{ProjectState, Revision};
 use crate::services::projects::source::SourceProjectState;
-use crate::statsd::RelayHistograms;
+use crate::statsd::{RelayHistograms, RelayTimers};
 use crate::utils::{RetryBackoff, UniqueScheduledQueue};
 
 /// The backing storage for a project cache.
@@ -85,6 +85,7 @@ impl ProjectStore {
             new_fetch.is_none(),
             "there cannot be a new fetch and a scheduled expiry"
         );
+
         self.evictions.schedule(when, project_key);
     }
 
@@ -271,6 +272,38 @@ impl ProjectRef<'_> {
 
     fn complete_fetch(&mut self, fetch: CompletedFetch, config: &Config) -> Option<ExpiryTime> {
         let now = Instant::now();
+
+        if let Some(latency) = fetch.latency() {
+            let delay = match fetch.delay() {
+                Some(delay) if delay.as_secs() <= 15 => "lte15s",
+                Some(delay) if delay.as_secs() <= 30 => "lte30s",
+                Some(delay) if delay.as_secs() <= 60 => "lte60s",
+                Some(delay) if delay.as_secs() <= 120 => "lte120",
+                Some(delay) if delay.as_secs() <= 300 => "lte300s",
+                Some(delay) if delay.as_secs() <= 600 => "lte600s",
+                Some(delay) if delay.as_secs() <= 1800 => "lte1800s",
+                Some(delay) if delay.as_secs() <= 3600 => "lte3600s",
+                Some(_) => "gt3600s",
+                None => "none",
+            };
+            metric!(
+                timer(RelayTimers::ProjectCacheUpdateLatency) = latency,
+                delay = delay
+            );
+        }
+
+        if !fetch.is_pending() {
+            let state = match fetch.state {
+                SourceProjectState::New(_) => "new",
+                SourceProjectState::NotModified => "not_modified",
+            };
+
+            metric!(
+                timer(RelayTimers::ProjectCacheFetchDuration) = fetch.duration(now),
+                state = state
+            );
+        }
+
         self.private.complete_fetch(&fetch, now);
 
         // Keep the old state around if the current fetch is pending.
@@ -306,6 +339,8 @@ impl Eviction {
 #[derive(Debug)]
 pub struct Fetch {
     project_key: ProjectKey,
+    previous_fetch: Option<Instant>,
+    initiated: Instant,
     when: Option<Instant>,
     revision: Revision,
 }
@@ -354,7 +389,21 @@ pub struct CompletedFetch {
 impl CompletedFetch {
     /// Returns the [`ProjectKey`] of the project which was fetched.
     pub fn project_key(&self) -> ProjectKey {
-        self.fetch.project_key
+        self.fetch.project_key()
+    }
+
+    /// Returns the amount of time passed between the last successful fetch for this project and the start of this fetch.
+    ///
+    /// `None` if this is the first fetch.
+    fn delay(&self) -> Option<Duration> {
+        self.fetch
+            .previous_fetch
+            .map(|pf| pf.duration_since(self.fetch.initiated))
+    }
+
+    /// Returns the duration between first initiating the fetch and `now`.
+    fn duration(&self, now: Instant) -> Duration {
+        now.duration_since(self.fetch.initiated)
     }
 
     /// Returns the update latency of the fetched project config from the upstream.
@@ -364,7 +413,7 @@ impl CompletedFetch {
     ///
     /// Note: this latency is computed on access, it does not use the time when the [`Fetch`]
     /// was marked as (completed)[`Fetch::complete`].
-    pub fn latency(&self) -> Option<Duration> {
+    fn latency(&self) -> Option<Duration> {
         // We're not interested in initial fetches. The latency on the first fetch
         // has no meaning about how long it takes for an updated project config to be
         // propagated to a Relay.
@@ -376,11 +425,14 @@ impl CompletedFetch {
         let project_info = match &self.state {
             SourceProjectState::New(ProjectState::Enabled(project_info)) => project_info,
             // Not modified or deleted/disabled -> no latency to track.
+            //
+            // Currently we discard the last changed timestamp for disabled projects,
+            // it would be possible to do so and then also expose a latency for disabled projects.
             _ => return None,
         };
 
         // A matching revision is not an update.
-        if project_info.rev == self.fetch.revision() {
+        if project_info.rev == self.fetch.revision {
             return None;
         }
 
@@ -475,6 +527,14 @@ enum FetchState {
     /// If the upstream notifies this instance about a pending config,
     /// a backoff is applied, before trying again.
     Pending {
+        /// Instant when the fetch was first initiated.
+        ///
+        /// A state may be transitioned multiple times from [`Self::Pending`] to [`Self::InProgress`]
+        /// and back to [`Self::Pending`]. This timestamp is the first time when the state
+        /// was transitioned from [`Self::Complete`] to [`Self::InProgress`].
+        ///
+        /// Only `None` on first fetch.
+        initiated: Option<Instant>,
         /// Time when the next fetch should be attempted.
         ///
         /// `None` means soon as possible.
@@ -483,7 +543,7 @@ enum FetchState {
     /// There was a successful non-pending fetch.
     Complete {
         /// Time when the fetch was completed.
-        last_fetch: LastFetch,
+        when: LastFetch,
     },
 }
 
@@ -498,40 +558,59 @@ struct PrivateProjectState {
     ///
     /// The backoff is reset after a successful, non-pending fetch.
     backoff: RetryBackoff,
+
+    /// The last time the state was successfully fetched.
+    ///
+    /// May be `None` when the state has never been successfully fetched.
+    ///
+    /// This is purely informational, all necessary information to make
+    /// state transitions is contained in [`FetchState`].
+    last_fetch: Option<Instant>,
 }
 
 impl PrivateProjectState {
     fn new(project_key: ProjectKey, config: &Config) -> Self {
         Self {
             project_key,
             state: FetchState::Pending {
+                initiated: None,
                 next_fetch_attempt: None,
             },
             backoff: RetryBackoff::new(config.http_max_retry_interval()),
+            last_fetch: None,
         }
     }
 
+    /// Returns the expiry time of the time project state.
+    ///
+    /// `None` if there is currently a fetch pending or in progress.
     fn expiry_time(&self, config: &Config) -> Option<ExpiryTime> {
         match &self.state {
-            FetchState::Complete { last_fetch } => Some(last_fetch.expiry_time(config)),
+            FetchState::Complete { when } => {
+                debug_assert_eq!(Some(when.0), self.last_fetch);
+                Some(when.expiry_time(config))
+            }
             _ => None,
         }
    }
 
     fn try_begin_fetch(&mut self, now: Instant, config: &Config) -> Option<Fetch> {
-        let when = match &self.state {
+        let (initiated, when) = match &self.state {
             FetchState::InProgress => {
                 relay_log::trace!(
                     tags.project_key = self.project_key.as_str(),
                     "project fetch skipped, fetch in progress"
                 );
                 return None;
             }
-            FetchState::Pending { next_fetch_attempt } => {
+            FetchState::Pending {
+                initiated,
+                next_fetch_attempt,
+            } => {
                 // Schedule a new fetch, even if there is a backoff, it will just be sleeping for a while.
-                *next_fetch_attempt
+                (initiated.unwrap_or(now), *next_fetch_attempt)
             }
-            FetchState::Complete { last_fetch } => {
+            FetchState::Complete { when: last_fetch } => {
                 if last_fetch.check_expiry(now, config).is_fresh() {
                     // The current state is up to date, no need to start another fetch.
                     relay_log::trace!(
@@ -540,12 +619,12 @@ impl PrivateProjectState {
                     );
                     return None;
                 }
-                None
+                (now, None)
             }
         };
 
         // Mark a current fetch in progress.
-        self.state = FetchState::InProgress {};
+        self.state = FetchState::InProgress;
 
         relay_log::trace!(
             tags.project_key = &self.project_key.as_str(),
@@ -556,6 +635,8 @@ impl PrivateProjectState {
 
         Some(Fetch {
             project_key: self.project_key,
+            previous_fetch: self.last_fetch,
+            initiated,
             when,
             revision: Revision::default(),
         })
@@ -573,7 +654,10 @@ impl PrivateProjectState {
                 false => now.checked_add(next_backoff),
                 true => None,
             };
-            self.state = FetchState::Pending { next_fetch_attempt };
+            self.state = FetchState::Pending {
+                next_fetch_attempt,
+                initiated: Some(fetch.fetch.initiated),
+            };
             relay_log::trace!(
                 tags.project_key = &self.project_key.as_str(),
                 "project state fetch completed but still pending"
@@ -584,8 +668,9 @@ impl PrivateProjectState {
                 "project state fetch completed with non-pending config"
             );
             self.backoff.reset();
+            self.last_fetch = Some(now);
             self.state = FetchState::Complete {
-                last_fetch: LastFetch(now),
+                when: LastFetch(now),
             };
         }
     }
```

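The end-to-end fetch duration works because the `initiated` timestamp survives repeated `Pending` → `InProgress` → `Pending` round trips and is only dropped once a fetch completes. Below is a minimal, self-contained model of that idea; it is illustrative only, since the real code threads `initiated` through `Fetch` and `CompletedFetch` instead of storing it on `InProgress`.

```rust
use std::time::{Duration, Instant};

// Simplified model of the retry loop; the real types live in
// relay-server/src/services/projects/cache/state.rs.
#[derive(Clone, Copy)]
enum FetchState {
    /// Waiting (or backing off) before the next attempt.
    Pending { initiated: Option<Instant> },
    /// A fetch is running; `initiated` is when the first attempt started.
    InProgress { initiated: Instant },
    /// A non-pending config was received at `when`.
    Complete { when: Instant },
}

impl FetchState {
    /// Begin (or retry) a fetch, preserving the original `initiated` timestamp.
    fn begin(&mut self, now: Instant) {
        let initiated = match *self {
            FetchState::Pending { initiated } => initiated.unwrap_or(now),
            FetchState::Complete { .. } => now,
            FetchState::InProgress { .. } => return, // already fetching
        };
        *self = FetchState::InProgress { initiated };
    }

    /// The upstream only has a pending config: back off, but keep `initiated`.
    fn still_pending(&mut self) {
        if let FetchState::InProgress { initiated } = *self {
            *self = FetchState::Pending {
                initiated: Some(initiated),
            };
        }
    }

    /// A real config arrived; the returned duration spans every retry round trip,
    /// which is what `project_cache.fetch.duration` measures.
    fn complete(&mut self, now: Instant) -> Option<Duration> {
        let FetchState::InProgress { initiated } = *self else {
            return None;
        };
        *self = FetchState::Complete { when: now };
        Some(now.duration_since(initiated))
    }
}

fn main() {
    let t0 = Instant::now();
    let mut state = FetchState::Pending { initiated: None };

    state.begin(t0); // first attempt starts the clock
    state.still_pending(); // upstream only has a pending config
    state.begin(t0 + Duration::from_secs(5)); // retry keeps the original `initiated`

    let total = state.complete(t0 + Duration::from_secs(8)).unwrap();
    assert_eq!(total, Duration::from_secs(8)); // end to end, not just the last attempt
}
```
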
relay-server/src/statsd.rs

Lines changed: 6 additions & 0 deletions
```diff
@@ -416,7 +416,12 @@ pub enum RelayTimers {
     /// and when Relay updates its local cache with the new project config.
     ///
     /// No metric is emitted when Relay fetches a project config for the first time.
+    ///
+    /// This metric is tagged with:
+    ///  - `delay`: Bucketed amount of seconds passed between fetches.
     ProjectCacheUpdateLatency,
+    /// Total time spent from starting to fetch a project config update to completing the fetch.
+    ProjectCacheFetchDuration,
     /// Total time in milliseconds spent fetching queued project configuration updates requests to
     /// resolve.
     ///
@@ -605,6 +610,7 @@ impl TimerMetric for RelayTimers {
             #[cfg(feature = "processing")]
             RelayTimers::ProjectStateDecompression => "project_state.decompression",
             RelayTimers::ProjectCacheUpdateLatency => "project_cache.latency",
+            RelayTimers::ProjectCacheFetchDuration => "project_cache.fetch.duration",
             RelayTimers::RequestsDuration => "requests.duration",
             RelayTimers::MinidumpScrubbing => "scrubbing.minidumps.duration",
             RelayTimers::ViewHierarchyScrubbing => "scrubbing.view_hierarchy_scrubbing.duration",
```

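For orientation, tagged timers like the two above typically reach the metrics backend as Datadog-style statsd datagrams. The exact wire format depends on the configured statsd sink, so the following lines are purely illustrative, with made-up values:

```text
project_cache.latency:42000|ms|#delay:lte60s
project_cache.fetch.duration:1250|ms|#state:new
project_cache.fetch.duration:800|ms|#state:not_modified
```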