Skip to content

Commit 93a0548

Browse files
authored
Bug 2007542 - Rate limit error reports (mozilla#7139)
1 parent 009bf85 commit 93a0548

File tree

1 file changed

+114
-3
lines changed

1 file changed

+114
-3
lines changed

components/support/error/src/error_tracing.rs

Lines changed: 114 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,34 @@
22
* License, v. 2.0. If a copy of the MPL was not distributed with this
33
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
44

5+
use std::{
6+
collections::HashMap,
7+
time::{Duration, Instant},
8+
};
9+
510
use parking_lot::Mutex;
611

7-
static RECENT_BREADCRUMBS: Mutex<BreadcrumbRingBuffer> = Mutex::new(BreadcrumbRingBuffer::new());
12+
static GLOBALS: Mutex<Globals> = Mutex::new(Globals::new());
813

914
pub fn report_error_to_app(type_name: String, message: String) {
15+
// First step: perform all actions that we need the lock for.
16+
let breadcrumbs = {
17+
let mut globals = GLOBALS.lock();
18+
if !globals
19+
.rate_limiter
20+
.should_send_report(&type_name, Instant::now())
21+
{
22+
return;
23+
}
24+
globals.breadcrumbs.get_breadcrumbs()
25+
};
1026
// Report errors by sending a tracing event to the `app-services-error-reporter::error` target.
1127
//
1228
// Applications should register for these events and send a glean error ping when they occur.
1329
//
1430
// breadcrumbs will be sent in the `breadcrumbs` field as a single string, with each individual
1531
// breadcrumb joined by newlines.
16-
let breadcrumbs = RECENT_BREADCRUMBS.lock().get_breadcrumbs().join("\n");
32+
let breadcrumbs = breadcrumbs.join("\n");
1733
tracing_support::error!(target: "app-services-error-reporter::error", message, type_name, breadcrumbs);
1834
}
1935

@@ -22,10 +38,25 @@ pub fn report_breadcrumb(message: String, module: String, line: u32, column: u32
2238
// - Push it to the `RECENT_BREADCRUMBS` list
2339
// - Send out the `app-services-error-reporter::breadcrumb`. Applications can register for
2440
// these events and log them.
25-
RECENT_BREADCRUMBS.lock().push(message.clone());
41+
GLOBALS.lock().breadcrumbs.push(message.clone());
2642
tracing_support::info!(target: "app-services-error-reporter::breadcrumb", message, module, line, column);
2743
}
2844

45+
// Global structs used for error reporting
46+
struct Globals {
47+
breadcrumbs: BreadcrumbRingBuffer,
48+
rate_limiter: RateLimiter,
49+
}
50+
51+
impl Globals {
52+
const fn new() -> Self {
53+
Self {
54+
breadcrumbs: BreadcrumbRingBuffer::new(),
55+
rate_limiter: RateLimiter::new(),
56+
}
57+
}
58+
}
59+
2960
/// Ring buffer implementation that we use to store the most recent 20 breadcrumbs
3061
#[derive(Default)]
3162
struct BreadcrumbRingBuffer {
@@ -73,6 +104,49 @@ fn truncate_breadcrumb(breadcrumb: String) -> String {
73104
breadcrumb[0..split_point].to_string()
74105
}
75106

107+
/// Rate-limits error reports per component by type to a max of 20/hr
108+
///
109+
/// This uses the simplest algorithm possible. We could use something like a token bucket to allow
110+
/// for a small burst of errors, but that doesn't seem so useful. In that scenario, the first
111+
/// error report is the one we want to fix.
112+
struct RateLimiter {
113+
// Optional so we can make `new()` const.
114+
last_report: Option<HashMap<String, Instant>>,
115+
}
116+
117+
impl RateLimiter {
118+
// Rate limit reports if they're within 3 minutes of each other.
119+
const INTERVAL: Duration = Duration::from_secs(180);
120+
121+
const fn new() -> Self {
122+
Self { last_report: None }
123+
}
124+
125+
fn should_send_report(&mut self, error_type: &str, now: Instant) -> bool {
126+
let component = error_type.split("-").next().unwrap();
127+
let last_report = self.last_report.get_or_insert_with(HashMap::default);
128+
129+
if let Some(last_report) = last_report.get(component) {
130+
match now.checked_duration_since(*last_report) {
131+
// Not enough time has passed, rate-limit the report
132+
Some(elapsed) if elapsed < Self::INTERVAL => {
133+
return false;
134+
}
135+
// For all other cases, fall through and allow the report to be sent.
136+
//
137+
// Note: this also covers the `None` case which happens when the clock is
138+
// non-monotonic. This shouldn't happen often, but it's possible after the OS syncs
139+
// with NTP, if users manually adjust their clocks, etc. Letting an extra event
140+
// through seems okay in this case. We should get back into a good state soon
141+
// after.
142+
_ => (),
143+
}
144+
}
145+
last_report.insert(component.to_string(), now);
146+
true
147+
}
148+
}
149+
76150
#[cfg(test)]
77151
mod test {
78152
use super::*;
@@ -212,4 +286,41 @@ mod test {
212286
// fire emoji, which is multiple bytes long.
213287
assert_eq!(truncate_breadcrumb("0".repeat(99) + "🔥").len(), 99);
214288
}
289+
290+
#[test]
291+
fn test_rate_limiter() {
292+
let mut rate_limiter = RateLimiter::new();
293+
let start = Instant::now();
294+
let min = Duration::from_secs(60);
295+
// The first error report is okay
296+
assert!(rate_limiter.should_send_report("test-type", start));
297+
// The report should be rate limited until 3 minutes pass, then we can send another one.
298+
// Add time from the instant to simulate time going forward.
299+
assert!(!rate_limiter.should_send_report("test-type", start));
300+
assert!(!rate_limiter.should_send_report("test-type", start + min * 1));
301+
assert!(!rate_limiter.should_send_report("test-type", start + min * 2));
302+
assert!(rate_limiter.should_send_report("test-type", start + min * 3));
303+
assert!(!rate_limiter.should_send_report("test-type", start + min * 4));
304+
assert!(!rate_limiter.should_send_report("test-type", start + min * 5));
305+
assert!(rate_limiter.should_send_report("test-type", start + min * 6));
306+
307+
assert!(rate_limiter.should_send_report("test-type", start + min * 60));
308+
assert!(!rate_limiter.should_send_report("test-type", start + min * 61));
309+
assert!(!rate_limiter.should_send_report("test-type", start + min * 62));
310+
assert!(rate_limiter.should_send_report("test-type", start + min * 63));
311+
}
312+
313+
#[test]
314+
fn test_rate_limiter_type_matching() {
315+
let mut rate_limiter = RateLimiter::new();
316+
let start = Instant::now();
317+
// Cause error error reports to be rate limited
318+
assert!(rate_limiter.should_send_report("componenta-network-error", start));
319+
assert!(!rate_limiter.should_send_report("componenta-network-error", start));
320+
// Other reports from the same component should also be rate limited
321+
assert!(!rate_limiter.should_send_report("componenta-database-error", start));
322+
// But not ones from other components
323+
assert!(rate_limiter.should_send_report("componentb-database-error", start));
324+
assert!(rate_limiter.should_send_report("componentaa-network-error", start));
325+
}
215326
}

0 commit comments

Comments
 (0)