Skip to content

Commit 38dc384

Browse files
authored
Server: fixup some flaky tests (#965)
Many of the flakes we see are timing-related. The more straightforward kludges I've used here are just adding small sleeps between things so they are more clearly separated in time. --- `test_endpoint_disable_on_repeated_failure` is a special case. The test requires that 2 requests fire but not too soon but also not too far apart. In practice when this test fails, it's because the 2nd request fires too late, after the "forgiveness" rule kicks in (if an endpoint fails and we don't see it _fail again_ within 2x the `disabled_in` duration, then we don't disable it). The reason for the poor timing could be contention on the db/queue from or just due to the CPU being too busy. I tweaked the timing a little to try and smooth it over, but setting `RUST_TEST_THREADS=1` seemed to help the most. When I run the suite locally with `RUST_TEST_THREADS=1` set, I regularly see deadlocks, so I've set this in CI, not in `run-tests.sh` for the time being. To be fair, I also see deadlocks locally without `RUST_TEST_THREADS=1` being set, but different ones. Commonly these deadlocking tests involve multiple calls to "start svix server" functions, and seem to be mitigated by carefully dropping/aborting the server join handles one by one, or rewriting such that you only need one server. A couple of these tests have been rewritten, but there are going to be more out there.
2 parents 620cf53 + 3c7f042 commit 38dc384

File tree

6 files changed

+46
-23
lines changed

6 files changed

+46
-23
lines changed

.github/workflows/server-ci.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ jobs:
7373

7474
- name: Run tests
7575
working-directory: ./server
76+
env:
77+
# Timing sensitive tests can flake if the docker-compose services get overwhelmed.
78+
# Restrict test execution to help avoid this.
79+
# `test_endpoint_disable_on_repeated_failure` specifically benefits.
80+
RUST_TEST_THREADS: 1
7681
run: ./run-tests.sh
7782

7883
- name: Stop dependencies

server/svix-server/src/worker.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,10 @@ async fn process_endpoint_success(
115115
/// If no failure has previously been reported, then now is cached as the time of first failure and
116116
/// the endpoint is not disabled.
117117
///
118-
/// If there has been a preivous failure, then it is compared to the configured grace period, where
118+
/// If there has been a previous failure, then it is compared to the configured grace period, where
119119
/// if there have been only failures within the grace period, then the endpoint is disabled.
120120
///
121-
/// All cache values are set with an expiration time greater thah the grace period, so occasional
121+
/// All cache values are set with an expiration time greater that the grace period, so occasional
122122
/// failures will not cause an endpoint to be disabled.
123123
#[tracing::instrument(skip_all)]
124124
async fn process_endpoint_failure(
@@ -131,7 +131,7 @@ async fn process_endpoint_failure(
131131
let key = FailureCacheKey::new(org_id, app_id, &endp.id);
132132
let now = Utc::now();
133133

134-
// If it already exists in the cache, see if the grace preiod has already elapsed
134+
// If it already exists in the cache, see if the grace period has already elapsed
135135
if let Some(FailureCacheValue { first_failure_at }) = cache
136136
.get::<FailureCacheValue>(&key)
137137
.await

server/svix-server/tests/e2e_application.rs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33

44
use crate::utils::common_calls::metadata;
55
use reqwest::StatusCode;
6+
use svix_server::core::security::generate_org_token;
7+
use svix_server::core::types::{BaseId, OrganizationId};
68
use svix_server::{
79
cfg::CacheType, core::types::ApplicationUid, v1::endpoints::application::ApplicationIn,
810
v1::endpoints::application::ApplicationOut,
911
};
1012

1113
mod utils;
1214

15+
use crate::utils::get_default_test_config;
1316
use utils::{
1417
common_calls::{application_in, common_test_list},
1518
start_svix_server, IgnoredResponse,
@@ -549,11 +552,8 @@ async fn test_uid() {
549552

550553
#[tokio::test]
551554
async fn test_uid_across_users() {
552-
let (client, _jh) = start_svix_server().await;
553-
let (client2, _jh2) = start_svix_server().await;
554-
555+
let (mut client, _jh) = start_svix_server().await;
555556
// Make sure that uids aren't unique across different users
556-
557557
let _app: ApplicationOut = client
558558
.post(
559559
"api/v1/app/",
@@ -567,7 +567,15 @@ async fn test_uid_across_users() {
567567
.await
568568
.unwrap();
569569

570-
let _app2: ApplicationOut = client2
570+
// N.b. previously we made a 2nd call to `start_svix_server()` just to create a 2nd client with
571+
// a fresh token.
572+
// It started deadlocking on that 2nd server call, so instead just make a new token and update
573+
// the auth header on the existing client.
574+
let cfg = get_default_test_config();
575+
let other_token = generate_org_token(&cfg.jwt_secret, OrganizationId::new(None, None)).unwrap();
576+
client.set_auth_header(other_token);
577+
578+
let _app2: ApplicationOut = client
571579
.post(
572580
"api/v1/app/",
573581
ApplicationIn {

server/svix-server/tests/e2e_attempt.rs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -466,15 +466,22 @@ async fn test_pagination_by_endpoint() {
466466
let mut messages = Vec::new();
467467
for i in 1..=6usize {
468468
messages.push(
469-
create_test_message(
470-
&client,
471-
&app.id,
472-
serde_json::json!({
473-
"test": i,
474-
}),
475-
)
476-
.await
477-
.unwrap(),
469+
async {
470+
// the requests that depend on time (ie, `before` and `after`) can flake if too many
471+
// messages are created too close together.
472+
// This short sleep aims to separate them a little so we can get clean counts.
473+
tokio::time::sleep(Duration::from_millis(10)).await;
474+
create_test_message(
475+
&client,
476+
&app.id,
477+
serde_json::json!({
478+
"test": i,
479+
}),
480+
)
481+
.await
482+
.unwrap()
483+
}
484+
.await,
478485
);
479486
}
480487

server/svix-server/tests/e2e_endpoint.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -982,6 +982,7 @@ async fn test_recovery_expected_retry_counts() {
982982
.await
983983
.unwrap();
984984

985+
tokio::time::sleep(Duration::from_millis(10)).await;
985986
let after_msg = Utc::now();
986987

987988
// recovery time after msg -- should be no additional attempts
@@ -1343,14 +1344,14 @@ async fn test_custom_endpoint_secret() {
13431344
#[tokio::test]
13441345
async fn test_endpoint_secret_encryption() {
13451346
let org_id = OrganizationId::new(None, None);
1346-
let cfg = get_default_test_config();
1347-
let (client, _jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
13481347

13491348
#[derive(Deserialize)]
13501349
pub struct EndpointSecretOutTest {
13511350
pub key: String,
13521351
}
13531352

1353+
let cfg = get_default_test_config();
1354+
let (client, jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
13541355
let app_id = create_test_app(&client, "app1").await.unwrap().id;
13551356

13561357
let ep_in = default_test_endpoint();
@@ -1367,11 +1368,12 @@ async fn test_endpoint_secret_encryption() {
13671368
.await
13681369
.unwrap()
13691370
.key;
1371+
jh.abort();
13701372

13711373
// Now add encryption and check the secret is still fine
13721374
let mut cfg = get_default_test_config();
13731375
cfg.encryption = Encryption::new([1; 32]);
1374-
let (client, _jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
1376+
let (client, jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
13751377

13761378
let secret2 = client
13771379
.get::<EndpointSecretOutTest>(
@@ -1406,6 +1408,7 @@ async fn test_endpoint_secret_encryption() {
14061408

14071409
// Ensure loading and saving works for encrypted
14081410
assert_eq!(secret, secret2);
1411+
jh.abort();
14091412

14101413
// Make sure we can't read it with the secret unset
14111414
let cfg = get_default_test_config();

server/svix-server/tests/worker.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,15 +130,15 @@ async fn test_no_redirects_policy() {
130130

131131
/// This tests that endpoints are successfully disabled after the retry schedule is exhausted
132132
/// multiple times without intermittent success over a period exceeding the grace period. So the
133-
/// tests don't take too long, thes grace period and expiration period will be reconfigured to be
133+
/// tests don't take too long, these grace period and expiration period will be reconfigured to be
134134
/// on the order of seconds
135135
#[tokio::test]
136136
async fn test_endpoint_disable_on_repeated_failure() {
137137
let mut cfg = get_default_test_config();
138138

139139
if !matches!(cfg.cache_type, svix_server::cfg::CacheType::None) {
140140
cfg.retry_schedule = vec![];
141-
cfg.endpoint_failure_disable_after = Duration::from_secs(1);
141+
cfg.endpoint_failure_disable_after = Duration::from_secs(2);
142142

143143
let (client, _jh) = start_svix_server_with_cfg(&cfg).await;
144144

@@ -153,7 +153,7 @@ async fn test_endpoint_disable_on_repeated_failure() {
153153
.unwrap()
154154
.id;
155155

156-
tokio::time::sleep(Duration::from_millis(1200)).await;
156+
tokio::time::sleep(Duration::from_millis(2_500)).await;
157157

158158
let _msg_id = create_test_message(&client, &app_id, serde_json::json!({}))
159159
.await

0 commit comments

Comments
 (0)