Server: fixup some flaky tests (#965)

svix-onelson · web-flow · commit 38dc3842de68 · 2023-06-20T11:43:23.000-07:00
Many of the flakes we see are timing-related. The more straightforward
kludges I've used here are just adding small sleeps between things so
they are more clearly separated in time.

---

`test_endpoint_disable_on_repeated_failure` is a special case.

The test requires that 2 requests fire neither too close together nor
too far apart.
In practice when this test fails, it's because the 2nd request fires too
late, after the "forgiveness" rule kicks in (if an endpoint fails and we
don't see it _fail again_ within 2x the `disabled_in` duration, then we
don't disable it).

The reason for the poor timing could be contention on the db/queue from
other tests running concurrently, or just the CPU being too busy. I
tweaked the timing a little to try and smooth it over, but setting
`RUST_TEST_THREADS=1` seemed to help the most.

When I run the suite locally with `RUST_TEST_THREADS=1`
set, I regularly see deadlocks, so I've set this in CI, not in
`run-tests.sh` for the time being.

To be fair, I also see deadlocks locally without `RUST_TEST_THREADS=1`
being set, but different ones.

Commonly these deadlocking tests involve multiple calls to "start svix
server" functions, and seem to be mitigated by carefully
dropping/aborting the server join handles one by one, or rewriting such
that you only need one server. A couple of these tests have been
rewritten, but there are going to be more out there.
diff --git a/.github/workflows/server-ci.yml b/.github/workflows/server-ci.yml
@@ -73,6 +73,11 @@ jobs:
 
     - name: Run tests
       working-directory: ./server
+      env:
+        # Timing sensitive tests can flake if the docker-compose services get overwhelmed.
+        # Restrict test execution to help avoid this.
+        # `test_endpoint_disable_on_repeated_failure` specifically benefits.
+        RUST_TEST_THREADS: 1
       run: ./run-tests.sh
 
     - name: Stop dependencies
diff --git a/server/svix-server/src/worker.rs b/server/svix-server/src/worker.rs
@@ -115,10 +115,10 @@ async fn process_endpoint_success(
 /// If no failure has previously been reported, then now is cached as the time of first failure and
 /// the endpoint is not disabled.
 ///
-/// If there has been a  preivous failure, then it is compared to the configured grace period, where
+/// If there has been a  previous failure, then it is compared to the configured grace period, where
 /// if there have been only failures within the grace period, then the endpoint is disabled.
 ///
-/// All cache values are set with an expiration time greater thah the grace period, so occasional
+/// All cache values are set with an expiration time greater than the grace period, so occasional
 /// failures will not cause an endpoint to be disabled.
 #[tracing::instrument(skip_all)]
 async fn process_endpoint_failure(
@@ -131,7 +131,7 @@ async fn process_endpoint_failure(
     let key = FailureCacheKey::new(org_id, app_id, &endp.id);
     let now = Utc::now();
 
-    // If it already exists in the cache, see if the grace preiod has already elapsed
+    // If it already exists in the cache, see if the grace period has already elapsed
     if let Some(FailureCacheValue { first_failure_at }) = cache
         .get::<FailureCacheValue>(&key)
         .await
diff --git a/server/svix-server/tests/e2e_application.rs b/server/svix-server/tests/e2e_application.rs
@@ -3,13 +3,16 @@
 
 use crate::utils::common_calls::metadata;
 use reqwest::StatusCode;
+use svix_server::core::security::generate_org_token;
+use svix_server::core::types::{BaseId, OrganizationId};
 use svix_server::{
     cfg::CacheType, core::types::ApplicationUid, v1::endpoints::application::ApplicationIn,
     v1::endpoints::application::ApplicationOut,
 };
 
 mod utils;
 
+use crate::utils::get_default_test_config;
 use utils::{
     common_calls::{application_in, common_test_list},
     start_svix_server, IgnoredResponse,
@@ -549,11 +552,8 @@ async fn test_uid() {
 
 #[tokio::test]
 async fn test_uid_across_users() {
-    let (client, _jh) = start_svix_server().await;
-    let (client2, _jh2) = start_svix_server().await;
-
+    let (mut client, _jh) = start_svix_server().await;
     // Make sure that uids aren't unique across different users
-
     let _app: ApplicationOut = client
         .post(
             "api/v1/app/",
@@ -567,7 +567,15 @@ async fn test_uid_across_users() {
         .await
         .unwrap();
 
-    let _app2: ApplicationOut = client2
+    // N.b. previously we made a 2nd call to `start_svix_server()` just to create a 2nd client with
+    // a fresh token.
+    // It started deadlocking on that 2nd server call, so instead just make a new token and update
+    // the auth header on the existing client.
+    let cfg = get_default_test_config();
+    let other_token = generate_org_token(&cfg.jwt_secret, OrganizationId::new(None, None)).unwrap();
+    client.set_auth_header(other_token);
+
+    let _app2: ApplicationOut = client
         .post(
             "api/v1/app/",
             ApplicationIn {
diff --git a/server/svix-server/tests/e2e_attempt.rs b/server/svix-server/tests/e2e_attempt.rs
@@ -466,15 +466,22 @@ async fn test_pagination_by_endpoint() {
     let mut messages = Vec::new();
     for i in 1..=6usize {
         messages.push(
-            create_test_message(
-                &client,
-                &app.id,
-                serde_json::json!({
-                    "test": i,
-                }),
-            )
-            .await
-            .unwrap(),
+            async {
+                // the requests that depend on time (ie, `before` and `after`) can flake if too many
+                // messages are created too close together.
+                // This short sleep aims to separate them a little so we can get clean counts.
+                tokio::time::sleep(Duration::from_millis(10)).await;
+                create_test_message(
+                    &client,
+                    &app.id,
+                    serde_json::json!({
+                        "test": i,
+                    }),
+                )
+                .await
+                .unwrap()
+            }
+            .await,
         );
     }
 
diff --git a/server/svix-server/tests/e2e_endpoint.rs b/server/svix-server/tests/e2e_endpoint.rs
@@ -982,6 +982,7 @@ async fn test_recovery_expected_retry_counts() {
         .await
         .unwrap();
 
+    tokio::time::sleep(Duration::from_millis(10)).await;
     let after_msg = Utc::now();
 
     // recovery time after msg -- should be no additional attempts
@@ -1343,14 +1344,14 @@ async fn test_custom_endpoint_secret() {
 #[tokio::test]
 async fn test_endpoint_secret_encryption() {
     let org_id = OrganizationId::new(None, None);
-    let cfg = get_default_test_config();
-    let (client, _jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
 
     #[derive(Deserialize)]
     pub struct EndpointSecretOutTest {
         pub key: String,
     }
 
+    let cfg = get_default_test_config();
+    let (client, jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
     let app_id = create_test_app(&client, "app1").await.unwrap().id;
 
     let ep_in = default_test_endpoint();
@@ -1367,11 +1368,12 @@ async fn test_endpoint_secret_encryption() {
         .await
         .unwrap()
         .key;
+    jh.abort();
 
     // Now add encryption and check the secret is still fine
     let mut cfg = get_default_test_config();
     cfg.encryption = Encryption::new([1; 32]);
-    let (client, _jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
+    let (client, jh) = start_svix_server_with_cfg_and_org_id(&cfg, org_id.clone()).await;
 
     let secret2 = client
         .get::<EndpointSecretOutTest>(
@@ -1406,6 +1408,7 @@ async fn test_endpoint_secret_encryption() {
 
     // Ensure loading and saving works for encrypted
     assert_eq!(secret, secret2);
+    jh.abort();
 
     // Make sure we can't read it with the secret unset
     let cfg = get_default_test_config();
diff --git a/server/svix-server/tests/worker.rs b/server/svix-server/tests/worker.rs
@@ -130,15 +130,15 @@ async fn test_no_redirects_policy() {
 
 /// This tests that endpoints are successfully disabled after the retry schedule is exhausted
 /// multiple times without intermittent success over a period exceeding the grace period. So the
-/// tests don't take too long, thes grace period and expiration period will be reconfigured to be
+/// tests don't take too long, the grace period and expiration period will be reconfigured to be
 /// on the order of seconds
 #[tokio::test]
 async fn test_endpoint_disable_on_repeated_failure() {
     let mut cfg = get_default_test_config();
 
     if !matches!(cfg.cache_type, svix_server::cfg::CacheType::None) {
         cfg.retry_schedule = vec![];
-        cfg.endpoint_failure_disable_after = Duration::from_secs(1);
+        cfg.endpoint_failure_disable_after = Duration::from_secs(2);
 
         let (client, _jh) = start_svix_server_with_cfg(&cfg).await;
 
@@ -153,7 +153,7 @@ async fn test_endpoint_disable_on_repeated_failure() {
             .unwrap()
             .id;
 
-        tokio::time::sleep(Duration::from_millis(1200)).await;
+        tokio::time::sleep(Duration::from_millis(2_500)).await;
 
         let _msg_id = create_test_message(&client, &app_id, serde_json::json!({}))
             .await