Skip to content

Commit 4dfd4ec

Browse files
authored
test(policy): address timeouts and flakiness (#14773)
Policy tests are very flaky. Currently one of the main culprits is that service account creation sometimes isn't caught as an event by the watcher, blocking `await_service_account` until it times out after 60s. We already have in place up to 3 retries when calling `cargo nextest`, but these tests are sequential and the 60s timeouts start accumulating until we reach the CI job timeout at 20min. This change first lowers the service account creation timeout down to 15s, understanding that if the watcher catches that event it will do pretty quickly or else block indefinitely. So better to fail faster and trigger the test retry ASAP. With this change, `test-policy (v1.34, linkerd, experimental)` is finally passing, taking 17m due to the large number of retries: ``` Summary [ 779.409s] 151 tests run: 151 passed (15 flaky), 0 skipped FLAKY 2/4 [ 0.069s] linkerd-policy-test::admit_network_authentication rejects_invalid_cidr FLAKY 3/4 [ 15.019s] linkerd-policy-test::e2e_audit ns_audit FLAKY 2/4 [ 10.964s] linkerd-policy-test::e2e_authorization_policy targets_route FLAKY 3/4 [ 3.830s] linkerd-policy-test::e2e_egress_network default_traffic_policy_http_allow FLAKY 2/4 [ 37.004s] linkerd-policy-test::e2e_http_local_ratelimit_policy ratelimit_total FLAKY 2/4 [ 7.947s] linkerd-policy-test::e2e_server_authorization network FLAKY 2/4 [ 0.142s] linkerd-policy-test::inbound_http_route_status inbound_accepted_parent FLAKY 2/4 [ 0.167s] linkerd-policy-test::inbound_http_route_status inbound_multiple_parents FLAKY 2/4 [ 1.681s] linkerd-policy-test::outbound_api multiple_routes FLAKY 2/4 [ 1.013s] linkerd-policy-test::outbound_api routes_without_backends FLAKY 3/4 [ 1.153s] linkerd-policy-test::outbound_api service_with_routes_with_cross_namespace_backend FLAKY 2/4 [ 0.282s] linkerd-policy-test::outbound_api_failure_accrual consecutive_failure_accrual FLAKY 3/4 [ 0.290s] linkerd-policy-test::outbound_api_failure_accrual default_failure_accrual FLAKY 2/4 [ 0.354s] linkerd-policy-test::outbound_api_http http_route_gateway_timeouts FLAKY 2/4 [ 0.740s] linkerd-policy-test::outbound_api_http http_route_retries_and_timeouts ``` After having measured this, we also added a watcher for the namespace, vastly reducing flakiness: ``` Summary [ 517.330s] 151 tests run: 151 passed (2 flaky), 0 skipped FLAKY 2/4 [ 0.941s] linkerd-policy-test::outbound_api routes_without_backends FLAKY 2/4 [ 0.459s] linkerd-policy-test::outbound_api_tcp multiple_tcp_routes ``` Finally, the jaeger chart version has to be pinned to an earlier one as the latest one is presenting some breaking changes that are making the tracing test fail.
1 parent 8394331 commit 4dfd4ec

File tree

2 files changed

+41
-17
lines changed

2 files changed

+41
-17
lines changed

policy-test/src/lib.rs

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ where
708708
.await;
709709
tracing::trace!(?ns);
710710
tokio::time::timeout(
711-
tokio::time::Duration::from_secs(60),
711+
tokio::time::Duration::from_secs(15),
712712
await_service_account(&client, &ns.name_unchecked(), "default"),
713713
)
714714
.await
@@ -754,33 +754,57 @@ where
754754
}
755755
}
756756

757-
pub async fn await_service_account(client: &kube::Client, ns: &str, name: &str) {
757+
async fn await_resource<T>(
758+
mut watcher: impl futures::Stream<
759+
Item = Result<kube::runtime::watcher::Event<T>, kube::runtime::watcher::Error>,
760+
> + Unpin,
761+
predicate: impl Fn(&T) -> bool,
762+
) where
763+
T: kube::Resource + std::fmt::Debug,
764+
{
758765
use futures::StreamExt;
759766

760-
tracing::trace!(%name, %ns, "Waiting for serviceaccount");
761-
tokio::pin! {
762-
let sas = kube::runtime::watcher(
763-
kube::Api::<k8s::ServiceAccount>::namespaced(client.clone(), ns),
764-
Default::default(),
765-
);
766-
}
767767
loop {
768-
let ev = sas
768+
let ev = watcher
769769
.next()
770770
.await
771-
.expect("serviceaccounts watch must not end")
772-
.expect("serviceaccounts watch must not fail");
771+
.expect("watch must not end")
772+
.expect("watch must not fail");
773773
tracing::info!(?ev);
774774
match ev {
775-
kube::runtime::watcher::Event::InitApply(sa)
776-
| kube::runtime::watcher::Event::Apply(sa)
777-
if sa.name_unchecked() == name =>
775+
kube::runtime::watcher::Event::InitApply(resource)
776+
| kube::runtime::watcher::Event::Apply(resource)
777+
if predicate(&resource) =>
778778
{
779-
return
779+
break
780780
}
781781
_ => {}
782782
}
783783
}
784+
}
785+
786+
pub async fn await_service_account(client: &kube::Client, ns: &str, name: &str) {
787+
tracing::trace!(%ns, "Waiting for namespace");
788+
789+
let label_selector = format!("kubernetes.io/metadata.name={}", ns);
790+
let watcher_config = kube::runtime::watcher::Config::default().labels(&label_selector);
791+
tokio::pin! {
792+
let namespaces = kube::runtime::watcher(
793+
kube::Api::<k8s::Namespace>::all(client.clone()),
794+
watcher_config,
795+
);
796+
}
797+
await_resource(namespaces, |namespace| namespace.name_unchecked() == ns).await;
798+
799+
tracing::trace!(%name, %ns, "Waiting for serviceaccount");
800+
801+
tokio::pin! {
802+
let sas = kube::runtime::watcher(
803+
kube::Api::<k8s::ServiceAccount>::namespaced(client.clone(), ns),
804+
Default::default(),
805+
);
806+
}
807+
await_resource(sas, |sa| sa.name_unchecked() == name).await;
784808

785809
// XXX In some versions of k8s, it may be appropriate to wait for the
786810
// ServiceAccount's secret to be created, but, as of v1.24,

test/integration/tracing/tracing/tracing_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ func installTracing(t *testing.T, namespace string) {
134134
if err != nil {
135135
testutil.AnnotatedFatalf(t, "failed to add OpenTelemetry repository", "failed to add OpenTelemetry repository\n%s\n------\n%s\n", stdout, stderr)
136136
}
137-
stdout, stderr, err = TestHelper.HelmRun("install", "jaeger", "jaegertracing/jaeger", "--namespace=tracing", "--values=testdata/jaeger-aio-values.yaml")
137+
stdout, stderr, err = TestHelper.HelmRun("install", "jaeger", "jaegertracing/jaeger", "--namespace=tracing", "--values=testdata/jaeger-aio-values.yaml", "--version=3.4.1")
138138
if err != nil {
139139
testutil.AnnotatedFatalf(t, "failed to install jaeger", "failed to install jaeger\n%s\n------\n%s\n", stdout, stderr)
140140
}

0 commit comments

Comments
 (0)