Skip to content

Commit bc437a6

Browse files
committed
e2e: add kds-pcs-downtime test
1 parent d7b380b commit bc437a6

File tree

5 files changed

+224
-5
lines changed

5 files changed

+224
-5
lines changed

.github/workflows/e2e_manual.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ on:
1616
- gpu
1717
- imagepuller-auth
1818
- imagestore
19+
- kds-pcs-downtime
1920
- memdump
2021
- multiple-cpus
2122
- openssl

.github/workflows/e2e_nightly.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ jobs:
5656
- gpu
5757
- imagepuller-auth
5858
- imagestore
59+
- kds-pcs-downtime
5960
- memdump
6061
- openssl
6162
- peerrecovery

e2e/internal/contrasttest/contrasttest.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -421,14 +421,16 @@ func (ct *ContrastTest) Verify(t *testing.T) {
421421
require.NoError(t, ct.RunVerify(t.Context()))
422422
}
423423

424-
// Recover runs the contrast recover subcommand.
424+
// Recover runs the contrast recover subcommand and fails the test if it is not successful.
425425
func (ct *ContrastTest) Recover(t *testing.T) {
426-
require := require.New(t)
426+
require.NoError(t, ct.runAgainstCoordinator(t.Context(), cmd.NewRecoverCmd()))
427+
}
427428

428-
ctx, cancel := context.WithTimeout(t.Context(), 3*time.Minute)
429+
// RunRecover runs the contrast recover subcommand.
430+
func (ct *ContrastTest) RunRecover(ctx context.Context) error {
431+
ctx, cancel := context.WithTimeout(ctx, 3*time.Minute)
429432
defer cancel()
430-
431-
require.NoError(ct.runAgainstCoordinator(ctx, cmd.NewRecoverCmd()))
433+
return ct.runAgainstCoordinator(ctx, cmd.NewRecoverCmd())
432434
}
433435

434436
// MeshCACert returns a CertPool that contains the coordinator mesh CA cert.
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
// Copyright 2025 Edgeless Systems GmbH
2+
// SPDX-License-Identifier: BUSL-1.1
3+
4+
//go:build e2e
5+
6+
package kdspcsdowntime
7+
8+
import (
9+
"context"
10+
"flag"
11+
"fmt"
12+
"net"
13+
"net/http"
14+
"os"
15+
"sync/atomic"
16+
"testing"
17+
"time"
18+
19+
"github.com/edgelesssys/contrast/e2e/internal/contrasttest"
20+
"github.com/edgelesssys/contrast/e2e/internal/kubeclient"
21+
"github.com/edgelesssys/contrast/internal/kuberesource"
22+
"github.com/edgelesssys/contrast/internal/manifest"
23+
"github.com/edgelesssys/contrast/internal/platforms"
24+
"github.com/elazarl/goproxy"
25+
"github.com/stretchr/testify/require"
26+
)
27+
28+
// CONNECT targets that the test proxy compares against when deciding whether
// a connection has to be blocked.
const (
	// kdsAddr is the host:port the proxy treats as the KDS endpoint.
	kdsAddr = "kdsintf.amd.com:443"
	// pcsAddr is the host:port the proxy treats as the PCS endpoint.
	pcsAddr = "api.trustedservices.intel.com:443"
)
32+
33+
func TestKDSPCSDowntime(t *testing.T) {
34+
platform, err := platforms.FromString(contrasttest.Flags.PlatformStr)
35+
require.NoError(t, err)
36+
ct := contrasttest.New(t)
37+
38+
runtimeHandler, err := manifest.RuntimeHandler(platform)
39+
require.NoError(t, err)
40+
resources := kuberesource.CoordinatorBundle()
41+
resources = kuberesource.PatchRuntimeHandlers(resources, runtimeHandler)
42+
resources = kuberesource.AddPortForwarders(resources)
43+
ct.Init(t, resources)
44+
45+
proxy := goproxy.NewProxyHttpServer()
46+
server := http.Server{Handler: proxy}
47+
errCh := make(chan error)
48+
49+
// If set to true, connections to KDS and PCS will be blocked by the proxy.
50+
var blockKDSPCS atomic.Bool
51+
// connectionProxied will be set to true if the proxy performs an HTTP CONNECT to the address of KDS or PCS.
52+
var connectionProxied atomic.Bool
53+
proxy.ConnectDial = func(network string, addr string) (net.Conn, error) {
54+
t.Logf("Proxying connection: %q", addr)
55+
if (addr == kdsAddr || addr == pcsAddr) && blockKDSPCS.Load() {
56+
t.Logf("Blocking connection to KDS/PCS %q", addr)
57+
connectionProxied.Store(true)
58+
return nil, fmt.Errorf("connection to KDS/PCS %q blocked by test proxy", addr)
59+
}
60+
return (&net.Dialer{}).DialContext(t.Context(), network, addr)
61+
}
62+
63+
proxyListener, err := (&net.ListenConfig{}).Listen(t.Context(), "tcp", "127.0.0.1:")
64+
require.NoError(t, err)
65+
66+
t.Cleanup(func() {
67+
require.NoError(t, server.Close())
68+
err := <-errCh
69+
require.ErrorIs(t, err, http.ErrServerClosed)
70+
})
71+
72+
go func() {
73+
errCh <- server.Serve(proxyListener)
74+
}()
75+
76+
t.Setenv("https_proxy", proxyListener.Addr().String())
77+
78+
require.True(t, t.Run("generate", ct.Generate), "contrast generate needs to succeed for subsequent tests")
79+
require.True(t, t.Run("apply", ct.Apply), "Kubernetes resources need to be applied for subsequent tests")
80+
81+
t.Run("kds downtime", func(t *testing.T) {
82+
if !platforms.IsSNP(platform) {
83+
t.Skip("KDS downtime test is only applicable to SEV-SNP workloads")
84+
}
85+
86+
require := require.New(t)
87+
88+
ctx, cancel := context.WithTimeout(t.Context(), ct.FactorPlatformTimeout(3*time.Minute))
89+
defer cancel()
90+
91+
require.NoError(ct.Kubeclient.WaitForCoordinator(ctx, ct.Namespace))
92+
93+
//
94+
// Look at dev-docs/endorsement-caching.md for table of different cases.
95+
//
96+
97+
// Coordinator and CLI cache are empty at the beginning.
98+
99+
coordinatorPods, err := ct.Kubeclient.PodsFromOwner(ctx, ct.Namespace, "StatefulSet", "coordinator")
100+
require.NoError(err)
101+
require.NotEmpty(coordinatorPods, "pod not found: %s/%s", ct.Namespace, "coordinator")
102+
103+
// Block coordinator access to KDS.
104+
etcHosts, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "cat /etc/hosts"})
105+
require.NoError(err, "stderr: %q", stderr)
106+
_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "echo 127.0.0.1 kdsintf.amd.com >> /etc/hosts"})
107+
require.NoError(err, "stderr: %q", stderr)
108+
109+
// Block CLI access to KDS.
110+
blockKDSPCS.Store(true)
111+
112+
// Set should fail because neither coordinator nor CLI can reach KDS and there is no cached data.
113+
// Set loop considers context deadline exceeded from KDS as a retriable error.
114+
// Lower the timeout so the set loop doesn't exceed the test timeout.
115+
setCtx, setCancel := context.WithTimeout(ctx, ct.FactorPlatformTimeout(1*time.Minute))
116+
defer setCancel()
117+
err = ct.RunSet(setCtx)
118+
t.Logf("Set error: %v", err)
119+
require.ErrorContains(err, "transport: authentication handshake failed: context deadline exceeded")
120+
require.True(connectionProxied.Load(), "expected connection to KDS to be proxied")
121+
connectionProxied.Store(false)
122+
123+
// Unblock coordinator access to KDS.
124+
_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", fmt.Sprintf("echo '%s' > /etc/hosts", etcHosts)})
125+
require.NoError(err, "updating /etc/hosts: stderr: %q", stderr)
126+
127+
// Set should succeed because coordinator can reach KDS.
128+
require.NoError(ct.RunSet(ctx))
129+
130+
// Block coordinator access to KDS again.
131+
_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "echo 127.0.0.1 kdsintf.amd.com >> /etc/hosts"})
132+
require.NoError(err, "updating /etc/hosts: stderr: %q", stderr)
133+
134+
// Verify should succeed because certs are now cached by coordinator.
135+
require.NoError(ct.RunVerify(ctx))
136+
137+
// Clear coordinator cache by restarting it.
138+
require.NoError(ct.Kubeclient.Restart(ctx, kubeclient.StatefulSet{}, ct.Namespace, "coordinator"))
139+
require.NoError(ct.Kubeclient.WaitForCoordinator(ctx, ct.Namespace))
140+
141+
coordinatorPods, err = ct.Kubeclient.PodsFromOwner(ctx, ct.Namespace, "StatefulSet", "coordinator")
142+
require.NoError(err)
143+
require.NotEmpty(coordinatorPods, "pod not found: %s/%s", ct.Namespace, "coordinator")
144+
145+
// Block coordinator access to KDS.
146+
_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "echo 127.0.0.1 kdsintf.amd.com >> /etc/hosts"})
147+
require.NoError(err, "updating /etc/hosts: stderr: %q", stderr)
148+
149+
// Unblock CLI access to KDS.
150+
blockKDSPCS.Store(false)
151+
152+
// Recover should succeed because CLI can reach KDS.
153+
require.NoError(ct.RunRecover(ctx))
154+
155+
// Block CLI access to KDS again.
156+
blockKDSPCS.Store(true)
157+
158+
// Verify should succeed because CLI has now cached the certs.
159+
require.NoError(ct.RunVerify(ctx))
160+
})
161+
162+
t.Run("pcs downtime", func(t *testing.T) {
163+
if !platforms.IsTDX(platform) {
164+
t.Skip("PCS downtime test is only applicable to TDX workloads")
165+
}
166+
167+
require := require.New(t)
168+
169+
ctx, cancel := context.WithTimeout(t.Context(), ct.FactorPlatformTimeout(2*time.Minute))
170+
defer cancel()
171+
172+
c := kubeclient.NewForTest(t)
173+
174+
require.NoError(c.WaitForCoordinator(ctx, ct.Namespace))
175+
176+
//
177+
// We can't test PCS downtime on the issuer side, since PCS/PCCS are accessed from the host.
178+
// Look at dev-docs/endorsement-caching.md for table of different cases.
179+
//
180+
181+
// CLI cache is empty at the beginning. Block CLI access to PCS.
182+
blockKDSPCS.Store(true)
183+
184+
// Set should fail because the CLI can't reach the PCS and there is no cached data.
185+
// Set loop considers context deadline exceeded from PCS as a retriable error.
186+
// Lower the timeout so the set loop doesn't exceed the test timeout.
187+
setCtx, setCancel := context.WithTimeout(ctx, ct.FactorPlatformTimeout(1*time.Minute))
188+
defer setCancel()
189+
err = ct.RunSet(setCtx)
190+
t.Logf("Set error: %v", err)
191+
require.ErrorContains(err, "transport: authentication handshake failed: context deadline exceeded")
192+
require.True(connectionProxied.Load(), "expected connection to PCS to be proxied")
193+
connectionProxied.Store(false)
194+
195+
// Unblock CLI access to PCS.
196+
blockKDSPCS.Store(false)
197+
198+
// Set should succeed because the CLI can reach PCS.
199+
require.NoError(ct.RunSet(ctx))
200+
201+
// Block CLI access to PCS again.
202+
blockKDSPCS.Store(true)
203+
204+
// Verify should succeed because collateral is now cached by CLI.
205+
require.NoError(ct.RunVerify(ctx))
206+
})
207+
}
208+
209+
func TestMain(m *testing.M) {
210+
contrasttest.RegisterFlags()
211+
flag.Parse()
212+
213+
os.Exit(m.Run())
214+
}

packages/by-name/contrast/e2e/package.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ buildGoModule {
6464
"e2e/gpu"
6565
"e2e/imagepuller-auth"
6666
"e2e/imagestore"
67+
"e2e/kds-pcs-downtime"
6768
"e2e/memdump"
6869
"e2e/multiple-cpus"
6970
"e2e/openssl"

0 commit comments

Comments
 (0)