Skip to content

Commit 3a5819f

Browse files
turan18sondavidb
authored andcommitted
Fix network retry integration test
Signed-off-by: Yasin Turan <[email protected]> Signed-off-by: David Son <[email protected]>
1 parent f2e9453 commit 3a5819f

File tree

1 file changed

+47
-110
lines changed

1 file changed

+47
-110
lines changed

integration/run_test.go

Lines changed: 47 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,19 @@ package integration
3535
import (
3636
"bufio"
3737
"bytes"
38-
"encoding/json"
3938
"fmt"
40-
"io"
4139
"math"
4240
"os"
4341
"regexp"
44-
"runtime"
4542
"strconv"
4643
"strings"
44+
"sync"
4745
"testing"
4846
"time"
4947

5048
"github.com/awslabs/soci-snapshotter/soci/store"
5149
shell "github.com/awslabs/soci-snapshotter/util/dockershell"
50+
"github.com/awslabs/soci-snapshotter/util/testutil"
5251
)
5352

5453
func TestRunWithDefaultConfig(t *testing.T) {
@@ -221,15 +220,7 @@ type retryConfig struct {
221220
// TestNetworkRetry runs a container, disables network access to the remote image, asks the container
222221
// to do something requiring the remote image, waits for some/all requests to fail, enables the network,
223222
// confirms retries and success/failure
224-
225-
// TODO: Refactor test
226223
func TestNetworkRetry(t *testing.T) {
227-
// This test makes the testing suite hang on ARM.
228-
// Disabling for now till we can refactor this.
229-
if runtime.GOARCH == "arm64" {
230-
return
231-
}
232-
233224
const containerImage = alpineImage
234225

235226
tests := []struct {
@@ -247,12 +238,12 @@ func TestNetworkRetry(t *testing.T) {
247238
},
248239
},
249240
{
250-
name: "1s network interruption, no retries allowed, failure",
241+
name: "6s network interruption, no retries allowed, failure",
251242
config: retryConfig{
252243
maxRetries: -1,
253244
minWaitMsec: 0,
254245
maxWaitMsec: 0,
255-
networkDisableMsec: 1000,
246+
networkDisableMsec: 6000,
256247
expectedSuccess: false,
257248
},
258249
},
@@ -267,12 +258,12 @@ func TestNetworkRetry(t *testing.T) {
267258
},
268259
},
269260
{
270-
name: "10s network interruption, ~6-7s retries allowed, failure",
261+
name: "12s network interruption, ~6-7s retries allowed, failure",
271262
config: retryConfig{
272263
maxRetries: 1,
273264
minWaitMsec: 100,
274265
maxWaitMsec: 1600,
275-
networkDisableMsec: 10000,
266+
networkDisableMsec: 12000,
276267
expectedSuccess: false,
277268
},
278269
},
@@ -296,135 +287,81 @@ max_wait_msec = ` + strconv.FormatInt(tt.config.maxWaitMsec, 10) + `
296287
disable = true
297288
`
298289

299-
m := rebootContainerd(t, sh, getContainerdConfigToml(t, false), getSnapshotterConfigToml(t, false, config))
290+
rebootContainerd(t, sh, getContainerdConfigToml(t, false), getSnapshotterConfigToml(t, false, config))
300291
// Mirror image
301292
copyImage(sh, dockerhub(containerImage), regConfig.mirror(containerImage))
302-
// Pull image, create SOCI index with all layers and small (100kiB) spans
303-
indexDigest := buildIndex(sh, regConfig.mirror(containerImage), withMinLayerSize(0), withSpanSize(100*1024))
304-
sh.X(append(imagePullCmd, "--soci-index-digest", indexDigest, regConfig.mirror(containerImage).ref)...)
305-
306-
// Run the container
307293
image := regConfig.mirror(containerImage).ref
308-
sh.X(append(runSociCmd, "--name", "test-container", "-d", image, "sleep", "infinity")...)
309-
310-
sh.X("apt-get", "--no-install-recommends", "install", "-qy", "iptables")
311-
312-
// TODO: Wait for the container to be up and running
294+
indexDigest := buildIndex(sh, regConfig.mirror(containerImage), withMinLayerSize(0), withSpanSize(1<<20))
295+
sh.X("soci", "push", "--user", regConfig.creds(), regConfig.mirror(containerImage).ref)
313296

297+
rebootContainerd(t, sh, getContainerdConfigToml(t, false), getSnapshotterConfigToml(t, false, config))
298+
// Re-pull image from our local registry mirror
299+
sh.X(append(imagePullCmd, "--soci-index-digest", indexDigest, regConfig.mirror(containerImage).ref)...)
300+
sh.X("apt-get", "-qq", "--no-install-recommends", "install", "-y", "iptables")
314301
// Block network access to the registry
315302
if tt.config.networkDisableMsec > 0 {
316303
sh.X("iptables", "-A", "OUTPUT", "-d", registryHostIP, "-j", "DROP")
317304
}
318305

319-
// Do something in the container that should work without network access
320-
commandSucceedStdout, _, err := sh.R("nerdctl", "exec", "test-container", "sh", "-c", "times")
306+
// Read a file to trigger a synchronous network request.
307+
cmdStdout, cmdStderr, err := sh.R(append(runSociCmd, image, "cat", "/etc/hosts")...)
321308
if err != nil {
322-
t.Fatalf("attempt to run task without network access failed: %s", err)
323-
}
324-
325-
type ErrorLogLine struct {
326-
Error string `json:"error"`
327-
Msg string `json:"msg"`
309+
t.Fatalf("attempt to run task requiring network access failed: %s", err)
328310
}
329311

330-
gaveUpChannel := make(chan bool, 1)
331-
defer close(gaveUpChannel)
312+
successChannel := make(chan string, 1)
313+
failureChannel := make(chan string, 1)
332314

333-
monitorGaveUp := func(rawL string) {
334-
if i := strings.Index(rawL, "{"); i > 0 {
335-
rawL = rawL[i:] // trim garbage chars; expects "{...}"-styled JSON log
336-
}
337-
var logLine ErrorLogLine
338-
if err := json.Unmarshal([]byte(rawL), &logLine); err == nil {
339-
if logLine.Msg == "statFile error" && strings.Contains(logLine.Error, "giving up after") {
340-
gaveUpChannel <- true
341-
return
342-
}
315+
listener := func(c chan string) func(string) {
316+
var once sync.Once
317+
return func(rawLog string) {
318+
once.Do(func() { c <- rawLog })
343319
}
344320
}
345321

346-
m.Add("retry", monitorGaveUp)
347-
defer m.Remove("retry")
348-
349-
// Do something in the container to access un-fetched spans, requiring network access
350-
commandNetworkStdout, _, err := sh.R("nerdctl", "exec", "test-container", "cat", "/etc/hosts")
351-
if err != nil {
352-
t.Fatalf("attempt to run task requiring network access failed: %s", err)
353-
}
354-
355-
// Wait with network disabled
356-
time.Sleep(time.Duration(tt.config.networkDisableMsec) * time.Millisecond)
322+
// There's no way to split stdout and stderr with LogMonitor,
323+
// and sh.R could orphan goroutines, so make new LogMonitors
324+
// for stdout and stderr and monitor them separately
325+
r := testutil.NewTestingReporter(t)
357326

358-
// Short wait to allow commands to complete
359-
time.Sleep(time.Duration(1000) * time.Millisecond)
327+
stdoutLogMonitor := testutil.NewLogMonitor(r, cmdStdout, strings.NewReader(""))
328+
stdoutLogMonitor.Add("listener", listener(successChannel))
329+
defer stdoutLogMonitor.Remove("listener")
360330

361-
// Confirm first command succeeded while network was down
362-
buf := make([]byte, 100)
363-
if _, err = commandSucceedStdout.Read(buf); err != nil {
364-
t.Fatalf("read from expected successful task output failed: %s", err)
365-
}
366-
if !strings.Contains(string(buf), "s ") { // `times` output looks like "0m0.03s 0m0.05s"
367-
t.Fatalf("expected successful task produced unexpected output: %s", string(buf))
368-
}
331+
stderrLogMonitor := testutil.NewLogMonitor(r, strings.NewReader(""), cmdStderr)
332+
stderrLogMonitor.Add("listener", listener(failureChannel))
333+
defer stderrLogMonitor.Remove("listener")
369334

370-
// async read from command_network_stdout
371-
commandNetworkStdoutChannel := make(chan []byte)
372-
commandNetworkErrChannel := make(chan error)
373-
go func() {
374-
defer close(commandNetworkStdoutChannel)
375-
defer close(commandNetworkErrChannel)
376-
var b []byte
377-
if _, err := commandNetworkStdout.Read(b); err != nil && err != io.EOF {
378-
commandNetworkErrChannel <- fmt.Errorf("read from network bound task output failed: %s", err)
379-
return
380-
}
381-
commandNetworkStdoutChannel <- b
382-
if err == io.EOF {
383-
commandNetworkErrChannel <- fmt.Errorf("read from network bound task output encountered EOF")
384-
return
385-
}
386-
}()
335+
// Wait with network disabled. At this point the snapshotter should be
336+
// retrying.
337+
time.Sleep(time.Duration(tt.config.networkDisableMsec) * time.Millisecond)
387338

339+
// Restore network access
388340
if tt.config.networkDisableMsec > 0 {
389-
390-
// Confirm second command has not succeeded while network was down
391-
select {
392-
case err := <-commandNetworkErrChannel:
393-
t.Fatal(err)
394-
case data := <-commandNetworkStdoutChannel:
395-
if len(data) > 0 {
396-
t.Fatalf("network bound task produced unexpected output: %s", string(data))
397-
}
398-
case <-time.After(100 * time.Millisecond):
399-
}
400-
401341
// Restore access to the registry and image
402342
sh.X("iptables", "-D", "OUTPUT", "-d", registryHostIP, "-j", "DROP")
403-
404343
// Wait with network enabled, so a final retry has a chance to succeed
405344
time.Sleep(2 * time.Millisecond * time.Duration(math.Min(
406345
float64(tt.config.maxWaitMsec),
407346
math.Pow(2, float64(tt.config.maxRetries))*float64(tt.config.minWaitMsec),
408347
)))
409348
}
410349

411-
// Confirm whether second command has succeeded with network restored
412-
413350
select {
414-
case gaveUp := <-gaveUpChannel:
415-
if tt.config.expectedSuccess && gaveUp {
416-
t.Fatal("retries gave up despite test expecting retry success")
351+
case data := <-failureChannel:
352+
if tt.config.expectedSuccess {
353+
t.Fatalf("expected Read request to succeed; got data in stderr: %s", data)
354+
} else if len(successChannel) > 0 {
355+
t.Fatalf("expected Read request to fail; got data in (stdout, stderr) : (%s, %s)", data, <-successChannel)
417356
}
418-
case data := <-commandNetworkStdoutChannel:
357+
case data := <-successChannel:
419358
if !tt.config.expectedSuccess {
420-
if len(data) > 0 {
421-
t.Fatalf("network bound task produced unexpected output: %s", string(data))
422-
}
423-
}
424-
case <-time.After(100 * time.Millisecond):
425-
if tt.config.expectedSuccess {
426-
t.Fatal("network bound task produced no output when expecting success")
359+
t.Fatalf("expected Read request to fail; got data in stdout: %s", data)
360+
} else if len(failureChannel) > 0 {
361+
t.Fatalf("expected Read request to succeed; got data in (stdout, stderr) : (%s, %s)", data, <-failureChannel)
427362
}
363+
case <-time.After(15 * time.Second):
364+
t.Fatal("neither stdout or stderr has been written to")
428365
}
429366
})
430367
}

0 commit comments

Comments
 (0)