Skip to content

Commit f11d0f6

Browse files
authored
Merge pull request #642 from kzys/reduce-vms
Separate TestMultipleVMs_Isolated from other tests
2 parents 956c44a + 96f4ac2 commit f11d0f6

File tree

2 files changed

+114
-28
lines changed

2 files changed

+114
-28
lines changed

.buildkite/pipeline.yml

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,24 @@ steps:
7373
- make test-in-docker
7474
timeout_in_minutes: 10
7575

76-
- label: ":rotating_light: :running_shirt_with_sash: runtime isolated tests"
76+
- label: ":running: runtime isolated tests"
77+
agents:
78+
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
79+
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
80+
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
81+
env:
82+
DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
83+
NUMBER_OF_VMS: 10
84+
EXTRAGOARGS: "-v -count=1 -race"
85+
FICD_DM_VOLUME_GROUP: fcci-vg
86+
artifact_paths:
87+
- "runtime/logs/*"
88+
command:
89+
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime
90+
91+
- label: ":weight_lifter: stress tests"
92+
concurrency_group: stress
93+
concurrency: 1
7794
agents:
7895
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
7996
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
@@ -86,7 +103,7 @@ steps:
86103
artifact_paths:
87104
- "runtime/logs/*"
88105
command:
89-
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime
106+
- make -C runtime integ-test-TestMultipleVMs_Isolated FICD_DM_POOL=stress$BUILDKITE_BUILD_NUMBER
90107

91108
- label: ":rotating_light: :exclamation: example tests"
92109
agents:

runtime/service_integ_test.go

Lines changed: 95 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -242,22 +242,55 @@ func createTapDevice(ctx context.Context, tapName string) error {
242242
func TestMultipleVMs_Isolated(t *testing.T) {
243243
integtest.Prepare(t)
244244

245-
// This test starts multiple VMs and some may hit firecracker-containerd's
246-
// default timeout. So overriding the timeout to wait longer.
247-
// One hour should be enough to start a VM, regardless of the load of
248-
// the underlying host.
249-
const createVMTimeout = time.Hour
250-
251-
netns, err := ns.GetCurrentNS()
252-
require.NoError(t, err, "failed to get a namespace")
245+
var err error
253246

254247
// numberOfVmsEnvName names the NUMBER_OF_VMS environment variable, which is configurable from Buildkite.
255248
numberOfVms := defaultNumberOfVms
256249
if str := os.Getenv(numberOfVmsEnvName); str != "" {
257250
numberOfVms, err = strconv.Atoi(str)
258251
require.NoError(t, err, "failed to get NUMBER_OF_VMS env")
259252
}
260-
t.Logf("TestMultipleVMs_Isolated: will run %d vm's", numberOfVms)
253+
t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms)
254+
255+
// We should be able to run 10 VMs without any issues.
256+
if numberOfVms <= 10 {
257+
testMultipleVMs(t, 10)
258+
return
259+
}
260+
261+
// We have issues running 100 VMs (see #581).
262+
// Incrementally increase the number of VMs to find the breaking point.
263+
for i := 10; i <= numberOfVms; i += 10 {
264+
success := t.Run(fmt.Sprintf("VMs=%d", i), func(t *testing.T) {
265+
testMultipleVMs(t, i)
266+
})
267+
if !success {
268+
// If running N VMs doesn't work, there is no point in going further.
269+
return
270+
}
271+
}
272+
}
273+
274+
type Event int
275+
276+
const (
277+
Created Event = iota
278+
Stopped
279+
)
280+
281+
func testMultipleVMs(t *testing.T, count int) {
282+
// This test starts multiple VMs and some may hit firecracker-containerd's
283+
// default timeout. So overriding the timeout to wait longer.
284+
// One hour should be enough to start a VM, regardless of the load of
285+
// the underlying host.
286+
const createVMTimeout = 1 * time.Hour
287+
288+
// Apparently writing a lot from Firecracker's serial console blocks VMs.
289+
// https://github.com/firecracker-microvm/firecracker/blob/v1.1.0/docs/prod-host-setup.md
290+
kernelArgs := integtest.DefaultRuntimeConfig.KernelArgs + " 8250.nr_uarts=0 quiet loglevel=1"
291+
292+
netns, err := ns.GetCurrentNS()
293+
require.NoError(t, err, "failed to get a namespace")
261294

262295
tapPrefix := os.Getenv(tapPrefixEnvName)
263296

@@ -278,6 +311,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
278311
},
279312
{
280313
MaxContainers: 3,
314+
281315
JailerConfig: &proto.JailerConfig{
282316
UID: 300000,
283317
GID: 300000,
@@ -299,39 +333,56 @@ func TestMultipleVMs_Isolated(t *testing.T) {
299333
cfg, err := config.LoadConfig("")
300334
require.NoError(t, err, "failed to load config")
301335

336+
eventCh := make(chan Event)
337+
338+
// Creating tap devices without goroutines somehow stabilizes this test.
339+
var devices []string
340+
341+
defer func() {
342+
for _, dev := range devices {
343+
err := deleteTapDevice(testCtx, dev)
344+
assert.NoError(t, err)
345+
}
346+
}()
347+
348+
for i := 0; i < count; i++ {
349+
tapName := fmt.Sprintf("%stap%d", tapPrefix, i)
350+
err := createTapDevice(testCtx, tapName)
351+
if err != nil {
352+
t.Errorf("failed to create %q: %s", tapName, err)
353+
return
354+
}
355+
devices = append(devices, tapName)
356+
}
357+
302358
// This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each
303359
// container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
304360
// print the mac address it sees inside its VM.
305361
vmEg, vmEgCtx := errgroup.WithContext(testCtx)
306-
for i := 0; i < numberOfVms; i++ {
362+
for i, device := range devices {
307363
caseTypeNumber := i % len(cases)
308364
vmID := i
365+
device := device
309366
c := cases[caseTypeNumber]
310367

311368
f := func(ctx context.Context) error {
312369
containerCount := c.MaxContainers
313370
jailerConfig := c.JailerConfig
314371

315-
tapName := fmt.Sprintf("%stap%d", tapPrefix, vmID)
316-
err := createTapDevice(ctx, tapName)
317-
if err != nil {
318-
return err
319-
}
320-
defer deleteTapDevice(ctx, tapName)
321-
322372
rootfsPath := cfg.RootDrive
323373

324374
vmIDStr := strconv.Itoa(vmID)
325375
req := &proto.CreateVMRequest{
326-
VMID: vmIDStr,
376+
KernelArgs: kernelArgs,
377+
VMID: vmIDStr,
327378
RootDrive: &proto.FirecrackerRootDrive{
328379
HostPath: rootfsPath,
329380
},
330381
NetworkInterfaces: []*proto.FirecrackerNetworkInterface{
331382
{
332383
AllowMMDS: true,
333384
StaticConfig: &proto.StaticNetworkConfiguration{
334-
HostDevName: tapName,
385+
HostDevName: device,
335386
MacAddress: vmIDtoMacAddr(uint(vmID)),
336387
},
337388
},
@@ -349,6 +400,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
349400
if err != nil {
350401
return err
351402
}
403+
defer fcClient.Close()
352404

353405
resp, createVMErr := fcClient.CreateVM(ctx, req)
354406
if createVMErr != nil {
@@ -365,6 +417,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
365417
createVMErr,
366418
)
367419
}
420+
eventCh <- Created
368421

369422
containerEg, containerCtx := errgroup.WithContext(vmEgCtx)
370423
for containerID := 0; containerID < int(containerCount); containerID++ {
@@ -425,10 +478,8 @@ func TestMultipleVMs_Isolated(t *testing.T) {
425478
}
426479

427480
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5})
428-
if err != nil {
429-
return err
430-
}
431-
return nil
481+
eventCh <- Stopped
482+
return err
432483
}
433484

434485
vmEg.Go(func() error {
@@ -440,8 +491,26 @@ func TestMultipleVMs_Isolated(t *testing.T) {
440491
})
441492
}
442493

443-
err = vmEg.Wait()
444-
require.NoError(t, err)
494+
ticker := time.NewTicker(10 * time.Second)
495+
defer ticker.Stop()
496+
497+
var created int
498+
for stopped := 0; stopped < count; {
499+
select {
500+
case <-vmEgCtx.Done():
501+
require.NoError(t, vmEg.Wait())
502+
return
503+
case e := <-eventCh:
504+
switch e {
505+
case Created:
506+
created++
507+
case Stopped:
508+
stopped++
509+
}
510+
case <-ticker.C:
511+
t.Logf("created=%d/%d stopped=%d/%d", created, count, stopped, count)
512+
}
513+
}
445514
}
446515

447516
func testMultipleExecs(
@@ -478,7 +547,7 @@ func testMultipleExecs(
478547
if err != nil {
479548
return err
480549
}
481-
defer newContainer.Delete(ctx)
550+
defer newContainer.Delete(ctx, containerd.WithSnapshotCleanup)
482551

483552
var taskStdout bytes.Buffer
484553
var taskStderr bytes.Buffer

0 commit comments

Comments
 (0)