Skip to content

Commit 96f4ac2

Browse files
committed
Separate TestMultipleVMs_Isolated from other tests
This commit separates TestMultipleVMs_Isolated's 100-VM run from the other tests by - changing the number of VMs from 100 to 10 in the runtime tests - keeping the 100-VM run as a separate job with its own concurrency group. In addition, to figure out the breaking point, the test now increases the number of VMs gradually instead of running 100 VMs from the beginning. Signed-off-by: Kazuyoshi Kato <[email protected]>
1 parent 956c44a commit 96f4ac2

File tree

2 files changed

+114
-28
lines changed

2 files changed

+114
-28
lines changed

.buildkite/pipeline.yml

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,24 @@ steps:
7373
- make test-in-docker
7474
timeout_in_minutes: 10
7575

76-
- label: ":rotating_light: :running_shirt_with_sash: runtime isolated tests"
76+
- label: ":running: runtime isolated tests"
77+
agents:
78+
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
79+
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
80+
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
81+
env:
82+
DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
83+
NUMBER_OF_VMS: 10
84+
EXTRAGOARGS: "-v -count=1 -race"
85+
FICD_DM_VOLUME_GROUP: fcci-vg
86+
artifact_paths:
87+
- "runtime/logs/*"
88+
command:
89+
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime
90+
91+
- label: ":weight_lifter: stress tests"
92+
concurrency_group: stress
93+
concurrency: 1
7794
agents:
7895
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
7996
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
@@ -86,7 +103,7 @@ steps:
86103
artifact_paths:
87104
- "runtime/logs/*"
88105
command:
89-
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime
106+
- make -C runtime integ-test-TestMultipleVMs_Isolated FICD_DM_POOL=stress$BUILDKITE_BUILD_NUMBER
90107

91108
- label: ":rotating_light: :exclamation: example tests"
92109
agents:

runtime/service_integ_test.go

Lines changed: 95 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -242,22 +242,55 @@ func createTapDevice(ctx context.Context, tapName string) error {
242242
func TestMultipleVMs_Isolated(t *testing.T) {
243243
integtest.Prepare(t)
244244

245-
// This test starts multiple VMs and some may hit firecracker-containerd's
246-
// default timeout. So overriding the timeout to wait longer.
247-
// One hour should be enough to start a VM, regardless of the load of
248-
// the underlying host.
249-
const createVMTimeout = time.Hour
250-
251-
netns, err := ns.GetCurrentNS()
252-
require.NoError(t, err, "failed to get a namespace")
245+
var err error
253246

254247
// numberOfVmsEnvName is the NUMBER_OF_VMS environment variable, which is configurable from Buildkite.
255248
numberOfVms := defaultNumberOfVms
256249
if str := os.Getenv(numberOfVmsEnvName); str != "" {
257250
numberOfVms, err = strconv.Atoi(str)
258251
require.NoError(t, err, "failed to get NUMBER_OF_VMS env")
259252
}
260-
t.Logf("TestMultipleVMs_Isolated: will run %d vm's", numberOfVms)
253+
t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms)
254+
255+
// We should be able to run 10 VMs without any issues.
256+
if numberOfVms <= 10 {
257+
testMultipleVMs(t, 10)
258+
return
259+
}
260+
261+
// We have issues running 100 VMs (see #581).
262+
// Incrementally increase the number of VMs to find the breaking point.
263+
for i := 10; i <= numberOfVms; i += 10 {
264+
success := t.Run(fmt.Sprintf("VMs=%d", i), func(t *testing.T) {
265+
testMultipleVMs(t, i)
266+
})
267+
if !success {
268+
// If running N VMs doesn't work, there is no point in going further.
269+
return
270+
}
271+
}
272+
}
273+
274+
type Event int
275+
276+
const (
277+
Created Event = iota
278+
Stopped
279+
)
280+
281+
func testMultipleVMs(t *testing.T, count int) {
282+
// This test starts multiple VMs and some may hit firecracker-containerd's
283+
// default timeout. So overriding the timeout to wait longer.
284+
// One hour should be enough to start a VM, regardless of the load of
285+
// the underlying host.
286+
const createVMTimeout = 1 * time.Hour
287+
288+
// Apparently writing a lot from Firecracker's serial console blocks VMs.
289+
// https://github.com/firecracker-microvm/firecracker/blob/v1.1.0/docs/prod-host-setup.md
290+
kernelArgs := integtest.DefaultRuntimeConfig.KernelArgs + " 8250.nr_uarts=0 quiet loglevel=1"
291+
292+
netns, err := ns.GetCurrentNS()
293+
require.NoError(t, err, "failed to get a namespace")
261294

262295
tapPrefix := os.Getenv(tapPrefixEnvName)
263296

@@ -278,6 +311,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
278311
},
279312
{
280313
MaxContainers: 3,
314+
281315
JailerConfig: &proto.JailerConfig{
282316
UID: 300000,
283317
GID: 300000,
@@ -299,39 +333,56 @@ func TestMultipleVMs_Isolated(t *testing.T) {
299333
cfg, err := config.LoadConfig("")
300334
require.NoError(t, err, "failed to load config")
301335

336+
eventCh := make(chan Event)
337+
338+
// Creating tap devices without goroutines somehow stabilizes this test.
339+
var devices []string
340+
341+
defer func() {
342+
for _, dev := range devices {
343+
err := deleteTapDevice(testCtx, dev)
344+
assert.NoError(t, err)
345+
}
346+
}()
347+
348+
for i := 0; i < count; i++ {
349+
tapName := fmt.Sprintf("%stap%d", tapPrefix, i)
350+
err := createTapDevice(testCtx, tapName)
351+
if err != nil {
352+
t.Errorf("failed to create %q: %s", tapName, err)
353+
return
354+
}
355+
devices = append(devices, tapName)
356+
}
357+
302358
// This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each
303359
// container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
304360
// print the mac address it sees inside its VM.
305361
vmEg, vmEgCtx := errgroup.WithContext(testCtx)
306-
for i := 0; i < numberOfVms; i++ {
362+
for i, device := range devices {
307363
caseTypeNumber := i % len(cases)
308364
vmID := i
365+
device := device
309366
c := cases[caseTypeNumber]
310367

311368
f := func(ctx context.Context) error {
312369
containerCount := c.MaxContainers
313370
jailerConfig := c.JailerConfig
314371

315-
tapName := fmt.Sprintf("%stap%d", tapPrefix, vmID)
316-
err := createTapDevice(ctx, tapName)
317-
if err != nil {
318-
return err
319-
}
320-
defer deleteTapDevice(ctx, tapName)
321-
322372
rootfsPath := cfg.RootDrive
323373

324374
vmIDStr := strconv.Itoa(vmID)
325375
req := &proto.CreateVMRequest{
326-
VMID: vmIDStr,
376+
KernelArgs: kernelArgs,
377+
VMID: vmIDStr,
327378
RootDrive: &proto.FirecrackerRootDrive{
328379
HostPath: rootfsPath,
329380
},
330381
NetworkInterfaces: []*proto.FirecrackerNetworkInterface{
331382
{
332383
AllowMMDS: true,
333384
StaticConfig: &proto.StaticNetworkConfiguration{
334-
HostDevName: tapName,
385+
HostDevName: device,
335386
MacAddress: vmIDtoMacAddr(uint(vmID)),
336387
},
337388
},
@@ -349,6 +400,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
349400
if err != nil {
350401
return err
351402
}
403+
defer fcClient.Close()
352404

353405
resp, createVMErr := fcClient.CreateVM(ctx, req)
354406
if createVMErr != nil {
@@ -365,6 +417,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
365417
createVMErr,
366418
)
367419
}
420+
eventCh <- Created
368421

369422
containerEg, containerCtx := errgroup.WithContext(vmEgCtx)
370423
for containerID := 0; containerID < int(containerCount); containerID++ {
@@ -425,10 +478,8 @@ func TestMultipleVMs_Isolated(t *testing.T) {
425478
}
426479

427480
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5})
428-
if err != nil {
429-
return err
430-
}
431-
return nil
481+
eventCh <- Stopped
482+
return err
432483
}
433484

434485
vmEg.Go(func() error {
@@ -440,8 +491,26 @@ func TestMultipleVMs_Isolated(t *testing.T) {
440491
})
441492
}
442493

443-
err = vmEg.Wait()
444-
require.NoError(t, err)
494+
ticker := time.NewTicker(10 * time.Second)
495+
defer ticker.Stop()
496+
497+
var created int
498+
for stopped := 0; stopped < count; {
499+
select {
500+
case <-vmEgCtx.Done():
501+
require.NoError(t, vmEg.Wait())
502+
return
503+
case e := <-eventCh:
504+
switch e {
505+
case Created:
506+
created++
507+
case Stopped:
508+
stopped++
509+
}
510+
case <-ticker.C:
511+
t.Logf("created=%d/%d stopped=%d/%d", created, count, stopped, count)
512+
}
513+
}
445514
}
446515

447516
func testMultipleExecs(
@@ -478,7 +547,7 @@ func testMultipleExecs(
478547
if err != nil {
479548
return err
480549
}
481-
defer newContainer.Delete(ctx)
550+
defer newContainer.Delete(ctx, containerd.WithSnapshotCleanup)
482551

483552
var taskStdout bytes.Buffer
484553
var taskStderr bytes.Buffer

0 commit comments

Comments
 (0)