Skip to content

Commit 0c4ba21

Browse files
committed
integration: regression test for issue 10589
This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist. Relates to containerd#10589 Signed-off-by: Samuel Karp <[email protected]> (cherry picked from commit 18725f0) Signed-off-by: Samuel Karp <[email protected]>
1 parent 1cc2cfa commit 0c4ba21

File tree

4 files changed

+366
-1
lines changed

4 files changed

+366
-1
lines changed

integration/client/container_linux_test.go

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,17 @@ import (
3737
. "github.com/containerd/containerd"
3838
"github.com/containerd/containerd/cio"
3939
"github.com/containerd/containerd/containers"
40+
"github.com/containerd/containerd/integration/failpoint"
4041
"github.com/containerd/containerd/oci"
42+
"github.com/containerd/containerd/pkg/fifosync"
4143
"github.com/containerd/containerd/plugin"
4244
"github.com/containerd/containerd/runtime/linux/runctypes"
4345
"github.com/containerd/containerd/runtime/v2/runc/options"
4446
"github.com/containerd/containerd/sys"
4547
"github.com/containerd/errdefs"
4648

4749
"github.com/opencontainers/runtime-spec/specs-go"
50+
"github.com/stretchr/testify/assert"
4851
"github.com/stretchr/testify/require"
4952
"golang.org/x/sys/unix"
5053
)
@@ -1595,3 +1598,210 @@ func TestIssue9103(t *testing.T) {
15951598
})
15961599
}
15971600
}
1601+
1602+
// TestIssue10589 is used as regression case for issue 10589.
1603+
//
1604+
// This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1605+
// by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1606+
// allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1607+
//
1608+
// The workflow is as follows:
1609+
// 1. Create a container as normal
1610+
// 2. Make an exec1 using runc-fp with delayexec
1611+
// 3. Wait until the exec is waiting to start (triggered by delayexec)
1612+
// 4. Kill the container init process (signalling it is easiest)
1613+
// 5. Make an exec2 using runc-fp with delayexec
1614+
// 6. Wait until the exec is waiting to start
1615+
// 7. Allow exec1 to proceed
1616+
// 8. Allow exec2 to proceed
1617+
// 9. See that the container has exited and all execs have exited too
1618+
//
1619+
// https://github.com/containerd/containerd/issues/10589
1620+
func TestIssue10589(t *testing.T) {
1621+
if f := os.Getenv("RUNC_FLAVOR"); f != "" && f != "runc" {
1622+
t.Skip("test requires runc")
1623+
}
1624+
if rt := os.Getenv("TEST_RUNTIME"); rt != "" && rt != plugin.RuntimeRuncV2 {
1625+
t.Skip("test requires io.containerd.runc.v2")
1626+
}
1627+
1628+
client, err := newClient(t, address)
1629+
require.NoError(t, err)
1630+
t.Cleanup(func() {
1631+
client.Close()
1632+
})
1633+
1634+
var (
1635+
image Image
1636+
ctx, cancel = testContext(t)
1637+
id = t.Name()
1638+
)
1639+
t.Cleanup(cancel)
1640+
1641+
image, err = client.GetImage(ctx, testImage)
1642+
require.NoError(t, err)
1643+
1644+
// 1. Create a sleeping container
1645+
t.Log("1. Create a sleeping container")
1646+
container, err := client.NewContainer(ctx, id,
1647+
WithNewSnapshot(id, image),
1648+
WithNewSpec(oci.WithImageConfig(image),
1649+
withProcessArgs("sleep", "inf"),
1650+
oci.WithAnnotations(map[string]string{
1651+
"oci.runc.failpoint.profile": "delayExec",
1652+
}),
1653+
),
1654+
WithRuntime(client.Runtime(), &options.Options{
1655+
BinaryName: "runc-fp",
1656+
}),
1657+
)
1658+
require.NoError(t, err, "create container")
1659+
t.Cleanup(func() {
1660+
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
1661+
err := container.Delete(ctx, WithSnapshotCleanup)
1662+
if err != nil {
1663+
t.Log("delete err", err)
1664+
}
1665+
cancel()
1666+
})
1667+
1668+
task, err := container.NewTask(ctx, empty())
1669+
require.NoError(t, err, "create task")
1670+
t.Cleanup(func() {
1671+
ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
1672+
st, err := task.Delete(ctx, WithProcessKill)
1673+
t.Log("exit status", st)
1674+
if err != nil {
1675+
t.Log("kill err", err)
1676+
}
1677+
cancel()
1678+
})
1679+
1680+
err = task.Start(ctx)
1681+
require.NoError(t, err, "start container")
1682+
1683+
status, err := task.Status(ctx)
1684+
require.NoError(t, err, "container status")
1685+
require.Equal(t, Running, status.Status)
1686+
1687+
// 2. Create an exec
1688+
t.Log("2. Create exec1")
1689+
exec1ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec1-ready.fifo"), 0600)
1690+
require.NoError(t, err, "create exec1 ready fifo")
1691+
exec1DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec1-delay.fifo"), 0600)
1692+
require.NoError(t, err, "create exec1 delay fifo")
1693+
exec1, err := task.Exec(ctx, "exec1", &specs.Process{
1694+
Args: []string{"/bin/sleep", "301"},
1695+
Cwd: "/",
1696+
Env: []string{
1697+
failpoint.DelayExecReadyEnv + "=" + exec1ReadyFifo.Name(),
1698+
failpoint.DelayExecDelayEnv + "=" + exec1DelayFifo.Name(),
1699+
},
1700+
}, cio.NullIO)
1701+
require.NoError(t, err, "create exec1")
1702+
1703+
exec1done := make(chan struct{})
1704+
go func() {
1705+
defer close(exec1done)
1706+
t.Log("Starting exec1")
1707+
err := exec1.Start(ctx)
1708+
assert.Error(t, err, "start exec1")
1709+
t.Logf("error starting exec1: %s", err)
1710+
}()
1711+
1712+
// 3. Wait until the exec is waiting to start
1713+
t.Log("3. Wait until exec1 is waiting to start")
1714+
err = exec1ReadyFifo.Wait()
1715+
require.NoError(t, err, "open exec1 fifo")
1716+
1717+
// 4. Kill the container init process
1718+
t.Log("4. Kill the container init process")
1719+
target := task.Pid()
1720+
t.Logf("Killing main pid (%v) of container %s", target, container.ID())
1721+
syscall.Kill(int(target), syscall.SIGKILL)
1722+
status, err = task.Status(ctx)
1723+
require.NoError(t, err, "container status")
1724+
t.Log("container status", status.Status)
1725+
1726+
// 5. Make an exec (2) using this failpoint
1727+
t.Log("5. Create exec2")
1728+
exec2ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec2-ready.fifo"), 0600)
1729+
require.NoError(t, err, "create exec2 ready fifo: %q", exec2ReadyFifo)
1730+
exec2DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec2-delay.fifo"), 0600)
1731+
require.NoError(t, err, "create exec2 delay fifo: %q", exec2DelayFifo)
1732+
exec2, err := task.Exec(ctx, "exec2", &specs.Process{
1733+
Args: []string{"/bin/sleep", "302"},
1734+
Cwd: "/",
1735+
Env: []string{
1736+
failpoint.DelayExecReadyEnv + "=" + exec2ReadyFifo.Name(),
1737+
failpoint.DelayExecDelayEnv + "=" + exec2DelayFifo.Name(),
1738+
},
1739+
}, cio.NullIO)
1740+
require.NoError(t, err, "create exec2")
1741+
1742+
exec2done := make(chan struct{})
1743+
didExec2Run := true
1744+
go func() {
1745+
defer close(exec2done)
1746+
t.Log("Starting exec2")
1747+
err := exec2.Start(ctx)
1748+
assert.Error(t, err, "start exec2")
1749+
t.Logf("error starting exec2: %s", err)
1750+
}()
1751+
1752+
// 6. Wait until the exec is waiting to start
1753+
t.Log("6. Wait until exec2 is waiting to start")
1754+
exec2ready := make(chan struct{})
1755+
go func() {
1756+
exec2ReadyFifo.Wait()
1757+
close(exec2ready)
1758+
}()
1759+
select {
1760+
case <-exec2ready:
1761+
case <-exec2done:
1762+
didExec2Run = false
1763+
}
1764+
1765+
// 7. Allow exec=1 to proceed
1766+
t.Log("7. Allow exec=1 to proceed")
1767+
err = exec1DelayFifo.Trigger()
1768+
assert.NoError(t, err, "trigger exec1 fifo")
1769+
status, err = task.Status(ctx)
1770+
require.NoError(t, err, "container status")
1771+
t.Log("container status", status.Status)
1772+
<-exec1done
1773+
status, err = task.Status(ctx)
1774+
require.NoError(t, err, "container status")
1775+
t.Log("container status", status.Status)
1776+
1777+
// 8. Allow exec=2 to proceed
1778+
if didExec2Run {
1779+
t.Log("8. Allow exec2 to proceed")
1780+
err = exec2DelayFifo.Trigger()
1781+
assert.NoError(t, err, "trigger exec2 fifo")
1782+
status, err = task.Status(ctx)
1783+
require.NoError(t, err, "container status")
1784+
t.Log("container status", status.Status)
1785+
<-exec2done
1786+
status, err = task.Status(ctx)
1787+
require.NoError(t, err, "container status")
1788+
t.Log("container status", status.Status)
1789+
} else {
1790+
t.Log("8. Skip exec2")
1791+
}
1792+
1793+
// 9. Validate
1794+
t.Log("9. Validate")
1795+
status, err = exec1.Status(ctx)
1796+
require.NoError(t, err, "exec1 status")
1797+
t.Logf("exec1 status: %s", status.Status)
1798+
assert.Equal(t, Created, status.Status)
1799+
status, err = exec2.Status(ctx)
1800+
require.NoError(t, err, "exec2 status")
1801+
t.Logf("exec2 status: %s", status.Status)
1802+
assert.Equal(t, Created, status.Status)
1803+
status, err = task.Status(ctx)
1804+
t.Logf("task status: %s", status.Status)
1805+
require.NoError(t, err, "container status")
1806+
assert.Equal(t, Stopped, status.Status)
1807+
}
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
//go:build linux
2+
3+
/*
4+
Copyright The containerd Authors.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
*/
18+
19+
package main
20+
21+
import (
22+
"context"
23+
"encoding/json"
24+
"errors"
25+
"fmt"
26+
"io"
27+
"os"
28+
"strings"
29+
30+
"github.com/opencontainers/runtime-spec/specs-go"
31+
"github.com/sirupsen/logrus"
32+
33+
"github.com/containerd/containerd/integration/failpoint"
34+
"github.com/containerd/containerd/pkg/fifosync"
35+
)
36+
37+
// delayExec delays an "exec" command until a trigger is received from the calling test program. This can be used to
38+
// test races around container lifecycle and exec processes.
39+
func delayExec(ctx context.Context, method invoker) error {
40+
isExec := strings.Contains(strings.Join(os.Args, ","), ",exec,")
41+
if !isExec {
42+
if err := method(ctx); err != nil {
43+
return err
44+
}
45+
return nil
46+
}
47+
logrus.Debug("EXEC!")
48+
49+
if err := delay(); err != nil {
50+
return err
51+
}
52+
if err := method(ctx); err != nil {
53+
return err
54+
}
55+
return nil
56+
}
57+
58+
func delay() error {
59+
ready, delay, err := fifoFromProcessEnv()
60+
if err != nil {
61+
return err
62+
}
63+
if err := ready.Trigger(); err != nil {
64+
return err
65+
}
66+
return delay.Wait()
67+
}
68+
69+
// fifoFromProcessEnv finds a fifo specified in the environment variables of an exec process
70+
func fifoFromProcessEnv() (fifosync.Trigger, fifosync.Waiter, error) {
71+
env, err := processEnvironment()
72+
if err != nil {
73+
return nil, nil, err
74+
}
75+
76+
readyName, ok := env[failpoint.DelayExecReadyEnv]
77+
if !ok {
78+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecReadyEnv, env)
79+
}
80+
delayName, ok := env[failpoint.DelayExecDelayEnv]
81+
if !ok {
82+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecDelayEnv, env)
83+
}
84+
logrus.WithField("ready", readyName).WithField("delay", delayName).Debug("Found FIFOs!")
85+
readyFIFO, err := fifosync.NewTrigger(readyName, 0600)
86+
if err != nil {
87+
return nil, nil, err
88+
}
89+
delayFIFO, err := fifosync.NewWaiter(delayName, 0600)
90+
if err != nil {
91+
return nil, nil, err
92+
}
93+
return readyFIFO, delayFIFO, nil
94+
}
95+
96+
func processEnvironment() (map[string]string, error) {
97+
idx := 2
98+
for ; idx < len(os.Args); idx++ {
99+
if os.Args[idx] == "--process" {
100+
break
101+
}
102+
}
103+
104+
if idx >= len(os.Args)-1 || os.Args[idx] != "--process" {
105+
return nil, errors.New("env: option --process required")
106+
}
107+
108+
specFile := os.Args[idx+1]
109+
f, err := os.OpenFile(specFile, os.O_RDONLY, 0o644)
110+
if err != nil {
111+
return nil, fmt.Errorf("env: failed to open %s: %w", specFile, err)
112+
}
113+
114+
b, err := io.ReadAll(f)
115+
if err != nil {
116+
return nil, fmt.Errorf("env: failed to read spec from %q", specFile)
117+
}
118+
var spec specs.Process
119+
if err := json.Unmarshal(b, &spec); err != nil {
120+
return nil, fmt.Errorf("env: failed to unmarshal spec from %q: %w", specFile, err)
121+
}
122+
123+
// XXX: env vars can be specified multiple times, but we only keep one
124+
env := make(map[string]string)
125+
for _, e := range spec.Env {
126+
k, v, _ := strings.Cut(e, "=")
127+
env[k] = v
128+
}
129+
130+
return env, nil
131+
}

integration/failpoint/cmd/runc-fp/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ import (
2525
"os/exec"
2626
"syscall"
2727

28-
"github.com/containerd/containerd/oci"
2928
"github.com/sirupsen/logrus"
29+
30+
"github.com/containerd/containerd/oci"
3031
)
3132

3233
const (
@@ -40,6 +41,7 @@ type invokerInterceptor func(context.Context, invoker) error
4041
var (
4142
failpointProfiles = map[string]invokerInterceptor{
4243
"issue9103": issue9103KillInitAfterCreate,
44+
"delayExec": delayExec,
4345
}
4446
)
4547

0 commit comments

Comments
 (0)