@@ -35,14 +35,17 @@ import (
35
35
. "github.com/containerd/containerd"
36
36
"github.com/containerd/containerd/cio"
37
37
"github.com/containerd/containerd/containers"
38
+ "github.com/containerd/containerd/integration/failpoint"
38
39
"github.com/containerd/containerd/oci"
40
+ "github.com/containerd/containerd/pkg/fifosync"
39
41
"github.com/containerd/containerd/plugin"
40
42
"github.com/containerd/containerd/runtime/linux/runctypes"
41
43
"github.com/containerd/containerd/runtime/v2/runc/options"
42
44
"github.com/containerd/containerd/sys"
43
45
"github.com/containerd/errdefs"
44
46
45
47
"github.com/opencontainers/runtime-spec/specs-go"
48
+ "github.com/stretchr/testify/assert"
46
49
"github.com/stretchr/testify/require"
47
50
exec "golang.org/x/sys/execabs"
48
51
"golang.org/x/sys/unix"
@@ -1591,3 +1594,210 @@ func TestIssue9103(t *testing.T) {
1591
1594
})
1592
1595
}
1593
1596
}
1597
+
1598
+ // TestIssue10589 is used as regression case for issue 10589.
1599
+ //
1600
+ // This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1601
+ // by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1602
+ // allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1603
+ //
1604
+ // The workflow is as follows:
1605
+ // 1. Create a container as normal
1606
+ // 2. Make an exec1 using runc-fp with delayexec
1607
+ // 3. Wait until the exec is waiting to start (triggered by delayexec)
1608
+ // 4. Kill the container init process (signalling it is easiest)
1609
+ // 5. Make an exec2 using runc-fp with delayexec
1610
+ // 6. Wait until the exec is waiting to start
1611
+ // 7. Allow exec1 to proceed
1612
+ // 8. Allow exec2 to proceed
1613
+ // 9. See that the container has exited and all execs have exited too
1614
+ //
1615
+ // https://github.com/containerd/containerd/issues/10589
1616
+ func TestIssue10589 (t * testing.T ) {
1617
+ if f := os .Getenv ("RUNC_FLAVOR" ); f != "" && f != "runc" {
1618
+ t .Skip ("test requires runc" )
1619
+ }
1620
+ if rt := os .Getenv ("TEST_RUNTIME" ); rt != "" && rt != plugin .RuntimeRuncV2 {
1621
+ t .Skip ("test requires io.containerd.runc.v2" )
1622
+ }
1623
+
1624
+ client , err := newClient (t , address )
1625
+ require .NoError (t , err )
1626
+ t .Cleanup (func () {
1627
+ client .Close ()
1628
+ })
1629
+
1630
+ var (
1631
+ image Image
1632
+ ctx , cancel = testContext (t )
1633
+ id = t .Name ()
1634
+ )
1635
+ t .Cleanup (cancel )
1636
+
1637
+ image , err = client .GetImage (ctx , testImage )
1638
+ require .NoError (t , err )
1639
+
1640
+ // 1. Create a sleeping container
1641
+ t .Log ("1. Create a sleeping container" )
1642
+ container , err := client .NewContainer (ctx , id ,
1643
+ WithNewSnapshot (id , image ),
1644
+ WithNewSpec (oci .WithImageConfig (image ),
1645
+ withProcessArgs ("sleep" , "inf" ),
1646
+ oci .WithAnnotations (map [string ]string {
1647
+ "oci.runc.failpoint.profile" : "delayExec" ,
1648
+ }),
1649
+ ),
1650
+ WithRuntime (client .Runtime (), & options.Options {
1651
+ BinaryName : "runc-fp" ,
1652
+ }),
1653
+ )
1654
+ require .NoError (t , err , "create container" )
1655
+ t .Cleanup (func () {
1656
+ ctx , cancel := context .WithTimeout (ctx , 10 * time .Second )
1657
+ err := container .Delete (ctx , WithSnapshotCleanup )
1658
+ if err != nil {
1659
+ t .Log ("delete err" , err )
1660
+ }
1661
+ cancel ()
1662
+ })
1663
+
1664
+ task , err := container .NewTask (ctx , empty ())
1665
+ require .NoError (t , err , "create task" )
1666
+ t .Cleanup (func () {
1667
+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Second )
1668
+ st , err := task .Delete (ctx , WithProcessKill )
1669
+ t .Log ("exit status" , st )
1670
+ if err != nil {
1671
+ t .Log ("kill err" , err )
1672
+ }
1673
+ cancel ()
1674
+ })
1675
+
1676
+ err = task .Start (ctx )
1677
+ require .NoError (t , err , "start container" )
1678
+
1679
+ status , err := task .Status (ctx )
1680
+ require .NoError (t , err , "container status" )
1681
+ require .Equal (t , Running , status .Status )
1682
+
1683
+ // 2. Create an exec
1684
+ t .Log ("2. Create exec1" )
1685
+ exec1ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec1-ready.fifo" ), 0600 )
1686
+ require .NoError (t , err , "create exec1 ready fifo" )
1687
+ exec1DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec1-delay.fifo" ), 0600 )
1688
+ require .NoError (t , err , "create exec1 delay fifo" )
1689
+ exec1 , err := task .Exec (ctx , "exec1" , & specs.Process {
1690
+ Args : []string {"/bin/sleep" , "301" },
1691
+ Cwd : "/" ,
1692
+ Env : []string {
1693
+ failpoint .DelayExecReadyEnv + "=" + exec1ReadyFifo .Name (),
1694
+ failpoint .DelayExecDelayEnv + "=" + exec1DelayFifo .Name (),
1695
+ },
1696
+ }, cio .NullIO )
1697
+ require .NoError (t , err , "create exec1" )
1698
+
1699
+ exec1done := make (chan struct {})
1700
+ go func () {
1701
+ defer close (exec1done )
1702
+ t .Log ("Starting exec1" )
1703
+ err := exec1 .Start (ctx )
1704
+ assert .Error (t , err , "start exec1" )
1705
+ t .Logf ("error starting exec1: %s" , err )
1706
+ }()
1707
+
1708
+ // 3. Wait until the exec is waiting to start
1709
+ t .Log ("3. Wait until exec1 is waiting to start" )
1710
+ err = exec1ReadyFifo .Wait ()
1711
+ require .NoError (t , err , "open exec1 fifo" )
1712
+
1713
+ // 4. Kill the container init process
1714
+ t .Log ("4. Kill the container init process" )
1715
+ target := task .Pid ()
1716
+ t .Logf ("Killing main pid (%v) of container %s" , target , container .ID ())
1717
+ syscall .Kill (int (target ), syscall .SIGKILL )
1718
+ status , err = task .Status (ctx )
1719
+ require .NoError (t , err , "container status" )
1720
+ t .Log ("container status" , status .Status )
1721
+
1722
+ // 5. Make an exec (2) using this failpoint
1723
+ t .Log ("5. Create exec2" )
1724
+ exec2ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec2-ready.fifo" ), 0600 )
1725
+ require .NoError (t , err , "create exec2 ready fifo: %q" , exec2ReadyFifo )
1726
+ exec2DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec2-delay.fifo" ), 0600 )
1727
+ require .NoError (t , err , "create exec2 delay fifo: %q" , exec2DelayFifo )
1728
+ exec2 , err := task .Exec (ctx , "exec2" , & specs.Process {
1729
+ Args : []string {"/bin/sleep" , "302" },
1730
+ Cwd : "/" ,
1731
+ Env : []string {
1732
+ failpoint .DelayExecReadyEnv + "=" + exec2ReadyFifo .Name (),
1733
+ failpoint .DelayExecDelayEnv + "=" + exec2DelayFifo .Name (),
1734
+ },
1735
+ }, cio .NullIO )
1736
+ require .NoError (t , err , "create exec2" )
1737
+
1738
+ exec2done := make (chan struct {})
1739
+ didExec2Run := true
1740
+ go func () {
1741
+ defer close (exec2done )
1742
+ t .Log ("Starting exec2" )
1743
+ err := exec2 .Start (ctx )
1744
+ assert .Error (t , err , "start exec2" )
1745
+ t .Logf ("error starting exec2: %s" , err )
1746
+ }()
1747
+
1748
+ // 6. Wait until the exec is waiting to start
1749
+ t .Log ("6. Wait until exec2 is waiting to start" )
1750
+ exec2ready := make (chan struct {})
1751
+ go func () {
1752
+ exec2ReadyFifo .Wait ()
1753
+ close (exec2ready )
1754
+ }()
1755
+ select {
1756
+ case <- exec2ready :
1757
+ case <- exec2done :
1758
+ didExec2Run = false
1759
+ }
1760
+
1761
+ // 7. Allow exec=1 to proceed
1762
+ t .Log ("7. Allow exec=1 to proceed" )
1763
+ err = exec1DelayFifo .Trigger ()
1764
+ assert .NoError (t , err , "trigger exec1 fifo" )
1765
+ status , err = task .Status (ctx )
1766
+ require .NoError (t , err , "container status" )
1767
+ t .Log ("container status" , status .Status )
1768
+ <- exec1done
1769
+ status , err = task .Status (ctx )
1770
+ require .NoError (t , err , "container status" )
1771
+ t .Log ("container status" , status .Status )
1772
+
1773
+ // 8. Allow exec=2 to proceed
1774
+ if didExec2Run {
1775
+ t .Log ("8. Allow exec2 to proceed" )
1776
+ err = exec2DelayFifo .Trigger ()
1777
+ assert .NoError (t , err , "trigger exec2 fifo" )
1778
+ status , err = task .Status (ctx )
1779
+ require .NoError (t , err , "container status" )
1780
+ t .Log ("container status" , status .Status )
1781
+ <- exec2done
1782
+ status , err = task .Status (ctx )
1783
+ require .NoError (t , err , "container status" )
1784
+ t .Log ("container status" , status .Status )
1785
+ } else {
1786
+ t .Log ("8. Skip exec2" )
1787
+ }
1788
+
1789
+ // 9. Validate
1790
+ t .Log ("9. Validate" )
1791
+ status , err = exec1 .Status (ctx )
1792
+ require .NoError (t , err , "exec1 status" )
1793
+ t .Logf ("exec1 status: %s" , status .Status )
1794
+ assert .Equal (t , Created , status .Status )
1795
+ status , err = exec2 .Status (ctx )
1796
+ require .NoError (t , err , "exec2 status" )
1797
+ t .Logf ("exec2 status: %s" , status .Status )
1798
+ assert .Equal (t , Created , status .Status )
1799
+ status , err = task .Status (ctx )
1800
+ t .Logf ("task status: %s" , status .Status )
1801
+ require .NoError (t , err , "container status" )
1802
+ assert .Equal (t , Stopped , status .Status )
1803
+ }
0 commit comments