@@ -37,14 +37,17 @@ import (
37
37
. "github.com/containerd/containerd"
38
38
"github.com/containerd/containerd/cio"
39
39
"github.com/containerd/containerd/containers"
40
+ "github.com/containerd/containerd/integration/failpoint"
40
41
"github.com/containerd/containerd/oci"
42
+ "github.com/containerd/containerd/pkg/fifosync"
41
43
"github.com/containerd/containerd/plugin"
42
44
"github.com/containerd/containerd/runtime/linux/runctypes"
43
45
"github.com/containerd/containerd/runtime/v2/runc/options"
44
46
"github.com/containerd/containerd/sys"
45
47
"github.com/containerd/errdefs"
46
48
47
49
"github.com/opencontainers/runtime-spec/specs-go"
50
+ "github.com/stretchr/testify/assert"
48
51
"github.com/stretchr/testify/require"
49
52
"golang.org/x/sys/unix"
50
53
)
@@ -1595,3 +1598,210 @@ func TestIssue9103(t *testing.T) {
1595
1598
})
1596
1599
}
1597
1600
}
1601
+
1602
+ // TestIssue10589 is used as regression case for issue 10589.
1603
+ //
1604
+ // This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1605
+ // by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1606
+ // allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1607
+ //
1608
+ // The workflow is as follows:
1609
+ // 1. Create a container as normal
1610
+ // 2. Make an exec1 using runc-fp with delayexec
1611
+ // 3. Wait until the exec is waiting to start (triggered by delayexec)
1612
+ // 4. Kill the container init process (signalling it is easiest)
1613
+ // 5. Make an exec2 using runc-fp with delayexec
1614
+ // 6. Wait until the exec is waiting to start
1615
+ // 7. Allow exec1 to proceed
1616
+ // 8. Allow exec2 to proceed
1617
+ // 9. See that the container has exited and all execs have exited too
1618
+ //
1619
+ // https://github.com/containerd/containerd/issues/10589
1620
+ func TestIssue10589 (t * testing.T ) {
1621
+ if f := os .Getenv ("RUNC_FLAVOR" ); f != "" && f != "runc" {
1622
+ t .Skip ("test requires runc" )
1623
+ }
1624
+ if rt := os .Getenv ("TEST_RUNTIME" ); rt != "" && rt != plugin .RuntimeRuncV2 {
1625
+ t .Skip ("test requires io.containerd.runc.v2" )
1626
+ }
1627
+
1628
+ client , err := newClient (t , address )
1629
+ require .NoError (t , err )
1630
+ t .Cleanup (func () {
1631
+ client .Close ()
1632
+ })
1633
+
1634
+ var (
1635
+ image Image
1636
+ ctx , cancel = testContext (t )
1637
+ id = t .Name ()
1638
+ )
1639
+ t .Cleanup (cancel )
1640
+
1641
+ image , err = client .GetImage (ctx , testImage )
1642
+ require .NoError (t , err )
1643
+
1644
+ // 1. Create a sleeping container
1645
+ t .Log ("1. Create a sleeping container" )
1646
+ container , err := client .NewContainer (ctx , id ,
1647
+ WithNewSnapshot (id , image ),
1648
+ WithNewSpec (oci .WithImageConfig (image ),
1649
+ withProcessArgs ("sleep" , "inf" ),
1650
+ oci .WithAnnotations (map [string ]string {
1651
+ "oci.runc.failpoint.profile" : "delayExec" ,
1652
+ }),
1653
+ ),
1654
+ WithRuntime (client .Runtime (), & options.Options {
1655
+ BinaryName : "runc-fp" ,
1656
+ }),
1657
+ )
1658
+ require .NoError (t , err , "create container" )
1659
+ t .Cleanup (func () {
1660
+ ctx , cancel := context .WithTimeout (ctx , 10 * time .Second )
1661
+ err := container .Delete (ctx , WithSnapshotCleanup )
1662
+ if err != nil {
1663
+ t .Log ("delete err" , err )
1664
+ }
1665
+ cancel ()
1666
+ })
1667
+
1668
+ task , err := container .NewTask (ctx , empty ())
1669
+ require .NoError (t , err , "create task" )
1670
+ t .Cleanup (func () {
1671
+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Second )
1672
+ st , err := task .Delete (ctx , WithProcessKill )
1673
+ t .Log ("exit status" , st )
1674
+ if err != nil {
1675
+ t .Log ("kill err" , err )
1676
+ }
1677
+ cancel ()
1678
+ })
1679
+
1680
+ err = task .Start (ctx )
1681
+ require .NoError (t , err , "start container" )
1682
+
1683
+ status , err := task .Status (ctx )
1684
+ require .NoError (t , err , "container status" )
1685
+ require .Equal (t , Running , status .Status )
1686
+
1687
+ // 2. Create an exec
1688
+ t .Log ("2. Create exec1" )
1689
+ exec1ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec1-ready.fifo" ), 0600 )
1690
+ require .NoError (t , err , "create exec1 ready fifo" )
1691
+ exec1DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec1-delay.fifo" ), 0600 )
1692
+ require .NoError (t , err , "create exec1 delay fifo" )
1693
+ exec1 , err := task .Exec (ctx , "exec1" , & specs.Process {
1694
+ Args : []string {"/bin/sleep" , "301" },
1695
+ Cwd : "/" ,
1696
+ Env : []string {
1697
+ failpoint .DelayExecReadyEnv + "=" + exec1ReadyFifo .Name (),
1698
+ failpoint .DelayExecDelayEnv + "=" + exec1DelayFifo .Name (),
1699
+ },
1700
+ }, cio .NullIO )
1701
+ require .NoError (t , err , "create exec1" )
1702
+
1703
+ exec1done := make (chan struct {})
1704
+ go func () {
1705
+ defer close (exec1done )
1706
+ t .Log ("Starting exec1" )
1707
+ err := exec1 .Start (ctx )
1708
+ assert .Error (t , err , "start exec1" )
1709
+ t .Logf ("error starting exec1: %s" , err )
1710
+ }()
1711
+
1712
+ // 3. Wait until the exec is waiting to start
1713
+ t .Log ("3. Wait until exec1 is waiting to start" )
1714
+ err = exec1ReadyFifo .Wait ()
1715
+ require .NoError (t , err , "open exec1 fifo" )
1716
+
1717
+ // 4. Kill the container init process
1718
+ t .Log ("4. Kill the container init process" )
1719
+ target := task .Pid ()
1720
+ t .Logf ("Killing main pid (%v) of container %s" , target , container .ID ())
1721
+ syscall .Kill (int (target ), syscall .SIGKILL )
1722
+ status , err = task .Status (ctx )
1723
+ require .NoError (t , err , "container status" )
1724
+ t .Log ("container status" , status .Status )
1725
+
1726
+ // 5. Make an exec (2) using this failpoint
1727
+ t .Log ("5. Create exec2" )
1728
+ exec2ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec2-ready.fifo" ), 0600 )
1729
+ require .NoError (t , err , "create exec2 ready fifo: %q" , exec2ReadyFifo )
1730
+ exec2DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec2-delay.fifo" ), 0600 )
1731
+ require .NoError (t , err , "create exec2 delay fifo: %q" , exec2DelayFifo )
1732
+ exec2 , err := task .Exec (ctx , "exec2" , & specs.Process {
1733
+ Args : []string {"/bin/sleep" , "302" },
1734
+ Cwd : "/" ,
1735
+ Env : []string {
1736
+ failpoint .DelayExecReadyEnv + "=" + exec2ReadyFifo .Name (),
1737
+ failpoint .DelayExecDelayEnv + "=" + exec2DelayFifo .Name (),
1738
+ },
1739
+ }, cio .NullIO )
1740
+ require .NoError (t , err , "create exec2" )
1741
+
1742
+ exec2done := make (chan struct {})
1743
+ didExec2Run := true
1744
+ go func () {
1745
+ defer close (exec2done )
1746
+ t .Log ("Starting exec2" )
1747
+ err := exec2 .Start (ctx )
1748
+ assert .Error (t , err , "start exec2" )
1749
+ t .Logf ("error starting exec2: %s" , err )
1750
+ }()
1751
+
1752
+ // 6. Wait until the exec is waiting to start
1753
+ t .Log ("6. Wait until exec2 is waiting to start" )
1754
+ exec2ready := make (chan struct {})
1755
+ go func () {
1756
+ exec2ReadyFifo .Wait ()
1757
+ close (exec2ready )
1758
+ }()
1759
+ select {
1760
+ case <- exec2ready :
1761
+ case <- exec2done :
1762
+ didExec2Run = false
1763
+ }
1764
+
1765
+ // 7. Allow exec=1 to proceed
1766
+ t .Log ("7. Allow exec=1 to proceed" )
1767
+ err = exec1DelayFifo .Trigger ()
1768
+ assert .NoError (t , err , "trigger exec1 fifo" )
1769
+ status , err = task .Status (ctx )
1770
+ require .NoError (t , err , "container status" )
1771
+ t .Log ("container status" , status .Status )
1772
+ <- exec1done
1773
+ status , err = task .Status (ctx )
1774
+ require .NoError (t , err , "container status" )
1775
+ t .Log ("container status" , status .Status )
1776
+
1777
+ // 8. Allow exec=2 to proceed
1778
+ if didExec2Run {
1779
+ t .Log ("8. Allow exec2 to proceed" )
1780
+ err = exec2DelayFifo .Trigger ()
1781
+ assert .NoError (t , err , "trigger exec2 fifo" )
1782
+ status , err = task .Status (ctx )
1783
+ require .NoError (t , err , "container status" )
1784
+ t .Log ("container status" , status .Status )
1785
+ <- exec2done
1786
+ status , err = task .Status (ctx )
1787
+ require .NoError (t , err , "container status" )
1788
+ t .Log ("container status" , status .Status )
1789
+ } else {
1790
+ t .Log ("8. Skip exec2" )
1791
+ }
1792
+
1793
+ // 9. Validate
1794
+ t .Log ("9. Validate" )
1795
+ status , err = exec1 .Status (ctx )
1796
+ require .NoError (t , err , "exec1 status" )
1797
+ t .Logf ("exec1 status: %s" , status .Status )
1798
+ assert .Equal (t , Created , status .Status )
1799
+ status , err = exec2 .Status (ctx )
1800
+ require .NoError (t , err , "exec2 status" )
1801
+ t .Logf ("exec2 status: %s" , status .Status )
1802
+ assert .Equal (t , Created , status .Status )
1803
+ status , err = task .Status (ctx )
1804
+ t .Logf ("task status: %s" , status .Status )
1805
+ require .NoError (t , err , "container status" )
1806
+ assert .Equal (t , Stopped , status .Status )
1807
+ }
0 commit comments