@@ -2,10 +2,13 @@ package standalone
22
33import (
44 "context"
5+ "errors"
56 "fmt"
67 "os"
8+ "regexp"
79 "strconv"
810 "strings"
11+ "time"
912
1013 "github.com/docker/docker/api/types/container"
1114 "github.com/docker/docker/api/types/filters"
@@ -19,6 +22,11 @@ import (
1922// controllerContainerName is the name to use for the controller container.
// It is the name passed to ContainerCreate when the controller container is
// created, and it is included in the error reported when creation fails.
2023const controllerContainerName = "docker-model-runner"
2124
// concurrentInstallMatcher recognizes error messages which indicate that a
// concurrent standalone model runner installation is taking place. Its single
// capture group holds the ID of the conflicting container.
var concurrentInstallMatcher = regexp.MustCompile(
	`is already in use by container "([a-z0-9]+)"`,
)
29+
2230// FindControllerContainer searches for a running controller container. It
2331// returns the ID of the container (if found), the container name (if any), the
2432// full container summary (if found), or any error that occurred.
@@ -66,6 +74,28 @@ func determineBridgeGatewayIP(ctx context.Context, dockerClient *client.Client)
6674 return "" , nil
6775}
6876
77+ // waitForContainerToStart waits for a container to start.
78+ func waitForContainerToStart (ctx context.Context , dockerClient * client.Client , containerID string ) error {
79+ // Unfortunately the Docker API's /containers/{id}/wait API (and the
80+ // corresponding Client.ContainerWait method) don't allow waiting for
81+ // container startup, so instead we'll take a polling approach.
82+ for i := 5 ; i > 0 ; i -- {
83+ if status , err := dockerClient .ContainerInspect (ctx , containerID ); err != nil {
84+ return fmt .Errorf ("unable to inspect container (%s): %w" , containerID [:12 ], err )
85+ } else if status .State .Status == "running" {
86+ return nil
87+ }
88+ if i > 1 {
89+ select {
90+ case <- time .After (1 * time .Second ):
91+ case <- ctx .Done ():
92+ return errors .New ("waiting cancelled" )
93+ }
94+ }
95+ }
96+ return errors .New ("timed out" )
97+ }
98+
6999// CreateControllerContainer creates and starts a controller container.
70100func CreateControllerContainer (ctx context.Context , dockerClient * client.Client , port uint16 , environment string , doNotTrack bool , gpu gpupkg.GPUSupport , modelStorageVolume string , printer StatusPrinter ) error {
71101 // Determine the target image.
@@ -124,9 +154,17 @@ func CreateControllerContainer(ctx context.Context, dockerClient *client.Client,
124154 hostConfig .DeviceRequests = []container.DeviceRequest {{Count : - 1 , Capabilities : [][]string {{"gpu" }}}}
125155 }
126156
127- // Create the container.
157+ // Create the container. If we detect that a concurrent installation is in
158+ // progress, then we wait for whichever install process creates the
159+ // container first and then wait for its container to be ready.
128160 resp , err := dockerClient .ContainerCreate (ctx , config , hostConfig , nil , nil , controllerContainerName )
129161 if err != nil {
162+ if match := concurrentInstallMatcher .FindStringSubmatch (err .Error ()); match != nil {
163+ if err := waitForContainerToStart (ctx , dockerClient , match [1 ]); err != nil {
164+ return fmt .Errorf ("failed waiting for concurrent installation: %w" , err )
165+ }
166+ return nil
167+ }
130168 return fmt .Errorf ("failed to create container %s: %w" , controllerContainerName , err )
131169 }
132170
0 commit comments