@@ -6,12 +6,13 @@ import (
66 "context"
77 "errors"
88 "fmt"
9+ "io"
910 "os"
10- "regexp"
1111 "strconv"
1212 "strings"
1313 "time"
1414
15+ "github.com/containerd/errdefs"
1516 "github.com/docker/docker/api/types/container"
1617 "github.com/docker/docker/api/types/filters"
1718 "github.com/docker/docker/api/types/mount"
@@ -25,11 +26,6 @@ import (
2526// controllerContainerName is the name to use for the controller container.
2627const controllerContainerName = "docker-model-runner"
2728
28- // concurrentInstallMatcher matches error message that indicate a concurrent
29- // standalone model runner installation is taking place. It extracts the ID of
30- // the conflicting container in a capture group.
31- var concurrentInstallMatcher = regexp .MustCompile (`is already in use by container "([a-z0-9]+)"` )
32-
3329// copyDockerConfigToContainer copies the Docker config file from the host to the container
3430// and sets up proper ownership and permissions for the modelrunner user.
3531// It does nothing for Desktop and Cloud engine kinds.
@@ -177,38 +173,33 @@ func determineBridgeGatewayIP(ctx context.Context, dockerClient client.NetworkAP
177173 return "" , nil
178174}
179175
180- // waitForContainerToStart waits for a container to start.
181- func waitForContainerToStart (ctx context.Context , dockerClient client.ContainerAPIClient , containerID string ) error {
182- // Unfortunately the Docker API's /containers/{id}/wait API (and the
183- // corresponding Client.ContainerWait method) don't allow waiting for
184- // container startup, so instead we'll take a polling approach.
176+ // ensureContainerStarted ensures that a container has started. It may be called
177+ // concurrently, taking advantage of the fact that ContainerStart is idempotent.
178+ func ensureContainerStarted (ctx context.Context , dockerClient client.ContainerAPIClient , containerID string ) error {
185179 for i := 10 ; i > 0 ; i -- {
186- if status , err := dockerClient .ContainerInspect (ctx , containerID ); err != nil {
187- // There is a small gap between the time that a container ID and
188- // name are registered and the time that the container is actually
189- // created and shows up in container list and inspect requests:
190- //
191- // https://github.com/moby/moby/blob/de24c536b0ea208a09e0fff3fd896c453da6ef2e/daemon/container.go#L138-L156
192- //
193- // Given that multiple install operations tend to end up tightly
194- // synchronized by the preceeding pull operation and that this
195- // method is specifically designed to work around these race
196- // conditions, we'll allow 404 errors to pass silently (at least up
197- // until the polling time out - unfortunately we can't make the 404
198- // acceptance window any smaller than that because the CUDA-based
199- // containers are large and can take time to create).
200- if ! strings .Contains (err .Error (), "No such container" ) {
201- return fmt .Errorf ("unable to inspect container (%s): %w" , containerID [:12 ], err )
202- }
203- } else {
204- switch status .State .Status {
205- case container .StateRunning :
206- return nil
207- case container .StateCreated , container .StateRestarting :
208- // wait for container to start
209- default :
210- return fmt .Errorf ("container is in unexpected state %q" , status .State .Status )
211- }
180+ err := dockerClient .ContainerStart (ctx , containerID , container.StartOptions {})
181+ if err == nil {
182+ return nil
183+ }
184+ // There is a small gap between the time that a container ID and
185+ // name are registered and the time that the container is actually
186+ // created and shows up in container list and inspect requests:
187+ //
188+ // https://github.com/moby/moby/blob/de24c536b0ea208a09e0fff3fd896c453da6ef2e/daemon/container.go#L138-L156
189+ //
190+ // Given that multiple install operations tend to end up tightly
191+ // synchronized by the preceeding pull operation and that this
192+ // method is specifically designed to work around these race
193+ // conditions, we'll allow 404 errors to pass silently (at least up
194+ // until the polling time out - unfortunately we can't make the 404
195+ // acceptance window any smaller than that because the CUDA-based
196+ // containers are large and can take time to create).
197+ //
198+ // For some reason, this error case can also manifest as an EOF on the
199+ // request (I'm not sure where this arises in the Moby server), so we'll
200+ // let that pass silently too.
201+ if ! (errdefs .IsNotFound (err ) || errors .Is (err , io .EOF )) {
202+ return err
212203 }
213204 if i > 1 {
214205 select {
@@ -280,30 +271,31 @@ func CreateControllerContainer(ctx context.Context, dockerClient *client.Client,
280271 }
281272
282273 // Create the container. If we detect that a concurrent installation is in
283- // progress, then we wait for whichever install process creates the
284- // container first and then wait for its container to be ready.
274+ // progress (as indicated by a conflicting container name (which should have
275+ // been detected just before installation)), then we'll allow the error to
276+ // pass silently and simply work in conjunction with any concurrent
277+ // installers to start the container.
285278 resp , err := dockerClient .ContainerCreate (ctx , config , hostConfig , nil , nil , controllerContainerName )
286- if err != nil {
287- if match := concurrentInstallMatcher .FindStringSubmatch (err .Error ()); match != nil {
288- if err := waitForContainerToStart (ctx , dockerClient , match [1 ]); err != nil {
289- return fmt .Errorf ("failed waiting for concurrent installation: %w" , err )
290- }
291- return nil
292- }
279+ if err != nil && ! errdefs .IsConflict (err ) {
293280 return fmt .Errorf ("failed to create container %s: %w" , controllerContainerName , err )
294281 }
282+ created := err == nil
295283
296284 // Start the container.
297285 printer .Printf ("Starting model runner container %s...\n " , controllerContainerName )
298- if err := dockerClient .ContainerStart (ctx , resp .ID , container.StartOptions {}); err != nil {
299- _ = dockerClient .ContainerRemove (ctx , resp .ID , container.RemoveOptions {Force : true })
286+ if err := ensureContainerStarted (ctx , dockerClient , controllerContainerName ); err != nil {
287+ if created {
288+ _ = dockerClient .ContainerRemove (ctx , resp .ID , container.RemoveOptions {Force : true })
289+ }
300290 return fmt .Errorf ("failed to start container %s: %w" , controllerContainerName , err )
301291 }
302292
303- // Copy Docker config file if it exists
304- if err := copyDockerConfigToContainer (ctx , dockerClient , resp .ID , engineKind ); err != nil {
305- // Log warning but continue - don't fail container creation
306- printer .Printf ("Warning: failed to copy Docker config: %v\n " , err )
293+ // Copy Docker config file if it exists and we're the container creator.
294+ if created {
295+ if err := copyDockerConfigToContainer (ctx , dockerClient , resp .ID , engineKind ); err != nil {
296+ // Log warning but continue - don't fail container creation
297+ printer .Printf ("Warning: failed to copy Docker config: %v\n " , err )
298+ }
307299 }
308300 return nil
309301}
0 commit comments