Skip to content

Commit 569b508

Browse files
omins and claude committed
fix: register sandbox before WaitForEnvd for TCP firewall proxy
The TCP firewall proxy looks up sandboxes by source address to allow egress traffic. When initEnvd makes outbound connections (e.g., to GCS for a volume mount), the proxy couldn't find the sandbox because it wasn't registered yet: Insert happened only after ResumeSandbox returned.

Fix: move sandbox registration into Factory, right before WaitForEnvd. Remove the ineffective 300ms delay (networkInitDelay in initEnvd) that was added as a workaround.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 33249bf commit 569b508

File tree

6 files changed

+20
-16
lines changed

6 files changed

+20
-16
lines changed

packages/orchestrator/benchmark_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ func BenchmarkBaseImageLaunch(b *testing.B) {
175175
templateCache.Start(b.Context())
176176
b.Cleanup(templateCache.Stop)
177177

178-
sandboxFactory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, featureFlags)
178+
sandboxFactory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, featureFlags, nil)
179179

180180
dockerhubRepository, err := dockerhub.GetRemoteRepository(b.Context())
181181
require.NoError(b, err)

packages/orchestrator/cmd/build-template/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ func buildTemplate(
216216
return fmt.Errorf("failed to create build metrics: %w", err)
217217
}
218218

219-
sandboxFactory := sandbox.NewFactory(c.BuilderConfig, networkPool, devicePool, featureFlags)
219+
sandboxFactory := sandbox.NewFactory(c.BuilderConfig, networkPool, devicePool, featureFlags, nil)
220220

221221
builder := build.NewBuilder(
222222
builderConfig,

packages/orchestrator/internal/sandbox/envd.go

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -126,18 +126,6 @@ func (s *Sandbox) initEnvd(ctx context.Context) (e error) {
126126
span.End()
127127
}()
128128

129-
// When volume is configured, add a delay to allow VM network stack to fully initialize.
130-
// Without this delay, the envd may try to connect to GCS before network is ready,
131-
// resulting in EOF errors during TLS handshake.
132-
if s.volumeInitConfig != nil {
133-
const networkInitDelay = 300 * time.Millisecond
134-
logger.L().Info(ctx, "waiting for VM network initialization before volume mount",
135-
logger.WithSandboxID(s.Runtime.SandboxID),
136-
zap.Duration("delay", networkInitDelay),
137-
)
138-
time.Sleep(networkInitDelay)
139-
}
140-
141129
attributes := []attribute.KeyValue{telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds())}
142130
attributesFail := append(attributes, attribute.Bool("success", false))
143131
attributesSuccess := append(attributes, attribute.Bool("success", true))

packages/orchestrator/internal/sandbox/sandbox.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,19 +177,22 @@ type Factory struct {
177177
featureFlags *featureflags.Client
178178
volumes *VolumesConfig
179179
tokenMinter *gcstoken.Minter
180+
sandboxes *Map
180181
}
181182

182183
func NewFactory(
183184
config cfg.BuilderConfig,
184185
networkPool *network.Pool,
185186
devicePool *nbd.DevicePool,
186187
featureFlags *featureflags.Client,
188+
sandboxes *Map,
187189
) *Factory {
188190
return &Factory{
189191
config: config,
190192
networkPool: networkPool,
191193
devicePool: devicePool,
192194
featureFlags: featureFlags,
195+
sandboxes: sandboxes,
193196
}
194197
}
195198

@@ -682,11 +685,22 @@ func (f *Factory) ResumeSandbox(
682685

683686
telemetry.ReportEvent(execCtx, "waiting for envd")
684687

688+
// Register sandbox before WaitForEnvd so TCP firewall proxy can find it.
689+
// This is needed because initEnvd may make outbound connections (e.g., GCS for volume mount)
690+
// that go through the TCP firewall proxy, which looks up sandboxes by source address.
691+
if f.sandboxes != nil {
692+
f.sandboxes.Insert(sbx)
693+
}
694+
685695
err = sbx.WaitForEnvd(
686696
ctx,
687697
f.config.EnvdTimeout,
688698
)
689699
if err != nil {
700+
// Remove from map on failure
701+
if f.sandboxes != nil {
702+
f.sandboxes.Remove(sbx.Runtime.SandboxID)
703+
}
690704
return nil, fmt.Errorf("failed to wait for sandbox start: %w", err)
691705
}
692706

packages/orchestrator/internal/server/sandboxes.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,9 @@ func (s *Server) Create(ctx context.Context, req *orchestrator.SandboxCreateRequ
189189
return nil, status.Errorf(codes.Internal, "failed to create sandbox: %s", err)
190190
}
191191

192-
s.sandboxes.Insert(sbx)
192+
// Note: sandbox is already inserted into the map by Factory.ResumeSandbox
193+
// before WaitForEnvd is called, so TCP firewall proxy can find it.
194+
193195
go func() {
194196
ctx, childSpan := tracer.Start(context.WithoutCancel(ctx), "sandbox-create-stop", trace.WithNewRoot())
195197
defer childSpan.End()

packages/orchestrator/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ func run(config cfg.Config) (success bool) {
411411
closers = append(closers, closer{"network pool", networkPool.Close})
412412

413413
// sandbox factory
414-
sandboxFactory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, featureFlags)
414+
sandboxFactory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, featureFlags, sandboxes)
415415

416416
// Configure volumes support if enabled
417417
if config.VolumesRedisURL != "" {

0 commit comments

Comments
 (0)