Skip to content

Commit feb7c5a

Browse files
authored
Merge pull request NVIDIA#391 from klueska/fix-bug-termination
Ensure we exit cleanly when compute-domain-daemon shuts down prematurely
2 parents 3e82511 + 4985ec4 commit feb7c5a

File tree

3 files changed

+13
-6
lines changed

3 files changed

+13
-6
lines changed

cmd/compute-domain-daemon/computedomain.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,11 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
197197

198198
// BlockUntilAllNodesJoinComputeDomain waits until all nodes have joined the compute domain
199199
// and returns the list of nodes in the compute domain.
200-
func (m *ComputeDomainManager) BlockUntilAllNodesJoinComputeDomain() []*nvapi.ComputeDomainNode {
201-
<-m.nodesChan
202-
return m.nodes
200+
func (m *ComputeDomainManager) BlockUntilAllNodesJoinComputeDomain(ctx context.Context) ([]*nvapi.ComputeDomainNode, error) {
201+
select {
202+
case <-ctx.Done():
203+
return nil, ctx.Err()
204+
case <-m.nodesChan:
205+
return m.nodes, nil
206+
}
203207
}

cmd/compute-domain-daemon/controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,6 @@ func (c *Controller) Run(ctx context.Context) error {
104104

105105
// BlockUntilAllNodesJoinComputeDomain waits until all nodes have joined the compute domain
106106
// and returns the list of nodes in the compute domain.
107-
func (c *Controller) BlockUntilAllNodesJoinComputeDomain() []*nvapi.ComputeDomainNode {
108-
return c.computeDomainManager.BlockUntilAllNodesJoinComputeDomain()
107+
func (c *Controller) BlockUntilAllNodesJoinComputeDomain(ctx context.Context) ([]*nvapi.ComputeDomainNode, error) {
108+
return c.computeDomainManager.BlockUntilAllNodesJoinComputeDomain(ctx)
109109
}

cmd/compute-domain-daemon/main.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,10 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
173173
}()
174174

175175
// Wait until all nodes have joined the compute domain
176-
nodes := controller.BlockUntilAllNodesJoinComputeDomain()
176+
nodes, err := controller.BlockUntilAllNodesJoinComputeDomain(ctx)
177+
if err != nil {
178+
return fmt.Errorf("error waiting for all nodes to join ComputeDomain: %w", err)
179+
}
177180

178181
if flags.cliqueID == "" {
179182
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")

0 commit comments

Comments
 (0)