Skip to content

Commit a92b36e

Browse files
committed
Make maxNodesPerIMEXDomain configurable (default at 18)
Signed-off-by: Kevin Klues <kklues@nvidia.com>
1 parent df8e1d4 commit a92b36e

File tree

6 files changed

+52
-22
lines changed

6 files changed

+52
-22
lines changed

cmd/compute-domain-controller/controller.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ type ManagerConfig struct {
3838
// imageName is the full image name to use when rendering templates
3939
imageName string
4040

41+
// maxNodesPerIMEXDomain is the maximum number of nodes allowed in a single IMEX domain
42+
maxNodesPerIMEXDomain int
43+
4144
// clientsets provides access to various Kubernetes API client interfaces
4245
clientsets flags.ClientSets
4346

@@ -67,12 +70,13 @@ func (c *Controller) Run(ctx context.Context) error {
6770
workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter())
6871

6972
managerConfig := &ManagerConfig{
70-
driverName: c.config.driverName,
71-
driverNamespace: c.config.flags.namespace,
72-
additionalNamespaces: c.config.flags.additionalNamespaces.Value(),
73-
imageName: c.config.flags.imageName,
74-
clientsets: c.config.clientsets,
75-
workQueue: workQueue,
73+
driverName: c.config.driverName,
74+
driverNamespace: c.config.flags.namespace,
75+
additionalNamespaces: c.config.flags.additionalNamespaces.Value(),
76+
imageName: c.config.flags.imageName,
77+
maxNodesPerIMEXDomain: c.config.flags.maxNodesPerIMEXDomain,
78+
clientsets: c.config.clientsets,
79+
workQueue: workQueue,
7680
}
7781

7882
cdManager := NewComputeDomainManager(managerConfig)

cmd/compute-domain-controller/daemonset.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ type DaemonSetTemplateData struct {
5050
ComputeDomainLabelValue types.UID
5151
ResourceClaimTemplateName string
5252
ImageName string
53+
MaxNodesPerIMEXDomain int
5354
FeatureGates map[string]bool
5455
}
5556

@@ -200,6 +201,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, cd *nvapi.ComputeDomain)
200201
ComputeDomainLabelValue: cd.UID,
201202
ResourceClaimTemplateName: rct.Name,
202203
ImageName: m.config.imageName,
204+
MaxNodesPerIMEXDomain: m.config.maxNodesPerIMEXDomain,
203205
FeatureGates: featuregates.ToMap(),
204206
}
205207

cmd/compute-domain-controller/main.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,23 @@ import (
4444

4545
const (
4646
DriverName = "compute-domain.nvidia.com"
47+
48+
// This constant provides a reasonable default for the maximum size of
49+
// a given IMEX Domain. On GB200 and GB300 the limit is 18, so we pick
50+
// this for now. It can be overridden via an environment variable or
51+
// command line argument as required.
52+
defaultMaxNodesPerIMEXDomain = 18
4753
)
4854

4955
type Flags struct {
5056
kubeClientConfig flags.KubeClientConfig
5157
loggingConfig *flags.LoggingConfig
5258
featureGateConfig *flags.FeatureGateConfig
5359

54-
podName string
55-
namespace string
56-
imageName string
60+
podName string
61+
namespace string
62+
imageName string
63+
maxNodesPerIMEXDomain int
5764

5865
httpEndpoint string
5966
metricsPath string
@@ -103,6 +110,13 @@ func newApp() *cli.App {
103110
Destination: &flags.imageName,
104111
EnvVars: []string{"IMAGE_NAME"},
105112
},
113+
&cli.IntFlag{
114+
Name: "max-nodes-per-imex-domain",
115+
Usage: "The maximum number of possible nodes per IMEX domain",
116+
Value: defaultMaxNodesPerIMEXDomain,
117+
EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"},
118+
Destination: &flags.maxNodesPerIMEXDomain,
119+
},
106120
&cli.StringFlag{
107121
Category: "HTTP server:",
108122
Name: "http-endpoint",

cmd/compute-domain-daemon/dnsnames.go

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ import (
3030
)
3131

3232
const (
33-
maxDNSNames = 18
3433
hostsFilePath = "/etc/hosts"
3534
dnsNamePrefix = "compute-domain-daemon-"
3635
dnsNameFormat = dnsNamePrefix + "%d"
@@ -42,17 +41,19 @@ type IPToDNSNameMap map[string]string
4241
// DNSNameManager manages the allocation of static DNS names to IP addresses.
4342
type DNSNameManager struct {
4443
sync.Mutex
45-
ipToDNSName IPToDNSNameMap
46-
cliqueID string
47-
nodesConfigPath string
44+
ipToDNSName IPToDNSNameMap
45+
cliqueID string
46+
maxNodesPerIMEXDomain int
47+
nodesConfigPath string
4848
}
4949

5050
// NewDNSNameManager creates a new DNS name manager.
51-
func NewDNSNameManager(cliqueID string, nodesConfigPath string) *DNSNameManager {
51+
func NewDNSNameManager(cliqueID string, maxNodesPerIMEXDomain int, nodesConfigPath string) *DNSNameManager {
5252
return &DNSNameManager{
53-
ipToDNSName: make(IPToDNSNameMap),
54-
cliqueID: cliqueID,
55-
nodesConfigPath: nodesConfigPath,
53+
ipToDNSName: make(IPToDNSNameMap),
54+
cliqueID: cliqueID,
55+
maxNodesPerIMEXDomain: maxNodesPerIMEXDomain,
56+
nodesConfigPath: nodesConfigPath,
5657
}
5758
}
5859

@@ -135,7 +136,7 @@ func (m *DNSNameManager) allocateDNSName(ip string) (string, error) {
135136
}
136137

137138
// Find the next available DNS name
138-
for i := 0; i < maxDNSNames; i++ {
139+
for i := 0; i < m.maxNodesPerIMEXDomain; i++ {
139140
dnsName := fmt.Sprintf(dnsNameFormat, i)
140141
// Check if this DNS name is already in use
141142
inUse := false
@@ -152,7 +153,7 @@ func (m *DNSNameManager) allocateDNSName(ip string) (string, error) {
152153
}
153154

154155
// If all DNS names are used, return an error
155-
return "", fmt.Errorf("no DNS names available (max: %d)", maxDNSNames)
156+
return "", fmt.Errorf("no DNS names available (max: %d)", m.maxNodesPerIMEXDomain)
156157
}
157158

158159
// updateHostsFile updates the /etc/hosts file with current IP to DNS name mappings.
@@ -216,14 +217,14 @@ func (m *DNSNameManager) WriteNodesConfig() error {
216217
defer f.Close()
217218

218219
// Write static DNS names
219-
for i := 0; i < maxDNSNames; i++ {
220+
for i := 0; i < m.maxNodesPerIMEXDomain; i++ {
220221
dnsName := fmt.Sprintf(dnsNameFormat, i)
221222
if _, err := fmt.Fprintf(f, "%s\n", dnsName); err != nil {
222223
return fmt.Errorf("failed to write to nodes config file: %w", err)
223224
}
224225
}
225226

226-
klog.Infof("Created static nodes config file with %d DNS names using format %s", maxDNSNames, dnsNameFormat)
227+
klog.Infof("Created static nodes config file with %d DNS names using format %s", m.maxNodesPerIMEXDomain, dnsNameFormat)
227228

228229
return nil
229230
}

cmd/compute-domain-daemon/main.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ type Flags struct {
5151
computeDomainNamespace string
5252
nodeName string
5353
podIP string
54+
maxNodesPerIMEXDomain int
5455
loggingConfig *flags.LoggingConfig
5556
featureGateConfig *flags.FeatureGateConfig
5657
}
@@ -124,6 +125,12 @@ func newApp() *cli.App {
124125
EnvVars: []string{"POD_IP"},
125126
Destination: &flags.podIP,
126127
},
128+
&cli.IntFlag{
129+
Name: "max-nodes-per-imex-domain",
130+
Usage: "The maximum number of possible nodes per IMEX domain",
131+
EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"},
132+
Destination: &flags.maxNodesPerIMEXDomain,
133+
},
127134
}
128135
cliFlags = append(cliFlags, flags.featureGateConfig.Flags()...)
129136
cliFlags = append(cliFlags, flags.loggingConfig.Flags()...)
@@ -181,7 +188,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
181188
var dnsNameManager *DNSNameManager
182189
if featuregates.Enabled(featuregates.IMEXDaemonsWithDNSNames) {
183190
// Prepare DNS name manager
184-
dnsNameManager = NewDNSNameManager(flags.cliqueID, nodesConfigPath)
191+
dnsNameManager = NewDNSNameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, nodesConfigPath)
185192

186193
// Create static nodes config file with DNS names
187194
if err := dnsNameManager.WriteNodesConfig(); err != nil {

templates/compute-domain-daemon.tmpl.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ spec:
2626
image: {{ .ImageName }}
2727
command: ["compute-domain-daemon", "-v", "6", "run"]
2828
env:
29+
- name: MAX_NODES_PER_IMEX_DOMAIN
30+
value: "{{ .MaxNodesPerIMEXDomain }}"
2931
- name: NODE_NAME
3032
valueFrom:
3133
fieldRef:

0 commit comments

Comments
 (0)