Skip to content

Commit 07d3ca0

Browse files
craig[bot]DarrylWong
andcommitted
Merge #147731
147731: roachprod: fix service registration for unregistered services r=herkolategan a=DarrylWong As of #125393, we no longer register all virtual clusters with DNS. This was an optimization made to reduce unnecessary DNS usage when there was only one tenant using the default ports. Specifically, we no longer register shared process tenants and system tenants using the default ports. However, some of the existing logic still assumes that registration happens for all virtual clusters. For example, `clusterSynced.Stop` errors out if a shared process virtual cluster is not registered, even though this is now the expected behavior. This change: 1. Establishes that the `system interface` is a `shared-process service` as it contains both a KV and SQL server. Previously, a system tenant could be registered as either depending on if custom ports were used or not. 2. Renames `DiscoverService` to `ServiceDescriptor` to differentiate it from `DiscoverServices`. `DiscoverService` is a convenience function for `DiscoverServices` that returns just a single service. Importantly, it also implements fall back logic to handle when a service is not found which is necessary now that we no longer register all services. 3. Renames `DiscoverServices` to `discoverServices`. Since this function does not implement fall back logic needed for correctness, it should not be used directly. All existing usage of `discoverServices` is refactored to use `ServiceDescriptor` instead. 4. Adds a unit test to ensure our new fall back logic behaves as expected. Release note: none Epic: none Fixes: none Co-authored-by: DarrylWong <[email protected]>
2 parents fd18c1b + c7413eb commit 07d3ca0

File tree

8 files changed

+537
-295
lines changed

8 files changed

+537
-295
lines changed

pkg/roachprod/install/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ go_test(
7171
embed = [":install"],
7272
deps = [
7373
"//pkg/roachprod/cloud",
74+
"//pkg/roachprod/config",
7475
"//pkg/roachprod/logger",
7576
"//pkg/roachprod/vm",
7677
"//pkg/roachprod/vm/gce/testutils",
@@ -79,6 +80,7 @@ go_test(
7980
"//pkg/util/leaktest",
8081
"//pkg/util/randutil",
8182
"//pkg/util/retry",
83+
"//pkg/util/syncutil",
8284
"@com_github_cockroachdb_datadriven//:datadriven",
8385
"@com_github_cockroachdb_errors//:errors",
8486
"@com_github_stretchr_testify//require",

pkg/roachprod/install/cluster_synced.go

Lines changed: 20 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ type SyncedCluster struct {
7070

7171
// Nodes is used by most commands (e.g. Start, Stop, Monitor). It describes
7272
// the list of nodes the operation pertains to.
73+
// $ roachprod create local -n 4
74+
// $ roachprod start local # [1, 2, 3, 4]
75+
// $ roachprod start local:2-4 # [2, 3, 4]
76+
// $ roachprod start local:2,1,4 # [1, 2, 4]
7377
Nodes Nodes
7478

7579
ClusterSettings
@@ -239,17 +243,6 @@ func (c *SyncedCluster) localVMDir(n Node) string {
239243
return local.VMDir(c.Name, int(n))
240244
}
241245

242-
// TargetNodes is the fully expanded, ordered list of nodes that any given
243-
// roachprod command is intending to target.
244-
//
245-
// $ roachprod create local -n 4
246-
// $ roachprod start local # [1, 2, 3, 4]
247-
// $ roachprod start local:2-4 # [2, 3, 4]
248-
// $ roachprod start local:2,1,4 # [1, 2, 4]
249-
func (c *SyncedCluster) TargetNodes() Nodes {
250-
return append(Nodes{}, c.Nodes...)
251-
}
252-
253246
// GetInternalIP returns the internal IP address of the specified node.
254247
func (c *SyncedCluster) GetInternalIP(n Node) (string, error) {
255248
if c.IsLocal() {
@@ -471,31 +464,24 @@ func (c *SyncedCluster) Stop(
471464
// killProcesses indicates whether processed need to be stopped.
472465
killProcesses := true
473466

474-
// Non system shared process virtual clusters don't get killed but are stopped via SQL.
475-
// Figure out if the virtual cluster is one or not.
467+
// For shared process secondary tenants, we just stop the service via SQL.
468+
// Find out of this is a shared process secondary tenant.
476469
if virtualClusterLabel != "" {
477470
name, sqlInstance, err := VirtualClusterInfoFromLabel(virtualClusterLabel)
478471
if err != nil {
479472
return err
480473
}
481474

482-
if name != SystemInterfaceName {
483-
services, err := c.DiscoverServices(ctx, name, ServiceTypeSQL)
475+
if !IsSystemInterface(name) {
476+
isExternal, err := c.IsExternalService(ctx, name)
484477
if err != nil {
485478
return err
486479
}
487480

488-
if len(services) == 0 {
489-
return fmt.Errorf("no service for virtual cluster %q", virtualClusterName)
490-
}
491-
492-
virtualClusterName = name
493-
if services[0].ServiceMode == ServiceModeShared {
494-
// For shared process virtual clusters, we just stop the service
495-
// via SQL.
496-
killProcesses = false
497-
} else {
481+
if isExternal {
498482
virtualClusterDisplay = fmt.Sprintf(" virtual cluster %q, instance %d", virtualClusterName, sqlInstance)
483+
} else {
484+
killProcesses = false
499485
}
500486
}
501487
}
@@ -2275,7 +2261,7 @@ func (c *SyncedCluster) pgurls(
22752261
}
22762262
m := make(map[Node]string, len(hosts))
22772263
for node, host := range hosts {
2278-
desc, err := c.DiscoverService(ctx, node, virtualClusterName, ServiceTypeSQL, sqlInstance)
2264+
desc, err := c.ServiceDescriptor(ctx, node, virtualClusterName, ServiceTypeSQL, sqlInstance)
22792265
if err != nil {
22802266
return nil, err
22812267
}
@@ -2310,24 +2296,19 @@ func (c *SyncedCluster) loadBalancerURL(
23102296
sqlInstance int,
23112297
auth PGAuthMode,
23122298
) (string, error) {
2313-
services, err := c.DiscoverServices(ctx, virtualClusterName, ServiceTypeSQL)
2299+
// Note that it's possible for our service to not be running on the entire roachprod
2300+
// cluster, e.g. one of the nodes is a workload node, or we have a separate process
2301+
// virtual cluster running on a subset of nodes. We must search for our service on
2302+
// the entire cluster.
2303+
descs, err := c.ServiceDescriptors(ctx, c.Nodes, virtualClusterName, ServiceTypeSQL, sqlInstance)
23142304
if err != nil {
23152305
return "", err
23162306
}
2317-
port := config.DefaultSQLPort
2318-
serviceMode := ServiceModeExternal
2319-
for _, service := range services {
2320-
if service.VirtualClusterName == virtualClusterName && service.Instance == sqlInstance {
2321-
serviceMode = service.ServiceMode
2322-
port = service.Port
2323-
break
2324-
}
2325-
}
2326-
address, err := c.FindLoadBalancer(l, port)
2307+
address, err := c.FindLoadBalancer(l, descs[0].Port)
23272308
if err != nil {
23282309
return "", err
23292310
}
2330-
loadBalancerURL := c.NodeURL(address.IP, address.Port, virtualClusterName, serviceMode, auth, "" /* database */)
2311+
loadBalancerURL := c.NodeURL(address.IP, address.Port, virtualClusterName, descs[0].ServiceMode, auth, "" /* database */)
23312312
return loadBalancerURL, nil
23322313
}
23332314

@@ -2732,7 +2713,7 @@ func (c *SyncedCluster) Reset(l *logger.Logger) error {
27322713
return nil
27332714
}
27342715

2735-
nodes := c.TargetNodes()
2716+
nodes := c.Nodes
27362717
targetVMs := make(vm.List, len(nodes))
27372718
for idx, node := range nodes {
27382719
targetVMs[idx] = c.VMs[node-1]

0 commit comments

Comments
 (0)