Skip to content

Commit 32362ef

Browse files
Julien-Ben and fealebenpae
authored and committed
CLOUDP-294373: Configure ClusterIP services for mongos (#4134)
# Summary
When configuring a multi-cluster sharded deployment with external access but no external domain, the operator was not creating ClusterIP services for the components, so they could not reach each other.
## Proof of Work
A new test ensures we can deploy a multi-cluster sharded resource with `externalAccess` defined, but no `externalDomain`, with a service mesh. The test failed without the fix because the pods couldn't communicate.
1 parent 167f154 commit 32362ef

11 files changed

+172
-37
lines changed

.evergreen-tasks.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,6 +1165,11 @@ tasks:
11651165
commands:
11661166
- func: e2e_test
11671167

1168+
- name: e2e_multi_cluster_sharded_external_access_no_ext_domain
1169+
tags: [ "patch-run" ]
1170+
commands:
1171+
- func: e2e_test
1172+
11681173
- name: e2e_multi_cluster_sharded_tls_no_mesh
11691174
tags: [ "patch-run" ]
11701175
commands:

.evergreen.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,7 @@ task_groups:
789789
- e2e_multi_cluster_sharded_simplest
790790
- e2e_multi_cluster_sharded_snippets
791791
- e2e_multi_cluster_sharded_simplest_no_mesh
792+
- e2e_multi_cluster_sharded_external_access_no_ext_domain
792793
- e2e_multi_cluster_sharded_tls_no_mesh
793794
- e2e_multi_cluster_sharded_tls
794795
- e2e_multi_cluster_sharded_disaster_recovery

RELEASE_NOTES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
## Bug Fixes
99
* Fixed a bug where the status of `MongoDBUser` was being set to `Updated` prematurely. For example, new users were not immediately usable following `MongoDBUser` creation despite the operator reporting the `Updated` state.
1010
* Fixed a bug causing cluster health check issues when ordering of users and tokens differed in Kubeconfig.
11+
* Fixed a bug where deploying a Multi-Cluster sharded resource with an external access configuration could result in pods not being able to reach each other.
1112

1213
# MongoDB Enterprise Kubernetes Operator 1.31.0
1314

controllers/operator/clusterchecks_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ func (c *clusterChecks) checkExternalServices(ctx context.Context, statefulSetNa
137137
}
138138
}
139139

140-
func (c *clusterChecks) checkExternalServicesDontNotExist(ctx context.Context, statefulSetName string, expectedMembers int) {
140+
func (c *clusterChecks) checkExternalServicesDontExist(ctx context.Context, statefulSetName string, expectedMembers int) {
141141
for podIdx := 0; podIdx < expectedMembers; podIdx++ {
142142
svc := corev1.Service{}
143143
serviceName := fmt.Sprintf("%s-%d-svc-external", statefulSetName, podIdx)

controllers/operator/mongodbshardedcluster_controller.go

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2374,22 +2374,6 @@ func (r *ShardedClusterReconcileHelper) GetConfigSrvServiceName(memberCluster mu
23742374
}
23752375
}
23762376

2377-
func (r *ShardedClusterReconcileHelper) GetMongosServiceName(memberCluster multicluster.MemberCluster) string {
2378-
if memberCluster.Legacy {
2379-
return r.sc.ServiceName()
2380-
} else {
2381-
return dns.GetMultiHeadlessServiceName(r.sc.Name, memberCluster.Index)
2382-
}
2383-
}
2384-
2385-
func (r *ShardedClusterReconcileHelper) GetShardServiceName(memberCluster multicluster.MemberCluster) string {
2386-
if memberCluster.Legacy {
2387-
return r.sc.ServiceName()
2388-
} else {
2389-
return r.sc.ShardServiceName()
2390-
}
2391-
}
2392-
23932377
func (r *ShardedClusterReconcileHelper) replicateAgentKeySecret(ctx context.Context, conn om.Connection, agentKey string, log *zap.SugaredLogger) error {
23942378
for _, memberCluster := range getHealthyMemberClusters(r.allMemberClusters) {
23952379
var databaseSecretPath string
@@ -2519,6 +2503,7 @@ func (r *ShardedClusterReconcileHelper) reconcileConfigServerServices(ctx contex
25192503
if configServerExternalAccess == nil {
25202504
configServerExternalAccess = r.sc.Spec.DbCommonSpec.ExternalAccessConfiguration
25212505
}
2506+
// Config servers need external services only if an externalDomain is configured
25222507
if configServerExternalAccess != nil && configServerExternalAccess.ExternalDomain != nil {
25232508
log.Debugf("creating external services for %s in cluster: %s", r.sc.ConfigRsName(), memberCluster.Name)
25242509
svc, err := r.getPodExternalService(memberCluster,
@@ -2532,7 +2517,9 @@ func (r *ShardedClusterReconcileHelper) reconcileConfigServerServices(ctx contex
25322517
if err = mekoService.CreateOrUpdateService(ctx, memberCluster.Client, svc); err != nil && !errors.IsAlreadyExists(err) {
25332518
return xerrors.Errorf("failed to create external service %s in cluster: %s, err: %w", svc.Name, memberCluster.Name, err)
25342519
}
2535-
} else {
2520+
}
2521+
// We don't need internal per-pod services in case we have externalAccess configured AND an external domain
2522+
if configServerExternalAccess == nil || configServerExternalAccess.ExternalDomain == nil {
25362523
log.Debugf("creating internal services for %s in cluster: %s", r.sc.ConfigRsName(), memberCluster.Name)
25372524
svc := r.getPodService(r.sc.ConfigRsName(), memberCluster, podNum, portOrDefault)
25382525
if err := mekoService.CreateOrUpdateService(ctx, memberCluster.Client, svc); err != nil && !errors.IsAlreadyExists(err) {
@@ -2556,6 +2543,7 @@ func (r *ShardedClusterReconcileHelper) reconcileShardServices(ctx context.Conte
25562543
scaler := r.GetShardScaler(shardIdx, memberCluster)
25572544

25582545
for podNum := 0; podNum < scaler.DesiredReplicas(); podNum++ {
2546+
// Shards need external services only if an externalDomain is configured
25592547
if shardsExternalAccess != nil && shardsExternalAccess.ExternalDomain != nil {
25602548
log.Debugf("creating external services for %s in cluster: %s", r.sc.ShardRsName(shardIdx), memberCluster.Name)
25612549
svc, err := r.getPodExternalService(
@@ -2570,7 +2558,9 @@ func (r *ShardedClusterReconcileHelper) reconcileShardServices(ctx context.Conte
25702558
if err = mekoService.CreateOrUpdateService(ctx, memberCluster.Client, svc); err != nil && !errors.IsAlreadyExists(err) {
25712559
return xerrors.Errorf("failed to create external service %s in cluster: %s, err: %w", svc.Name, memberCluster.Name, err)
25722560
}
2573-
} else {
2561+
}
2562+
// We don't need internal per-pod services in case we have externalAccess configured AND an external domain
2563+
if shardsExternalAccess == nil || shardsExternalAccess.ExternalDomain == nil {
25742564
log.Debugf("creating internal services for %s in cluster: %s", r.sc.ShardRsName(shardIdx), memberCluster.Name)
25752565
svc := r.getPodService(r.sc.ShardRsName(shardIdx), memberCluster, podNum, portOrDefault)
25762566
if err := mekoService.CreateOrUpdateService(ctx, memberCluster.Client, svc); err != nil {
@@ -2595,6 +2585,7 @@ func (r *ShardedClusterReconcileHelper) reconcileMongosServices(ctx context.Cont
25952585
if mongosExternalAccess == nil {
25962586
mongosExternalAccess = r.sc.Spec.DbCommonSpec.ExternalAccessConfiguration
25972587
}
2588+
// Mongos always need external services if externalAccess is configured
25982589
if mongosExternalAccess != nil {
25992590
log.Debugf("creating external services for %s in cluster: %s", r.sc.MongosRsName(), memberCluster.Name)
26002591
svc, err := r.getPodExternalService(memberCluster,
@@ -2608,7 +2599,9 @@ func (r *ShardedClusterReconcileHelper) reconcileMongosServices(ctx context.Cont
26082599
if err = mekoService.CreateOrUpdateService(ctx, memberCluster.Client, svc); err != nil && !errors.IsAlreadyExists(err) {
26092600
return xerrors.Errorf("failed to create external service %s in cluster: %s, err: %w", svc.Name, memberCluster.Name, err)
26102601
}
2611-
} else {
2602+
}
2603+
// We don't need internal per-pod services in case we have externalAccess configured AND an external domain
2604+
if mongosExternalAccess == nil || mongosExternalAccess.ExternalDomain == nil {
26122605
log.Debugf("creating internal services for %s in cluster: %s", r.sc.MongosRsName(), memberCluster.Name)
26132606
svc := r.getPodService(r.sc.MongosRsName(), memberCluster, podNum, portOrDefault)
26142607
if err := mekoService.CreateOrUpdateService(ctx, memberCluster.Client, svc); err != nil && !errors.IsAlreadyExists(err) {
@@ -2617,6 +2610,7 @@ func (r *ShardedClusterReconcileHelper) reconcileMongosServices(ctx context.Cont
26172610

26182611
_ = append(allServices, &svc)
26192612
}
2613+
26202614
if err := createHeadlessServiceForStatefulSet(ctx, r.sc.MongosRsName(), portOrDefault, r.sc.Namespace, memberCluster); err != nil {
26212615
return err
26222616
}

controllers/operator/mongodbshardedcluster_controller_multi_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -550,21 +550,22 @@ func TestReconcileCreateMultiClusterShardedClusterWithExternalAccessAndNoExterna
550550
configSrvStsName := fmt.Sprintf("%s-config-%d", sc.Name, clusterIdx)
551551
configMembers := memberClusters.ConfigServerDistribution[clusterSpecItem.ClusterName]
552552
memberClusterChecks.checkInternalServices(ctx, configSrvStsName)
553-
memberClusterChecks.checkExternalServicesDontNotExist(ctx, configSrvStsName, configMembers)
553+
memberClusterChecks.checkExternalServicesDontExist(ctx, configSrvStsName, configMembers)
554554
memberClusterChecks.checkPerPodServices(ctx, configSrvStsName, configMembers)
555555

556556
mongosStsName := fmt.Sprintf("%s-mongos-%d", sc.Name, clusterIdx)
557557
mongosMembers := memberClusters.MongosDistribution[clusterSpecItem.ClusterName]
558558
memberClusterChecks.checkExternalServices(ctx, mongosStsName, mongosMembers)
559559
memberClusterChecks.checkInternalServices(ctx, mongosStsName)
560-
memberClusterChecks.checkPerPodServicesDontExist(ctx, mongosStsName, mongosMembers)
560+
// Without external domain, we need per-pod mongos services
561+
memberClusterChecks.checkPerPodServices(ctx, mongosStsName, mongosMembers)
561562
memberClusterChecks.checkServiceAnnotations(ctx, mongosStsName, mongosMembers, sc, clusterSpecItem.ClusterName, clusterIdx, test.ExampleAccessWithNoExternalDomain.MongosExternalDomain)
562563

563564
for shardIdx := 0; shardIdx < memberClusters.ShardCount(); shardIdx++ {
564565
shardStsName := fmt.Sprintf("%s-%d-%d", sc.Name, shardIdx, clusterIdx)
565566
shardMembers := memberClusters.ShardDistribution[shardIdx][clusterSpecItem.ClusterName]
566567
memberClusterChecks.checkInternalServices(ctx, shardStsName)
567-
memberClusterChecks.checkExternalServicesDontNotExist(ctx, shardStsName, shardMembers)
568+
memberClusterChecks.checkExternalServicesDontExist(ctx, shardStsName, shardMembers)
568569
memberClusterChecks.checkPerPodServices(ctx, shardStsName, shardMembers)
569570
}
570571
memberClusterChecks.checkHostnameOverrideConfigMap(ctx, fmt.Sprintf("%s-hostname-override", sc.Name), expectedHostnameOverrideMap)

controllers/operator/mongodbshardedcluster_controller_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,16 +149,16 @@ func TestReconcileCreateSingleClusterShardedClusterWithExternalDomainSimplest(t
149149
memberClusterChecks.checkServiceAnnotations(ctx, mongosStatefulSetName, sc.Spec.MongosCount, sc, multicluster.LegacyCentralClusterName, 0, test.ExampleExternalClusterDomains.SingleClusterDomain)
150150

151151
configServerStatefulSetName := fmt.Sprintf("%s-config", sc.Name)
152-
memberClusterChecks.checkExternalServicesDontNotExist(ctx, configServerStatefulSetName, sc.Spec.ConfigServerCount)
152+
memberClusterChecks.checkExternalServicesDontExist(ctx, configServerStatefulSetName, sc.Spec.ConfigServerCount)
153153
memberClusterChecks.checkPerPodServicesDontExist(ctx, configServerStatefulSetName, sc.Spec.ConfigServerCount)
154154
// This is something to be unified - why MC and SC Services are called differently?
155155
configServerInternalServiceName := fmt.Sprintf("%s-cs", sc.Name)
156156
memberClusterChecks.checkServiceExists(ctx, configServerInternalServiceName)
157157

158-
memberClusterChecks.checkExternalServicesDontNotExist(ctx, fmt.Sprintf("%s-config", sc.Name), sc.Spec.ConfigServerCount)
158+
memberClusterChecks.checkExternalServicesDontExist(ctx, fmt.Sprintf("%s-config", sc.Name), sc.Spec.ConfigServerCount)
159159
for shardIdx := 0; shardIdx < sc.Spec.ShardCount; shardIdx++ {
160160
shardStatefulSetName := fmt.Sprintf("%s-%d", sc.Name, shardIdx)
161-
memberClusterChecks.checkExternalServicesDontNotExist(ctx, shardStatefulSetName, sc.Spec.ShardCount)
161+
memberClusterChecks.checkExternalServicesDontExist(ctx, shardStatefulSetName, sc.Spec.ShardCount)
162162
memberClusterChecks.checkPerPodServicesDontExist(ctx, shardStatefulSetName, sc.Spec.ShardCount)
163163
// This is something to be unified - why MC and SC Services are called differently?
164164
shardInternalServiceName := fmt.Sprintf("%s-sh", sc.Name)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from collections import defaultdict
2+
from typing import Dict, List, Optional
3+
4+
import kubernetes
5+
from kubernetes import client
6+
from kubetester import find_fixture, try_load
7+
from kubetester.kubetester import ensure_ent_version
8+
from kubetester.mongodb import MongoDB, Phase
9+
from kubetester.operator import Operator
10+
from pytest import fixture, mark
11+
from tests import test_logger
12+
from tests.conftest import get_member_cluster_api_client, get_member_cluster_names
13+
from tests.shardedcluster.conftest import (
14+
enable_multi_cluster_deployment,
15+
setup_external_access,
16+
)
17+
18+
MDB_RESOURCE_NAME = "sh"
19+
logger = test_logger.get_test_logger(__name__)
20+
21+
22+
@fixture(scope="module")
23+
def sharded_cluster(namespace: str, custom_mdb_version: str) -> MongoDB:
24+
resource = MongoDB.from_yaml(
25+
find_fixture("sharded-cluster-multi-cluster.yaml"), namespace=namespace, name=MDB_RESOURCE_NAME
26+
)
27+
28+
if try_load(resource):
29+
return resource
30+
31+
resource.set_version(ensure_ent_version(custom_mdb_version))
32+
33+
enable_multi_cluster_deployment(resource=resource)
34+
setup_external_access(resource=resource, enable_external_domain=False)
35+
36+
resource.set_architecture_annotation()
37+
38+
return resource
39+
40+
41+
@mark.e2e_multi_cluster_sharded_external_access_no_ext_domain
42+
def test_deploy_operator(multi_cluster_operator: Operator):
43+
multi_cluster_operator.assert_is_running()
44+
45+
46+
@mark.e2e_multi_cluster_sharded_external_access_no_ext_domain
47+
def test_sharded_cluster(sharded_cluster: MongoDB):
48+
sharded_cluster.update()
49+
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800)
50+
51+
52+
def service_exists(service_name: str, namespace: str, api_client: Optional[kubernetes.client.ApiClient] = None) -> bool:
53+
try:
54+
client.CoreV1Api(api_client=api_client).read_namespaced_service(service_name, namespace)
55+
except client.rest.ApiException as e:
56+
logger.error(f"Error reading {service_name}: {e}")
57+
return False
58+
return True
59+
60+
61+
@mark.e2e_multi_cluster_sharded_external_access_no_ext_domain
62+
def test_services_were_created(sharded_cluster: MongoDB, namespace: str):
63+
resource_name = sharded_cluster.name
64+
expected_services: Dict[str, List[str]] = defaultdict(list)
65+
member_clusters = get_member_cluster_names()
66+
67+
# Global services
68+
for cluster in member_clusters:
69+
expected_services[cluster].append(f"{resource_name}-svc")
70+
expected_services[cluster].append(f"{resource_name}-{resource_name}")
71+
72+
# All components get a headless service and per-pod services
73+
# Config server also gets an additional headless service suffixed -cs
74+
config_clusters = sharded_cluster["spec"]["configSrv"]["clusterSpecList"]
75+
for idx, cluster_spec in enumerate(config_clusters):
76+
members = cluster_spec["members"]
77+
expected_services[cluster_spec["clusterName"]].append(f"{resource_name}-config-{idx}-svc")
78+
expected_services[cluster_spec["clusterName"]].append(f"{resource_name}-{idx}-cs")
79+
for pod in range(members):
80+
expected_services[cluster_spec["clusterName"]].append(f"{resource_name}-config-{idx}-{pod}-svc")
81+
82+
# Mongos also get an external service per pod
83+
mongos_clusters = sharded_cluster["spec"]["mongos"]["clusterSpecList"]
84+
for idx, cluster_spec in enumerate(mongos_clusters):
85+
members = cluster_spec["members"]
86+
cluster_name = cluster_spec["clusterName"]
87+
expected_services[cluster_name].append(f"{resource_name}-mongos-{idx}-svc")
88+
for pod in range(members):
89+
expected_services[cluster_name].append(f"{resource_name}-mongos-{idx}-{pod}-svc")
90+
expected_services[cluster_name].append(f"{resource_name}-mongos-{idx}-{pod}-svc-external")
91+
92+
shard_count = sharded_cluster["spec"]["shardCount"]
93+
shard_clusters = sharded_cluster["spec"]["shard"]["clusterSpecList"]
94+
for shard in range(shard_count):
95+
for idx, cluster_spec in enumerate(shard_clusters):
96+
members = cluster_spec["members"]
97+
cluster_name = cluster_spec["clusterName"]
98+
expected_services[cluster_name].append(f"{resource_name}-{shard}-{idx}-svc")
99+
for pod in range(members):
100+
expected_services[cluster_name].append(f"{resource_name}-{shard}-{idx}-{pod}-svc")
101+
102+
logger.debug("Asserting the following services exist:")
103+
for cluster, services in expected_services.items():
104+
logger.debug(f"Cluster: {cluster}, service count: {len(services)}")
105+
logger.debug(f"Services: {services}")
106+
107+
# Assert that each expected service exists in its corresponding cluster.
108+
for cluster, services in expected_services.items():
109+
api_client = get_member_cluster_api_client(cluster) # Retrieve the API client for the cluster
110+
for svc in services:
111+
assert service_exists(
112+
svc, namespace, api_client
113+
), f"Service {svc} not found. Cluster: {cluster} Namespace: {namespace}"

docker/mongodb-enterprise-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_simplest_no_mesh.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def sharded_cluster(namespace: str, custom_mdb_version: str) -> MongoDB:
2828
resource.set_version(ensure_ent_version(custom_mdb_version))
2929

3030
enable_multi_cluster_deployment(resource=resource)
31-
setup_external_access(resource=resource)
31+
setup_external_access(resource=resource, enable_external_domain=True)
3232

3333
resource.set_architecture_annotation()
3434

@@ -55,7 +55,7 @@ def test_deploy_operator(multi_cluster_operator: Operator):
5555
@mark.e2e_multi_cluster_sharded_simplest_no_mesh
5656
def test_sharded_cluster(sharded_cluster: MongoDB):
5757
sharded_cluster.update()
58-
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1800)
58+
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800)
5959

6060

6161
# Testing connectivity with External Access requires using the same DNS as deployed in Kube within

docker/mongodb-enterprise-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_tls_no_mesh.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def test_deploy_certs(all_certs, agent_certs):
152152
@mark.e2e_multi_cluster_sharded_tls_no_mesh
153153
def test_sharded_cluster_with_prefix_gets_to_running_state(sharded_cluster: MongoDB):
154154
sharded_cluster.update()
155-
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1800)
155+
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=800)
156156

157157

158158
# TODO: (slaskawi) clearly the client tries to connect to mongos without TLS (we can see this in the logs).

0 commit comments

Comments
 (0)