Skip to content

Commit e8271c4

Browse files
Merge pull request #115 from datum-cloud/feature/track-cert-health-on-proxy
Add hostname and proxy status conditions to show cert health
2 parents 0473f87 + 5981193 commit e8271c4

12 files changed

+1235
-55
lines changed

api/v1alpha/httpproxy_types.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,9 @@ const (
261261
// This condition is true when connector metadata has been programmed
262262
// via the downstream EnvoyPatchPolicy.
263263
HTTPProxyConditionConnectorMetadataProgrammed = "ConnectorMetadataProgrammed"
264+
265+
// This condition is true when all HTTPS hostnames have ready TLS certificates.
266+
HTTPProxyConditionCertificatesReady = "CertificatesReady"
264267
)
265268

266269
const (
@@ -283,6 +286,25 @@ const (
283286
// HostnameConditionAvailable indicates whether the hostname was successfully
284287
// claimed by this resource or is already in use by another resource.
285288
HostnameConditionAvailable = "Available"
289+
290+
// HostnameConditionCertificateReady tracks whether a TLS certificate has been
291+
// provisioned for this hostname (cert-manager Certificate in the downstream cluster).
292+
HostnameConditionCertificateReady = "CertificateReady"
293+
)
294+
295+
// Reasons for HostnameConditionCertificateReady.
296+
const (
297+
// CertificateReadyReasonCertificateIssued indicates the certificate has been issued and is ready.
298+
CertificateReadyReasonCertificateIssued = "CertificateIssued"
299+
300+
// CertificateReadyReasonPending indicates the certificate is not yet ready (e.g. not found or provisioning).
301+
CertificateReadyReasonPending = "Pending"
302+
303+
// CertificateReadyReasonProvisioningFailed indicates certificate provisioning failed.
304+
CertificateReadyReasonProvisioningFailed = "ProvisioningFailed"
305+
306+
// CertificateReadyReasonChallengeInProgress indicates an ACME challenge is in progress.
307+
CertificateReadyReasonChallengeInProgress = "ChallengeInProgress"
286308
)
287309

288310
// Reasons for HostnameConditionAvailable.
@@ -316,9 +338,16 @@ const (
316338
DNSRecordReasonZoneNotReady = "DNSZoneNotReady"
317339

318340
// DNSRecordReasonDomainNotVerified indicates the Domain resource for this
319-
// hostname has not been verified via a DNSZone.
341+
// hostname has not been verified.
320342
DNSRecordReasonDomainNotVerified = "DomainNotVerified"
321343

344+
// DNSRecordReasonDNSAuthorityMissing indicates the Domain is verified but
345+
// Datum DNS does not have authority over the domain. This happens when:
346+
// - The DNSZone is not ready (Accepted/Programmed conditions not True)
347+
// - The DNSZone has no nameservers assigned yet
348+
// - The domain's nameservers don't include the DNSZone's nameservers
349+
DNSRecordReasonDNSAuthorityMissing = "DNSAuthorityMissing"
350+
322351
// DNSRecordReasonConflict indicates an existing DNSRecordSet for this
323352
// hostname is managed by a different actor.
324353
DNSRecordReasonConflict = "ConflictWithUserRecord"
@@ -353,6 +382,18 @@ const (
353382
DNSRecordsProgrammedReasonPartialFailure = "PartialFailure"
354383
)
355384

385+
// Reasons for HTTPProxyConditionCertificatesReady.
386+
const (
387+
// CertificatesReadyReasonAllCertificatesReady indicates all HTTPS hostnames have ready certificates.
388+
CertificatesReadyReasonAllCertificatesReady = "AllCertificatesReady"
389+
390+
// CertificatesReadyReasonCertificatesPending indicates one or more certificates are pending or in progress.
391+
CertificatesReadyReasonCertificatesPending = "CertificatesPending"
392+
393+
// CertificatesReadyReasonCertificatesFailed indicates one or more certificate provisioning attempts failed.
394+
CertificatesReadyReasonCertificatesFailed = "CertificatesFailed"
395+
)
396+
356397
const (
357398

358399
// HTTPProxyReasonAccepted indicates that the HTTP proxy has been accepted.
@@ -394,6 +435,7 @@ const (
394435
//
395436
// +kubebuilder:printcolumn:name="Hostname",type=string,JSONPath=`.status.hostnames[*]`
396437
// +kubebuilder:printcolumn:name="Programmed",type=string,JSONPath=`.status.conditions[?(@.type=="Programmed")].status`
438+
// +kubebuilder:printcolumn:name="Certificates",type=string,JSONPath=`.status.conditions[?(@.type=="CertificatesReady")].status`
397439
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
398440
type HTTPProxy struct {
399441
metav1.TypeMeta `json:",inline"`

api/v1alpha/httpproxy_types_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ func TestConditionConstants(t *testing.T) {
216216
assert.Equal(t, "DNSZoneNotFound", networkingv1alpha.DNSRecordReasonZoneNotFound)
217217
assert.Equal(t, "DNSZoneNotReady", networkingv1alpha.DNSRecordReasonZoneNotReady)
218218
assert.Equal(t, "DomainNotVerified", networkingv1alpha.DNSRecordReasonDomainNotVerified)
219+
assert.Equal(t, "DNSAuthorityMissing", networkingv1alpha.DNSRecordReasonDNSAuthorityMissing)
219220
assert.Equal(t, "ConflictWithUserRecord", networkingv1alpha.DNSRecordReasonConflict)
220221
assert.Equal(t, "RecordCreationFailed", networkingv1alpha.DNSRecordReasonFailed)
221222
assert.Equal(t, "RetryPending", networkingv1alpha.DNSRecordReasonRetryPending)

config/rbac_downstream/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,14 @@ rules:
134134
- httproutes/status
135135
verbs:
136136
- get
137+
- apiGroups:
138+
- cert-manager.io
139+
resources:
140+
- certificates
141+
verbs:
142+
- get
143+
- list
144+
- watch
137145
- apiGroups:
138146
- gateway.envoyproxy.io
139147
resources:

internal/controller/domain_controller.go

Lines changed: 2 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"go.datum.net/network-services-operator/internal/config"
3434
"go.datum.net/network-services-operator/internal/registrydata"
3535
conditionutil "go.datum.net/network-services-operator/internal/util/condition"
36+
dnsutil "go.datum.net/network-services-operator/internal/util/dns"
3637
)
3738

3839
// DomainReconciler reconciles a Domain object
@@ -320,34 +321,6 @@ var dnsZoneListGVK = schema.GroupVersionKind{
320321
Kind: "DNSZoneList",
321322
}
322323

323-
func normalizeNameserverHostname(s string) string {
324-
return strings.ToLower(strings.TrimSuffix(strings.TrimSpace(s), "."))
325-
}
326-
327-
func hasStringOverlapNormalized(a, b []string) bool {
328-
if len(a) == 0 || len(b) == 0 {
329-
return false
330-
}
331-
set := make(map[string]struct{}, len(a))
332-
for _, v := range a {
333-
n := normalizeNameserverHostname(v)
334-
if n == "" {
335-
continue
336-
}
337-
set[n] = struct{}{}
338-
}
339-
for _, v := range b {
340-
n := normalizeNameserverHostname(v)
341-
if n == "" {
342-
continue
343-
}
344-
if _, ok := set[n]; ok {
345-
return true
346-
}
347-
}
348-
return false
349-
}
350-
351324
func (r *DomainReconciler) attemptDNSZoneVerification(
352325
ctx context.Context,
353326
reader client.Reader,
@@ -439,7 +412,7 @@ func (r *DomainReconciler) attemptDNSZoneVerification(
439412
verifiedDNSZoneCondition.Message = fmt.Sprintf("Waiting for Domain status.nameservers before verifying via DNSZone %q", zoneName)
440413
continue
441414
}
442-
if !hasStringOverlapNormalized(domainNS, zoneNS) {
415+
if !dnsutil.HasNameserverOverlap(domainNS, zoneNS) {
443416
verifiedDNSZoneCondition.Reason = networkingv1alpha.DomainReasonDNSZoneNameserverMismatch
444417
verifiedDNSZoneCondition.Message = fmt.Sprintf("Domain nameservers do not match DNSZone %q nameservers yet", zoneName)
445418
continue

internal/controller/gateway_controller.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,14 @@ func (r *GatewayReconciler) finalizeGateway(
797797
) (result Result) {
798798
logger := log.FromContext(ctx)
799799
logger.Info("finalizing gateway")
800+
801+
// Clean up DNS records created by this gateway
802+
if r.Config.Gateway.EnableDNSIntegration {
803+
if cleanupResult := r.cleanupDNSRecordSets(ctx, upstreamClient, upstreamGateway); cleanupResult.ShouldReturn() {
804+
return cleanupResult
805+
}
806+
}
807+
800808
// Go through downstream http routes that are attached to the downstream
801809
// gateway and remove the parentRef from the status. If it's the last parent
802810
// ref, delete the downstream route. If there's a race condition on delete/create,
@@ -870,6 +878,54 @@ func (r *GatewayReconciler) finalizeGateway(
870878
return result
871879
}
872880

881+
// cleanupDNSRecordSets deletes all DNSRecordSet resources that were created by
882+
// this gateway. This is called during gateway finalization to ensure DNS records
883+
// are cleaned up when the gateway is deleted, since owner reference-based garbage
884+
// collection doesn't work (the dns-operator sets itself as the controller owner).
885+
func (r *GatewayReconciler) cleanupDNSRecordSets(
886+
ctx context.Context,
887+
upstreamClient client.Client,
888+
upstreamGateway *gatewayv1.Gateway,
889+
) (result Result) {
890+
logger := log.FromContext(ctx)
891+
892+
var recordSetList dnsv1alpha1.DNSRecordSetList
893+
if err := upstreamClient.List(ctx, &recordSetList,
894+
client.InNamespace(upstreamGateway.Namespace),
895+
client.MatchingLabels{
896+
labelDNSManaged: "true",
897+
labelManagedBy: labelManagedByValue,
898+
labelDNSSourceKind: KindGateway,
899+
labelDNSSourceName: upstreamGateway.Name,
900+
labelDNSSourceNS: upstreamGateway.Namespace,
901+
},
902+
); err != nil {
903+
// If the CRD doesn't exist, there's nothing to clean up
904+
if apimeta.IsNoMatchError(err) {
905+
return result
906+
}
907+
result.Err = fmt.Errorf("failed listing DNSRecordSets for cleanup: %w", err)
908+
return result
909+
}
910+
911+
for _, rs := range recordSetList.Items {
912+
logger.Info("deleting DNSRecordSet during gateway finalization",
913+
"name", rs.Name,
914+
"hostname", rs.Annotations[annotationDNSHostname],
915+
)
916+
if err := upstreamClient.Delete(ctx, &rs); err != nil && !apierrors.IsNotFound(err) {
917+
result.Err = fmt.Errorf("failed to delete DNSRecordSet %q: %w", rs.Name, err)
918+
return result
919+
}
920+
}
921+
922+
if len(recordSetList.Items) > 0 {
923+
logger.Info("deleted DNSRecordSets during gateway finalization", "count", len(recordSetList.Items))
924+
}
925+
926+
return result
927+
}
928+
873929
// TODO(jreese) revisit the parameters here to clean them up. It's a bit messy
874930
// as this function is used against both the upstream and downstream resources.
875931
func (r *GatewayReconciler) detachHTTPRoutes(

internal/controller/gateway_dns_controller.go

Lines changed: 67 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile"
2626

2727
networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha"
28+
dnsutil "go.datum.net/network-services-operator/internal/util/dns"
2829
dnsv1alpha1 "go.miloapis.com/dns-operator/api/v1alpha1"
2930
)
3031

@@ -102,7 +103,7 @@ func (r *GatewayReconciler) ensureDNSRecordSets(
102103
continue
103104
}
104105

105-
// Find the most specific Domain + DNSZone combination.
106+
// Find the most specific Domain + DNSZone combination where Datum DNS has authority.
106107
var domain *networkingv1alpha.Domain
107108
var dnsZone *dnsv1alpha1.DNSZone
108109
var matchedZoneName string
@@ -114,10 +115,9 @@ func (r *GatewayReconciler) ensureDNSRecordSets(
114115
continue
115116
}
116117

117-
// Check if the Domain has VerifiedDNSZone=True.
118-
if !apimeta.IsStatusConditionTrue(d.Status.Conditions, networkingv1alpha.DomainConditionVerifiedDNSZone) {
119-
// Domain exists but isn't verified; record this for potential error message
120-
// but keep looking for a more specific verified domain.
118+
// Check if the Domain is verified (ownership proven via any method).
119+
if !apimeta.IsStatusConditionTrue(d.Status.Conditions, networkingv1alpha.DomainConditionVerified) {
120+
// Domain exists but isn't verified; keep looking.
121121
continue
122122
}
123123

@@ -135,36 +135,87 @@ func (r *GatewayReconciler) ensureDNSRecordSets(
135135
continue
136136
}
137137

138-
// Found a matching Domain + DNSZone.
138+
// Check if Datum DNS has authority (DNSZone ready + nameservers match).
139+
zone := &dnsZoneList.Items[0]
140+
if !dnsutil.HasDNSAuthority(&d, zone) {
141+
// Domain verified but Datum DNS doesn't have authority yet.
142+
continue
143+
}
144+
145+
// Found a matching Domain + DNSZone where Datum DNS has authority.
139146
domain = &d
140-
dnsZone = &dnsZoneList.Items[0]
147+
dnsZone = zone
141148
matchedZoneName = zoneName
142149
break
143150
}
144151

145152
// If no matching Domain + DNSZone found, check why and set appropriate status.
146153
if domain == nil || dnsZone == nil {
147154
// Try to provide a helpful message by checking what we found.
148-
var unverifiedDomain *networkingv1alpha.Domain
155+
var (
156+
unverifiedDomain *networkingv1alpha.Domain
157+
noAuthorityDomain *networkingv1alpha.Domain
158+
noAuthorityZone *dnsv1alpha1.DNSZone
159+
)
149160
for _, zoneName := range zoneNames {
150161
d, found := findDomainByName(domainList.Items, zoneName)
151-
if found {
152-
if !apimeta.IsStatusConditionTrue(d.Status.Conditions, networkingv1alpha.DomainConditionVerifiedDNSZone) {
153-
unverifiedDomain = &d
154-
break
155-
}
162+
if !found {
163+
continue
164+
}
165+
166+
// Check if domain is verified
167+
if !apimeta.IsStatusConditionTrue(d.Status.Conditions, networkingv1alpha.DomainConditionVerified) {
168+
unverifiedDomain = &d
169+
break
170+
}
171+
172+
// Domain is verified; check if there's a DNSZone
173+
var dnsZoneList dnsv1alpha1.DNSZoneList
174+
if err := upstreamClient.List(ctx, &dnsZoneList,
175+
client.InNamespace(upstreamGateway.Namespace),
176+
client.MatchingFields{dnsZoneDomainNameIndex: zoneName},
177+
); err != nil {
178+
continue
156179
}
180+
if len(dnsZoneList.Items) == 0 {
181+
continue
182+
}
183+
184+
// DNSZone exists but Datum DNS doesn't have authority
185+
noAuthorityDomain = &d
186+
noAuthorityZone = &dnsZoneList.Items[0]
187+
break
157188
}
158189

159-
if unverifiedDomain != nil {
190+
switch {
191+
case unverifiedDomain != nil:
160192
apimeta.SetStatusCondition(&hs.Conditions, metav1.Condition{
161193
Type: networkingv1alpha.HostnameConditionDNSRecordProgrammed,
162194
Status: metav1.ConditionFalse,
163195
Reason: networkingv1alpha.DNSRecordReasonDomainNotVerified,
164-
Message: fmt.Sprintf("Domain %q has not been verified via a DNSZone (VerifiedDNSZone condition is not True)", unverifiedDomain.Name),
196+
Message: fmt.Sprintf("Domain %q ownership has not been verified", unverifiedDomain.Name),
197+
ObservedGeneration: upstreamGateway.Generation,
198+
})
199+
case noAuthorityDomain != nil:
200+
msg := fmt.Sprintf("Domain %q is verified but Datum DNS does not have authority", noAuthorityDomain.Name)
201+
if noAuthorityZone != nil {
202+
if !apimeta.IsStatusConditionTrue(noAuthorityZone.Status.Conditions, "Accepted") ||
203+
!apimeta.IsStatusConditionTrue(noAuthorityZone.Status.Conditions, "Programmed") {
204+
msg = fmt.Sprintf("DNSZone %q is not ready (waiting for Accepted and Programmed conditions)", noAuthorityZone.Name)
205+
} else if len(noAuthorityZone.Status.Nameservers) == 0 {
206+
msg = fmt.Sprintf("DNSZone %q has no nameservers assigned yet", noAuthorityZone.Name)
207+
} else {
208+
msg = fmt.Sprintf("Domain %q nameservers do not include DNSZone %q nameservers; update your registrar's NS records", noAuthorityDomain.Name, noAuthorityZone.Name)
209+
}
210+
}
211+
apimeta.SetStatusCondition(&hs.Conditions, metav1.Condition{
212+
Type: networkingv1alpha.HostnameConditionDNSRecordProgrammed,
213+
Status: metav1.ConditionFalse,
214+
Reason: networkingv1alpha.DNSRecordReasonDNSAuthorityMissing,
215+
Message: msg,
165216
ObservedGeneration: upstreamGateway.Generation,
166217
})
167-
} else {
218+
default:
168219
apimeta.SetStatusCondition(&hs.Conditions, metav1.Condition{
169220
Type: networkingv1alpha.HostnameConditionDNSRecordProgrammed,
170221
Status: metav1.ConditionTrue,

0 commit comments

Comments
 (0)