diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8362e1a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,185 @@ +# Changelog + +Notable changes to this project will be documented in this file. To keep it lightweight, releases 2+ minor versions back will be churned regularly. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). + +## [0.10.x] - Unreleased + +### Changed + +#### DNS Errors Now Reported With Fallback (Breaking Change) + +Previously, when DNS resolution failed but a cached address was available for fallback, the error was silently swallowed and `WatcherResponse.Err` would be `nil`. This made it impossible to detect DNS degradation in monitoring systems. + +**Now**, DNS errors are always reported in `WatcherResponse.Err`, even when fallback succeeds. This provides visibility into DNS issues while maintaining resilience. + +**Before:** +```go +// DNS fails but fallback uses cached address → resp.Err == nil +resp := <-wadjit.Responses() +if resp.Err != nil { + // Only triggered for hard failures + log.Error("Request failed", resp.Err) +} +``` + +**After:** +```go +// DNS fails with fallback → resp.Err != nil (contains DNS error) +resp := <-wadjit.Responses() +if resp.Err != nil { + // This now triggers for DNS failures too! + // But request may have completed using cached address + log.Error("Request failed", resp.Err) +} +``` + +### Added + +#### New DNS Metadata Fields + +Two new fields added to `DNSMetadata` and `DNSDecision`: + +- `FallbackUsed bool`: Indicates whether a cached address was used due to DNS lookup failure +- `Err error`: Contains the DNS resolution error, if any occurred + +**Access DNS error information:** +```go +resp := <-wadjit.Responses() +md := resp.Metadata() +if md.DNS != nil { + if md.DNS.FallbackUsed { + log.Warn("Using cached DNS entry", "error", md.DNS.Err) + } + if md.DNS.Err != nil { + log.Warn("DNS error occurred", "error", md.DNS.Err) + } +} +``` + +#### Enhanced DNS Decision Callbacks + +DNS decision hooks now receive fallback and error information: + +```go +policy := wadjit.DNSPolicy{ + Mode: wadjit.DNSRefreshTTL, + TTLMin: time.Minute, + DecisionCallback: func(ctx context.Context, decision wadjit.DNSDecision) { + if decision.FallbackUsed { + log.Warn("DNS fallback triggered", "error", decision.Err) + } + if decision.Err != nil { + metrics.RecordDNSError(decision.Host, decision.Err) + } + }, +} +``` + +### Migration Guide + +#### 1. Distinguish Between Hard Failures and Fallback Scenarios + +Update error handling to check if fallback was used: + +```go +resp := <-wadjit.Responses() +if resp.Err != nil { + md := resp.Metadata() + if md.DNS != nil && md.DNS.FallbackUsed { + // DNS failed but request completed with cached address + log.Warn("DNS fallback used", + "error", md.DNS.Err, + "cached_addr", md.DNS.ResolvedAddrs) + // Maybe alert but don't fail completely + } else { + // Hard failure - request did not complete + log.Error("Request failed completely", resp.Err) + // Definitely alert/retry + } +} +``` + +#### 2. Update Monitoring and Alerting + +Expect increased error counts in dashboards - this is correct behavior. DNS failures were always happening, just not visible before. + +Distinguish between error types in metrics: + +```go +resp := <-wadjit.Responses() +if resp.Err != nil { + md := resp.Metadata() + if md.DNS != nil { + if md.DNS.FallbackUsed { + metrics.IncrCounter("dns.fallback.used") // Degraded but working + metrics.IncrCounter("dns.errors.soft") + } else if md.DNS.Err != nil { + metrics.IncrCounter("dns.errors.hard") // Complete failure + } + } + metrics.IncrCounter("requests.errors.total") +} +``` + +#### 3. Refine Retry Logic + +Consider different retry strategies for fallback vs hard failures: + +```go +func shouldRetry(resp WatcherResponse) bool { + if resp.Err == nil { + return false // Success + } + + md := resp.Metadata() + if md.DNS != nil && md.DNS.FallbackUsed { + // DNS failed but got cached data - maybe don't retry immediately + // as the DNS issue might be temporary + return false + } + + // Hard failure or non-DNS error - retry + return true +} +``` + +#### 4. Update Error Propagation Logic + +If code assumed `Err == nil` means complete success: + +```go +// OLD - May need updating +if resp.Err != nil { + return fmt.Errorf("health check failed: %w", resp.Err) +} + +// NEW - More nuanced +if resp.Err != nil { + md := resp.Metadata() + if md.DNS != nil && md.DNS.FallbackUsed { + // Degrade gracefully - log warning but continue + metrics.IncrDNSFallback(resp.URL.Host) + } else { + // Hard failure - propagate error + return fmt.Errorf("health check failed: %w", resp.Err) + } +} +``` + +### Recommended Migration Steps + +1. **Update error handling** to check for `DNS.FallbackUsed` before treating errors as hard failures +2. **Update monitoring** to distinguish soft (fallback) vs hard DNS failures +3. **Review retry logic** - consider not retrying immediately on fallback errors +4. **Test in staging** - watch for increased error counts (expected and correct) +5. **Update runbooks** - DNS fallback errors may now trigger alerts that need different responses + +### Technical Details + +- Modified `dnsResolveOutcome` to include `fallbackErr` field for error propagation +- Updated `resolveTTL()`, `resolveCadence()`, and `resolveSingle()` to return DNS decisions even on error +- Added `lastDecision` field to `dnsPolicyManager` for error reporting in HTTP responses +- Created `httpTaskResponseError` type to attach DNS metadata to error responses +- Enhanced decision context propagation to include errors and fallback state diff --git a/README.md b/README.md index bde4a55..2198185 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ Wadjit (pronounced /ˈwɒdʒɪt/, or "watch it") is a program for endpoint monitoring and analysis. +> **Note**: Version 0.10.x introduces breaking changes to DNS error handling. See [CHANGELOG.md](CHANGELOG.md#010x) for migration details. + > Wadjet is the ancient Egyptian goddess of protection and royal authority. She is sometimes shown as the Eye of Ra, acting as a protector of the country and the king, and her vigilant eye would watch over the land. - [Wikipedia](https://en.wikipedia.org/wiki/Wadjet) `wadjit.New()` creates a manager for an arbitrary number of watchers. The watchers monitor pre-defined endpoints according to their configuration, and feed the results back to the manager. A single Wadjit manager can hold watchers for many different tasks, as responses on the response channel are separated by watcher ID, or you may choose to create several managers as a more strict separation of concerns. @@ -23,6 +25,10 @@ go get github.com/jkbrsn/wadjit@latest - Buffered responses: Non-blocking channel with watcher IDs and metadata. - Metrics: Access scheduler metrics via `Metrics()`. +## Changelog + +The [CHANGELOG.md](CHANGELOG.md) shows recent changes and explains mitigations that might be needed when migrating from one version to the next. + ## Quick Start Minimal example with one HTTP and one WS task and basic response handling: @@ -142,6 +148,26 @@ Guard rails add safety nets on top of any mode. Configure `DNSGuardRailPolicy` w Set a global default for every watcher by supplying `wadjit.WithDefaultDNSPolicy(...)` when creating the Wadjit. Endpoints that do not call `WithDNSPolicy` inherit this default automatically, while explicit endpoint policies still win. +##### Error Handling + +DNS errors are always reported in `WatcherResponse.Err`, even when fallback to cached addresses succeeds. This provides visibility into DNS degradation while maintaining resilience: + +```go +resp := <-manager.Responses() +if resp.Err != nil { + md := resp.Metadata() + if md.DNS != nil && md.DNS.FallbackUsed { + // DNS failed but request completed with cached address + log.Warn("DNS fallback used", "error", md.DNS.Err) + } else { + // Hard failure - request did not complete + log.Error("Request failed", resp.Err) + } +} +``` + +**New in 0.10.x**: `DNSMetadata` includes `FallbackUsed bool` and `Err error` fields to distinguish between degraded (fallback) and failed states. See [CHANGELOG.md](CHANGELOG.md#010x) for migration guidance. + ##### Examples Assuming a parsed target URL: diff --git a/dns_policy.go b/dns_policy.go index 330cbc1..defa333 100644 --- a/dns_policy.go +++ b/dns_policy.go @@ -102,6 +102,7 @@ type DNSDecision struct { LookupDuration time.Duration GuardRailTriggered bool Err error + FallbackUsed bool } // Validate ensures the DNS policy fields are coherent. diff --git a/dns_policy_manager.go b/dns_policy_manager.go index b3212a1..c5bb238 100644 --- a/dns_policy_manager.go +++ b/dns_policy_manager.go @@ -32,11 +32,12 @@ type dnsPolicyManager struct { resolver TTLResolver hook DNSDecisionCallback - mu sync.Mutex - cache dnsCacheEntry - cadenceNext time.Time - forceLookup bool - guard guardRailState + mu sync.Mutex + cache dnsCacheEntry + cadenceNext time.Time + forceLookup bool + guard guardRailState + lastDecision *DNSDecision // Last DNS decision for error reporting lookupGroup singleflight.Group } @@ -66,6 +67,7 @@ type dnsResolveOutcome struct { address string forceNewConn bool decision *DNSDecision + fallbackErr error } const serverErrorThreshold = 500 @@ -95,12 +97,36 @@ func (m *dnsPolicyManager) prepareRequest( outcome, err := m.resolve(ctx, host, port) if err != nil { - if outcome.decision != nil && m.hook != nil { + if outcome.decision != nil { outcome.decision.Err = err - m.hook(ctx, *outcome.decision) + // Store decision for retrieval after error + m.mu.Lock() + m.lastDecision = outcome.decision + m.mu.Unlock() + if m.hook != nil { + m.hook(ctx, *outcome.decision) + } } return ctx, false, err } + + if outcome.fallbackErr != nil && outcome.decision != nil { + outcome.decision.FallbackUsed = true + // Store decision for retrieval after error + m.mu.Lock() + m.lastDecision = outcome.decision + m.mu.Unlock() + // Add decision to context even when returning error + resultCtx := context.WithValue(ctx, dnsDecisionKey{}, *outcome.decision) + if outcome.address != "" { + plan := dnsDialPlan{target: outcome.address} + resultCtx = context.WithValue(resultCtx, dnsPlanKey{}, plan) + } + if m.hook != nil { + m.hook(resultCtx, *outcome.decision) + } + return resultCtx, outcome.forceNewConn, outcome.fallbackErr + } guardTriggered := m.consumeGuardTrigger() decision := outcome.decision if decision == nil && (m.hook != nil || guardTriggered) { @@ -184,7 +210,14 @@ func (m *dnsPolicyManager) resolveSingle( addrs, ttl, lookupDur, err := m.lookup(ctx, host) if err != nil { - return dnsResolveOutcome{}, err + // No cache available - return error with decision for observability + decision := DNSDecision{ + Host: host, + Mode: m.policy.Mode, + LookupDuration: lookupDur, + Err: err, + } + return dnsResolveOutcome{decision: &decision}, err } entry := dnsCacheEntry{addrs: addrs, lastLookup: time.Now()} @@ -244,9 +277,16 @@ func (m *dnsPolicyManager) resolveTTL( ExpiresAt: cached.expiresAt, Err: err, } - return dnsResolveOutcome{address: addr, decision: &decision}, nil + return dnsResolveOutcome{address: addr, decision: &decision, fallbackErr: err}, nil } - return dnsResolveOutcome{}, err + // No cache available - return error with decision for observability + decision := DNSDecision{ + Host: host, + Mode: m.policy.Mode, + LookupDuration: lookupDur, + Err: err, + } + return dnsResolveOutcome{decision: &decision}, err } ttl = m.policy.normalizeTTL(ttl) @@ -311,9 +351,16 @@ func (m *dnsPolicyManager) resolveCadence( ExpiresAt: cadenceNext, Err: err, } - return dnsResolveOutcome{address: addr, decision: &decision}, nil + return dnsResolveOutcome{address: addr, decision: &decision, fallbackErr: err}, nil + } + // No cache available - return error with decision for observability + decision := DNSDecision{ + Host: host, + Mode: m.policy.Mode, + LookupDuration: lookupDur, + Err: err, } - return dnsResolveOutcome{}, err + return dnsResolveOutcome{decision: &decision}, err } entry := dnsCacheEntry{addrs: addrs, lastLookup: now} @@ -408,6 +455,13 @@ func (*dnsPolicyManager) dialContext( } } +// getLastDecision retrieves the last DNS decision, if any. This is useful for error reporting. +func (m *dnsPolicyManager) getLastDecision() *DNSDecision { + m.mu.Lock() + defer m.mu.Unlock() + return m.lastDecision +} + // observeResult records request outcomes to drive guard-rail thresholds. func (m *dnsPolicyManager) observeResult(statusCode int, resultErr error) { if !m.guard.policy.Enabled() { diff --git a/dns_policy_manager_test.go b/dns_policy_manager_test.go index 6f1f7a1..ec61b03 100644 --- a/dns_policy_manager_test.go +++ b/dns_policy_manager_test.go @@ -29,6 +29,25 @@ func TestDNSPolicyManagerDefaultMode(t *testing.T) { assert.Nil(t, ctx.Value(dnsPlanKey{})) } +func TestDNSPolicyManagerDefaultModeError(t *testing.T) { + t.Parallel() + + // Use an invalid hostname that should fail DNS resolution + u := &url.URL{Scheme: "https", Host: "invalid.hostname.that.does.not.exist.test"} + + mgr := newDNSPolicyManager(DNSPolicy{Mode: DNSRefreshDefault}, nil) + transport := &http.Transport{} + + // Default mode uses Go's native resolver, which should eventually fail + // We're just verifying no panic and graceful error handling + ctx, force, err := mgr.prepareRequest(context.Background(), u, transport) + // Default mode doesn't do custom DNS resolution, so this typically succeeds + // at the prepareRequest level (error comes later during actual dial) + require.NoError(t, err) + assert.False(t, force) + assert.Nil(t, ctx.Value(dnsPlanKey{})) +} + func TestDNSPolicyManagerTTLRefresh(t *testing.T) { t.Parallel() @@ -36,7 +55,10 @@ func TestDNSPolicyManagerTTLRefresh(t *testing.T) { u := &url.URL{Scheme: "https", Host: host} policy := DNSPolicy{Mode: DNSRefreshTTL, TTLMin: time.Second, TTLMax: 10 * time.Second} - mgr := newDNSPolicyManager(policy, nil) + decisionCh := make(chan DNSDecision, 4) + mgr := newDNSPolicyManager(policy, func(_ context.Context, d DNSDecision) { + decisionCh <- d + }) mgr.resolver = newTestResolver([]netip.Addr{netip.MustParseAddr("192.0.2.10")}, 5*time.Second) transport := &http.Transport{} @@ -153,7 +175,10 @@ func TestDNSPolicyManagerTTLFallbackGuardRail(t *testing.T) { Action: DNSGuardRailActionForceLookup, }, } - mgr := newDNSPolicyManager(policy, nil) + decisionCh := make(chan DNSDecision, 4) + mgr := newDNSPolicyManager(policy, func(_ context.Context, d DNSDecision) { + decisionCh <- d + }) initialAddr := netip.MustParseAddr("192.0.2.80") resolver := newTestResolver([]netip.Addr{initialAddr}, 20*time.Millisecond) mgr.resolver = resolver @@ -168,16 +193,26 @@ func TestDNSPolicyManagerTTLFallbackGuardRail(t *testing.T) { firstExpiry := decision.ExpiresAt require.False(t, firstExpiry.IsZero()) assert.Equal(t, initialAddr, decision.ResolvedAddrs[0]) + select { + case <-decisionCh: + default: + } time.Sleep(2 * policy.TTLMin) resolver.SetError(errors.New("lookup failed")) - ctx, force, err = mgr.prepareRequest(context.Background(), u, transport) - require.NoError(t, err) - assert.False(t, force, "fallback should reuse cached address when lookup fails") - decision, ok = ctx.Value(dnsDecisionKey{}).(DNSDecision) - require.True(t, ok) + _, force, err = mgr.prepareRequest(context.Background(), u, transport) + require.Error(t, err) + assert.ErrorContains(t, err, "lookup failed") + assert.False(t, force, + "fallback should reuse cached address when lookup fails but surface error") + select { + case decision = <-decisionCh: + case <-time.After(time.Second): + require.Fail(t, "expected decision hook to fire") + } + assert.True(t, decision.FallbackUsed) require.NotNil(t, decision.Err) assert.Equal(t, initialAddr, decision.ResolvedAddrs[0]) assert.Equal(t, firstExpiry, decision.ExpiresAt) @@ -231,6 +266,144 @@ func TestDNSPolicyManagerTTLFallbackDisabled(t *testing.T) { assert.False(t, force, "no dial plan provided on error") } +func TestDNSPolicyManagerCadenceFallbackError(t *testing.T) { + t.Parallel() + + host := "cadence-fallback.example" + u := &url.URL{Scheme: "https", Host: host} + + policy := DNSPolicy{ + Mode: DNSRefreshCadence, + Cadence: 20 * time.Millisecond, + } + decisionCh := make(chan DNSDecision, 4) + mgr := newDNSPolicyManager(policy, func(_ context.Context, d DNSDecision) { + decisionCh <- d + }) + initialAddr := netip.MustParseAddr("192.0.2.100") + resolver := newTestResolver([]netip.Addr{initialAddr}, 0) + mgr.resolver = resolver + + transport := &http.Transport{} + + // Initial successful lookup + ctx, force, err := mgr.prepareRequest(context.Background(), u, transport) + require.NoError(t, err) + assert.True(t, force, "first lookup should force new connection") + decision, ok := ctx.Value(dnsDecisionKey{}).(DNSDecision) + require.True(t, ok) + assert.Equal(t, initialAddr, decision.ResolvedAddrs[0]) + firstExpiry := decision.ExpiresAt + require.False(t, firstExpiry.IsZero()) + + // Clear any decision from the channel + select { + case <-decisionCh: + default: + } + + // Wait for cadence to expire + time.Sleep(policy.Cadence + 10*time.Millisecond) + + // Make resolver fail + resolver.SetError(errors.New("cadence lookup failed")) + + // Next request should return error but use cached address + _, force, err = mgr.prepareRequest(context.Background(), u, transport) + require.Error(t, err) + assert.ErrorContains(t, err, "cadence lookup failed") + assert.False(t, force, "fallback should reuse cached address when lookup fails") + + // Verify decision hook received fallback information + select { + case decision = <-decisionCh: + case <-time.After(time.Second): + require.Fail(t, "expected decision hook to fire") + } + assert.True(t, decision.FallbackUsed) + require.NotNil(t, decision.Err) + assert.ErrorContains(t, decision.Err, "cadence lookup failed") + assert.Equal(t, initialAddr, decision.ResolvedAddrs[0]) + assert.Equal(t, firstExpiry, decision.ExpiresAt) +} + +func TestDNSPolicyManagerDNSFailureWithoutCache(t *testing.T) { + t.Parallel() + + host := "no-cache.example" + u := &url.URL{Scheme: "https", Host: host} + + policy := DNSPolicy{ + Mode: DNSRefreshTTL, + TTLMin: 10 * time.Millisecond, + TTLMax: 30 * time.Millisecond, + } + decisionCh := make(chan DNSDecision, 4) + mgr := newDNSPolicyManager(policy, func(_ context.Context, d DNSDecision) { + decisionCh <- d + }) + + // Resolver fails immediately (no successful lookup to establish cache) + resolver := newTestResolver(nil, 0) + resolver.SetError(errors.New("initial lookup failed")) + mgr.resolver = resolver + + transport := &http.Transport{} + + // First request with no cache should fail immediately + _, force, err := mgr.prepareRequest(context.Background(), u, transport) + require.Error(t, err) + assert.ErrorContains(t, err, "initial lookup failed") + assert.False(t, force, "no dial plan provided on error") + + // Verify decision hook received error with no fallback + select { + case decision := <-decisionCh: + assert.False(t, decision.FallbackUsed, "no fallback should occur without cached address") + require.NotNil(t, decision.Err) + assert.ErrorContains(t, decision.Err, "initial lookup failed") + assert.Empty(t, decision.ResolvedAddrs, "no addresses should be resolved") + case <-time.After(time.Second): + require.Fail(t, "expected decision hook to fire") + } +} + +func TestDNSPolicyManagerCadenceFallbackDisabled(t *testing.T) { + t.Parallel() + + host := "cadence-disable.example" + u := &url.URL{Scheme: "https", Host: host} + + policy := DNSPolicy{ + Mode: DNSRefreshCadence, + Cadence: 20 * time.Millisecond, + DisableFallback: true, + } + mgr := newDNSPolicyManager(policy, nil) + initialAddr := netip.MustParseAddr("192.0.2.110") + resolver := newTestResolver([]netip.Addr{initialAddr}, 0) + mgr.resolver = resolver + + transport := &http.Transport{} + + // Initial successful lookup + _, force, err := mgr.prepareRequest(context.Background(), u, transport) + require.NoError(t, err) + assert.True(t, force, "first lookup should force new connection") + + // Wait for cadence to expire + time.Sleep(policy.Cadence + 10*time.Millisecond) + + // Make resolver fail + resolver.SetError(errors.New("lookup failed")) + + // With fallback disabled, should return error and not use cached address + _, force, err = mgr.prepareRequest(context.Background(), u, transport) + require.Error(t, err) + assert.ErrorContains(t, err, "lookup failed") + assert.False(t, force, "no dial plan provided on error") +} + func TestDNSPolicyManagerTTLZeroTTLExpiresImmediately(t *testing.T) { t.Parallel() diff --git a/docs/dns-error-handling-fix.md b/docs/dns-error-handling-fix.md deleted file mode 100644 index 71f0b8a..0000000 --- a/docs/dns-error-handling-fix.md +++ /dev/null @@ -1,335 +0,0 @@ -# DNS Error Handling Implementation Plan - -## Problem Statement - -DNS resolution errors are not consistently propagated to the response channel when HTTP tasks execute. This results in missing error responses for certain DNS policy configurations, making it difficult for clients to detect and respond to DNS failures. - -## Observed Behavior - -### Working Cases -- **DNSRefreshStatic**: Errors are properly reported when targeting non-existing resources - - Static mode bypasses DNS entirely - - Connection failures at TCP level bubble up through `client.Do()` - - Error responses are sent to response channel via `task_http.go:197` - -### Failing Cases -- **DNSRefreshTTL**: DNS lookup failures may not generate error responses -- **DNSRefreshCadence**: DNS lookup failures may not generate error responses -- **DNSRefreshDefault** (suspected): May inherit problematic behavior in certain configurations - -## Root Cause Analysis - -### DNS Fallback Logic Swallows Errors - -When DNS lookups fail in TTL or Cadence modes and fallback is enabled, the error is recorded in the `DNSDecision` struct but the request continues with a cached address. This prevents error propagation to the response channel. - -#### Code Location: dns_policy_manager.go:234-249 (resolveTTL) - -```go -addrs, ttl, lookupDur, err := m.lookup(ctx, host) -if err != nil { - if len(cached.addrs) > 0 && m.policy.fallbackEnabled() { - addr := net.JoinHostPort(cached.addrs[0].String(), port) - ttl := nonNegativeDuration(cached.expiresAt.Sub(cached.lastLookup)) - decision := DNSDecision{ - Host: host, - Mode: m.policy.Mode, - ResolvedAddrs: cached.addrs, - TTL: ttl, - ExpiresAt: cached.expiresAt, - Err: err, // ← Error recorded here - } - return dnsResolveOutcome{address: addr, decision: &decision}, nil // ← Returns SUCCESS - } - return dnsResolveOutcome{}, err -} -``` - -**What happens:** -1. DNS lookup fails (`err != nil`) -2. Code checks if cached address exists and fallback is enabled -3. Returns cached address with `nil` error -4. DNS error only stored in `DNSDecision.Err` field -5. Request proceeds with potentially stale/incorrect address -6. **No error response sent to response channel** - -#### Code Location: dns_policy_manager.go:303-316 (resolveCadence) - -Same pattern exists in cadence mode: - -```go -addrs, ttl, lookupDur, err := m.lookup(ctx, host) -if err != nil { - if len(cached.addrs) > 0 && m.policy.fallbackEnabled() { - addr := net.JoinHostPort(cached.addrs[0].String(), port) - decision := DNSDecision{ - Host: host, - Mode: m.policy.Mode, - ResolvedAddrs: cached.addrs, - ExpiresAt: cadenceNext, - Err: err, // ← Error recorded but not propagated - } - return dnsResolveOutcome{address: addr, decision: &decision}, nil // ← Returns SUCCESS - } - return dnsResolveOutcome{}, err -} -``` - -### Error Flow Analysis - -#### Normal Error Path (Working) -1. DNS/connection error occurs -2. `client.Do()` returns error (`task_http.go:192`) -3. `dnsMgr.observeResult()` called with error (`task_http.go:195`) -4. Error response sent to channel (`task_http.go:197`) -5. Early return with error (`task_http.go:198`) - -#### Broken Error Path (Fallback Enabled) -1. DNS lookup fails in `prepareRequest()` → `resolve()` -2. Fallback logic returns `nil` error with cached address -3. `RoundTrip()` proceeds with cached address (`dns_policy_manager.go:495`) -4. Request may succeed with stale address OR fail later at connection level -5. Original DNS error never reaches response channel - -## Affected Components - -### Primary Files -- `dns_policy_manager.go`: Contains fallback logic that swallows errors - - Lines 234-249: `resolveTTL()` fallback path - - Lines 303-316: `resolveCadence()` fallback path - -### Secondary Files -- `task_http.go`: Error handling in `httpRequest.Execute()` - - Lines 192-198: Normal error path (working correctly) -- `dns_policy.go`: Policy configuration and validation - - Lines 147-150: `fallbackEnabled()` method - -## Impact Assessment - -### Critical -- Monitoring systems cannot detect DNS resolution failures -- Failed requests may appear successful if cached address is reused -- Guard rail mechanisms may not trigger appropriately -- Silent failures violate principle of least surprise - -### Moderate -- DNS decision hooks receive error information but most clients don't use them -- Fallback behavior may be intentional for resilience, but should be explicit - -### Low -- Static mode works correctly (no DNS involved) -- Single lookup mode works correctly for initial lookup failure - -## Implementation Options - -### Option 1: Always Propagate DNS Errors (Strictest) - -**Change:** Remove fallback behavior entirely or make it opt-in via explicit flag - -**Pros:** -- Clear error semantics -- Monitoring systems see all failures -- Predictable behavior - -**Cons:** -- May break existing resilience assumptions -- Could increase error volume in transient DNS issues -- Breaking change for users relying on fallback - -### Option 2: Report DNS Errors While Using Fallback (Balanced) - -**Change:** Send error response to channel even when falling back to cached address - -**Implementation:** -```go -if err != nil { - if len(cached.addrs) > 0 && m.policy.fallbackEnabled() { - // Create decision with error - decision := DNSDecision{ - Host: host, - Mode: m.policy.Mode, - ResolvedAddrs: cached.addrs, - Err: err, - FallbackUsed: true, // New field - } - // Return outcome but preserve error for reporting - return dnsResolveOutcome{ - address: addr, - decision: &decision, - shouldReport: true, // New field to trigger error response - }, err // Return error instead of nil - } - return dnsResolveOutcome{}, err -} -``` - -**Pros:** -- Maintains resilience of fallback behavior -- Errors visible to monitoring systems -- Clear indication of degraded state - -**Cons:** -- Requires changes to `dnsResolveOutcome` struct -- Needs new logic to send error while continuing request -- More complex error handling - -### Option 3: Explicit Fallback Mode with Warnings (Most Flexible) - -**Change:** Add `DNSFallbackBehavior` enum to policy configuration - -```go -type DNSFallbackBehavior uint8 - -const ( - DNSFallbackError DNSFallbackBehavior = iota // Fail immediately on DNS error - DNSFallbackWarn // Use cache but send warning response - DNSFallbackSilent // Current behavior (use cache, no error) -) - -type DNSPolicy struct { - // ... existing fields ... - FallbackBehavior DNSFallbackBehavior // Replace DisableFallback bool -} -``` - -**Pros:** -- Maximum flexibility for different use cases -- Non-breaking (default to Silent for backward compat) -- Explicit control over error reporting - -**Cons:** -- More configuration complexity -- Requires additional testing scenarios -- Documentation burden - -## Recommended Solution - -**Option 2** (Report DNS Errors While Using Fallback) provides the best balance: - -1. **Preserve resilience**: Fallback to cached addresses still works -2. **Improve observability**: DNS errors always reported to response channel -3. **Clear semantics**: Clients can distinguish between clean success and degraded fallback -4. **Non-breaking**: Existing fallback behavior continues, just with better error visibility - -### Implementation Steps - -1. **Modify `dnsResolveOutcome` struct** (dns_policy_manager.go): - ```go - type dnsResolveOutcome struct { - address string - forceNewConn bool - decision *DNSDecision - fallbackErr error // New: DNS error when fallback is used - } - ``` - -2. **Update `resolveTTL()` fallback path** (dns_policy_manager.go:234-249): - ```go - if err != nil { - if len(cached.addrs) > 0 && m.policy.fallbackEnabled() { - // ... create decision ... - return dnsResolveOutcome{ - address: addr, - decision: &decision, - fallbackErr: err, // Preserve error for reporting - }, nil - } - return dnsResolveOutcome{}, err - } - ``` - -3. **Update `resolveCadence()` fallback path** (dns_policy_manager.go:303-316): - Same pattern as `resolveTTL()` - -4. **Modify `prepareRequest()` to check for fallback errors** (dns_policy_manager.go:82-133): - ```go - outcome, err := m.resolve(ctx, host, port) - if err != nil { - // ... existing error handling ... - } - - // Check if fallback was used (new) - if outcome.fallbackErr != nil && outcome.decision != nil { - outcome.decision.FallbackUsed = true - // Return error context for caller to handle - return ctx, outcome.forceNewConn, outcome.fallbackErr - } - ``` - -5. **Add error handling in `policyTransport.RoundTrip()`** (dns_policy_manager.go:485-496): - ```go - func (p *policyTransport) RoundTrip(req *http.Request) (*http.Response, error) { - ctx, _, err := p.mgr.prepareRequest(req.Context(), req.URL, p.base) - if err != nil { - // DNS error occurred (either hard failure or fallback scenario) - return nil, err - } - - newReq := req.Clone(ctx) - return p.base.RoundTrip(newReq) - } - ``` - -6. **Add `FallbackUsed` field to `DNSDecision`** (dns_policy.go): - ```go - type DNSDecision struct { - // ... existing fields ... - FallbackUsed bool // New: indicates cached address used due to DNS error - } - ``` - -7. **Add comprehensive tests**: - - Test DNS failure with fallback enabled (should send error response) - - Test DNS failure with fallback disabled (should send error response) - - Test DNS failure without cached address (should send error response) - - Verify decision hook receives correct fallback state - -### Testing Strategy - -1. **Unit tests** for each `resolve*()` method: - - DNS failure with cached address and fallback enabled - - DNS failure with cached address and fallback disabled - - DNS failure without cached address - -2. **Integration tests** for end-to-end flow: - - HTTP task execution with DNS failure scenarios - - Verify error responses sent to response channel - - Verify guard rail mechanisms trigger appropriately - -3. **Regression tests**: - - Ensure static mode continues to work - - Ensure default mode continues to work - - Ensure normal success paths unchanged - -## Migration Considerations - -### Backward Compatibility -- Existing fallback behavior preserved (requests still use cached addresses) -- New error reporting may increase error response volume -- Clients should be prepared to handle errors that were previously silent - -### Documentation Updates -- Update DNS policy documentation to explain fallback error reporting -- Add examples showing how to handle DNS fallback errors -- Document difference between hard DNS failures and fallback scenarios - -### Monitoring Impact -- Error dashboards may show increased error counts (this is expected and correct) -- Add metrics to distinguish between: - - Hard DNS failures (no cached address) - - Soft DNS failures (fallback to cached address) - - Guard rail triggers due to DNS issues - -## Future Enhancements - -1. **Fallback address TTL/expiration**: Don't fall back to very stale cached addresses -2. **Fallback metrics**: Track how often fallback is used per endpoint -3. **Configurable fallback policy**: Per-endpoint control over fallback behavior -4. **DNS error classification**: Distinguish temporary vs permanent DNS failures -5. **Circuit breaker integration**: Use DNS error patterns to trigger circuit breakers - -## References - -- Issue discovered: DNS resolution failures not producing error responses -- Affected files: `dns_policy_manager.go`, `task_http.go`, `dns_policy.go` -- Related guard rail mechanism: `DNSGuardRailPolicy` in `dns_policy.go` diff --git a/response.go b/response.go index 4a9836e..65842b1 100644 --- a/response.go +++ b/response.go @@ -111,6 +111,10 @@ type DNSMetadata struct { // LookupDuration is the duration taken by the policy-managed DNS lookup, when applicable. LookupDuration time.Duration GuardRailTriggered bool + // FallbackUsed indicates whether a cached address was used due to DNS lookup failure. + FallbackUsed bool + // Err contains the DNS resolution error, if any occurred. + Err error } // String returns a string representation of the metadata. @@ -282,6 +286,8 @@ func (h *httpTaskResponse) Metadata() TaskResponseMetadata { ResolvedAddrs: append([]netip.Addr(nil), decision.ResolvedAddrs...), LookupDuration: decision.LookupDuration, GuardRailTriggered: decision.GuardRailTriggered, + FallbackUsed: decision.FallbackUsed, + Err: decision.Err, } // If httptrace didn't capture DNS timings but the policy did, surface it in TimeData. if md.TimeData.DNSLookup == nil && decision.LookupDuration > 0 { @@ -342,3 +348,46 @@ func (w *wsTaskResponse) Metadata() TaskResponseMetadata { TimeData: TimeDataFromTimestamps(w.timestamps), } } + +// +// httpTaskResponseError +// + +// httpTaskResponseError is a minimal TaskResponse for error cases that still need DNS metadata. +type httpTaskResponseError struct { + dnsDecision *DNSDecision +} + +// Close is a no-op for error responses. +func (*httpTaskResponseError) Close() error { + return nil +} + +// Data returns nil for error responses. +func (*httpTaskResponseError) Data() ([]byte, error) { + return nil, nil +} + +// Reader returns nil for error responses. +func (*httpTaskResponseError) Reader() (io.ReadCloser, error) { + return nil, nil +} + +// Metadata returns DNS metadata if available. +func (h *httpTaskResponseError) Metadata() TaskResponseMetadata { + md := TaskResponseMetadata{} + if h.dnsDecision != nil { + decision := *h.dnsDecision + md.DNS = &DNSMetadata{ + Mode: decision.Mode, + TTL: decision.TTL, + ExpiresAt: decision.ExpiresAt, + ResolvedAddrs: append([]netip.Addr(nil), decision.ResolvedAddrs...), + LookupDuration: decision.LookupDuration, + GuardRailTriggered: decision.GuardRailTriggered, + FallbackUsed: decision.FallbackUsed, + Err: decision.Err, + } + } + return md +} diff --git a/task_http.go b/task_http.go index 36a059e..ce3a480 100644 --- a/task_http.go +++ b/task_http.go @@ -193,6 +193,15 @@ func (r httpRequest) Execute() error { if err != nil { if r.endpoint.dnsMgr != nil { r.endpoint.dnsMgr.observeResult(0, err) + // Get DNS decision from manager for error reporting + if decision := r.endpoint.dnsMgr.getLastDecision(); decision != nil { + wr := errorResponse(err, r.endpoint.ID, r.endpoint.watcherID, &urlClone) + wr.Payload = &httpTaskResponseError{ + dnsDecision: decision, + } + r.respChan <- wr + return err + } } r.respChan <- errorResponse(err, r.endpoint.ID, r.endpoint.watcherID, &urlClone) return err diff --git a/task_http_test.go b/task_http_test.go index c8c90e2..d7a4c44 100644 --- a/task_http_test.go +++ b/task_http_test.go @@ -1,6 +1,7 @@ package wadjit import ( + "errors" "net" "net/http" "net/http/httptest" @@ -209,6 +210,125 @@ func TestHTTPEndpointExecute(t *testing.T) { } }, }, + { + name: "dns failure with fallback - error propagates", + run: func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(echoHandler)) + defer server.Close() + + u, err := url.Parse(server.URL) + require.NoError(t, err) + + realAddr, ok := server.Listener.Addr().(*net.TCPAddr) + require.True(t, ok) + + // Create resolver that will succeed first, then fail + resolver := newTestResolver( + []netip.Addr{realAddr.AddrPort().Addr()}, 20*time.Millisecond) + policy := DNSPolicy{ + Mode: DNSRefreshTTL, + TTLMin: 15 * time.Millisecond, + TTLMax: 50 * time.Millisecond, + Resolver: resolver, + } + + respCh := make(chan WatcherResponse, 2) + ep := NewHTTPEndpoint( + u, + http.MethodGet, + WithID("fallback-test"), + WithReadFast(), + WithDNSPolicy(policy), + ) + + require.NoError(t, ep.Initialize("fallback-watcher", respCh)) + + // First execution - should succeed and establish cache + require.NoError(t, ep.Task().Execute()) + resp1 := <-respCh + require.NoError(t, resp1.Err) + + // Wait for TTL to expire + time.Sleep(25 * time.Millisecond) + + // Make DNS fail + resolver.SetError(errors.New("dns lookup failed")) + + // Second execution - should return error + err = ep.Task().Execute() + require.Error(t, err, "Execute should return DNS error") + assert.ErrorContains(t, err, "dns lookup failed") + + select { + case resp2 := <-respCh: + // Error should be reported in response + require.Error(t, resp2.Err) + assert.ErrorContains(t, resp2.Err, "dns lookup failed") + + // But metadata should show fallback was used + md := resp2.Metadata() + require.NotNil(t, md.DNS) + assert.True(t, md.DNS.FallbackUsed) + require.NotNil(t, md.DNS.Err) + assert.ErrorContains(t, md.DNS.Err, "dns lookup failed") + assert.Equal(t, realAddr.AddrPort().Addr(), md.DNS.ResolvedAddrs[0]) + case <-time.After(time.Second): + t.Fatal("timeout waiting for response") + } + }, + }, + { + name: "dns failure without cache - hard failure", + run: func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(echoHandler)) + defer server.Close() + + u, err := url.Parse(server.URL) + require.NoError(t, err) + + // Create resolver that fails immediately + resolver := newTestResolver(nil, 0) + resolver.SetError(errors.New("dns resolution failed")) + + policy := DNSPolicy{ + Mode: DNSRefreshSingleLookup, + Resolver: resolver, + } + + respCh := make(chan WatcherResponse, 1) + ep := NewHTTPEndpoint( + u, + http.MethodGet, + WithID("no-cache-test"), + WithReadFast(), + WithDNSPolicy(policy), + ) + + require.NoError(t, ep.Initialize("no-cache-watcher", respCh)) + + // First execution with no cache should fail + err = ep.Task().Execute() + require.Error(t, err, "Execute should return DNS error") + assert.ErrorContains(t, err, "dns resolution failed") + + select { + case resp := <-respCh: + // Error should be reported + require.Error(t, resp.Err) + assert.ErrorContains(t, resp.Err, "dns resolution failed") + + // Metadata should show DNS error with no fallback + md := resp.Metadata() + require.NotNil(t, md.DNS) + assert.False(t, md.DNS.FallbackUsed) + require.NotNil(t, md.DNS.Err) + assert.ErrorContains(t, md.DNS.Err, "dns resolution failed") + assert.Empty(t, md.DNS.ResolvedAddrs) + case <-time.After(time.Second): + t.Fatal("timeout waiting for response") + } + }, + }, } for _, tc := range testCases {