Skip to content

Commit 3baac6f

Browse files
VA: Consolidate multiple metrics into one histogram (#7816)
- Add a new histogram, validationLatency - Add a VA.observeLatency for observing validation latency - Refactor to ensure this metric can be observed exclusively within VA.PerformValidation and VA.IsCAAValid. - Replace validationTime, localValidationTime, remoteValidationTime, remoteValidationFailures, caaCheckTime, localCAACheckTime, remoteCAACheckTime, and remoteCAACheckFailures with validationLatency
1 parent f9fb688 commit 3baac6f

File tree

5 files changed

+185
-165
lines changed

5 files changed

+185
-165
lines changed

va/caa.go

Lines changed: 36 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
"time"
1212

1313
"github.com/miekg/dns"
14-
"github.com/prometheus/client_golang/prometheus"
1514

1615
"github.com/letsencrypt/boulder/bdns"
1716
"github.com/letsencrypt/boulder/canceled"
@@ -42,19 +41,46 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
4241
Requester: req.AccountURIID,
4342
Hostname: req.Domain,
4443
}
45-
checkStartTime := va.clk.Now()
4644

47-
validationMethod := core.AcmeChallenge(req.ValidationMethod)
48-
if !validationMethod.IsValid() {
45+
challType := core.AcmeChallenge(req.ValidationMethod)
46+
if !challType.IsValid() {
4947
return nil, berrors.InternalServerError("unrecognized validation method %q", req.ValidationMethod)
5048
}
5149

5250
acmeID := identifier.NewDNS(req.Domain)
5351
params := &caaParams{
5452
accountURIID: req.AccountURIID,
55-
validationMethod: validationMethod,
53+
validationMethod: challType,
5654
}
5755

56+
var prob *probs.ProblemDetails
57+
var internalErr error
58+
var localLatency time.Duration
59+
start := va.clk.Now()
60+
61+
defer func() {
62+
probType := ""
63+
outcome := fail
64+
if prob != nil {
65+
// CAA check failed.
66+
probType = string(prob.Type)
67+
logEvent.Error = prob.Error()
68+
} else {
69+
// CAA check passed.
70+
outcome = pass
71+
}
72+
// Observe local check latency (primary|remote).
73+
va.observeLatency(opCAA, va.perspective, string(challType), probType, outcome, localLatency)
74+
if va.isPrimaryVA() {
75+
// Observe total check latency (primary+remote).
76+
va.observeLatency(opCAA, allPerspectives, string(challType), probType, outcome, va.clk.Since(start))
77+
}
78+
// Log the total check latency.
79+
logEvent.ValidationLatency = va.clk.Since(start).Round(time.Millisecond).Seconds()
80+
81+
va.log.AuditObject("CAA check result", logEvent)
82+
}()
83+
5884
var remoteCAAResults chan *remoteVAResult
5985
if features.Get().EnforceMultiCAA {
6086
if remoteVACount := len(va.remoteVAs); remoteVACount > 0 {
@@ -63,16 +89,10 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
6389
}
6490
}
6591

66-
checkResult := "success"
67-
err := va.checkCAA(ctx, acmeID, params)
68-
localCheckLatency := time.Since(checkStartTime)
69-
var prob *probs.ProblemDetails
70-
if err != nil {
71-
prob = detailedError(err)
72-
logEvent.Error = prob.Error()
73-
logEvent.InternalError = err.Error()
92+
internalErr = va.checkCAA(ctx, acmeID, params)
93+
if internalErr != nil {
94+
prob = detailedError(internalErr)
7495
prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", req.Domain, prob.Detail)
75-
checkResult = "failure"
7696
} else if remoteCAAResults != nil {
7797
if !features.Get().EnforceMultiCAA && features.Get().MultiCAAFullResults {
7898
// If we're not going to enforce multi CAA but we are logging the
@@ -82,40 +102,24 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
82102
_ = va.processRemoteCAAResults(
83103
req.Domain,
84104
req.AccountURIID,
85-
string(validationMethod),
105+
string(challType),
86106
remoteCAAResults)
87107
}()
88108
} else if features.Get().EnforceMultiCAA {
89109
remoteProb := va.processRemoteCAAResults(
90110
req.Domain,
91111
req.AccountURIID,
92-
string(validationMethod),
112+
string(challType),
93113
remoteCAAResults)
94114

95115
// If the remote result was a non-nil problem then fail the CAA check
96116
if remoteProb != nil {
97117
prob = remoteProb
98-
// We only set .Error here, not InternalError, because the remote VA doesn't send
99-
// us the internal error. But that's okay, because it got logged at the remote VA.
100-
logEvent.Error = remoteProb.Error()
101-
checkResult = "failure"
102118
va.log.Infof("CAA check failed due to remote failures: identifier=%v err=%s",
103119
req.Domain, remoteProb)
104-
va.metrics.remoteCAACheckFailures.Inc()
105120
}
106121
}
107122
}
108-
checkLatency := time.Since(checkStartTime)
109-
logEvent.ValidationLatency = checkLatency.Round(time.Millisecond).Seconds()
110-
111-
va.metrics.localCAACheckTime.With(prometheus.Labels{
112-
"result": checkResult,
113-
}).Observe(localCheckLatency.Seconds())
114-
va.metrics.caaCheckTime.With(prometheus.Labels{
115-
"result": checkResult,
116-
}).Observe(checkLatency.Seconds())
117-
118-
va.log.AuditObject("CAA check result", logEvent)
119123

120124
if prob != nil {
121125
// The ProblemDetails will be serialized through gRPC, which requires UTF-8.
@@ -154,15 +158,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
154158
challengeType string,
155159
remoteResultsChan <-chan *remoteVAResult) *probs.ProblemDetails {
156160

157-
state := "failure"
158-
start := va.clk.Now()
159-
160-
defer func() {
161-
va.metrics.remoteCAACheckTime.With(prometheus.Labels{
162-
"result": state,
163-
}).Observe(va.clk.Since(start).Seconds())
164-
}()
165-
166161
required := len(va.remoteVAs) - va.maxRemoteFailures
167162
good := 0
168163
bad := 0
@@ -190,7 +185,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
190185
// success or failure threshold is met.
191186
if !features.Get().MultiCAAFullResults {
192187
if good >= required {
193-
state = "success"
194188
return nil
195189
} else if bad > va.maxRemoteFailures {
196190
modifiedProblem := *result.Problem
@@ -217,7 +211,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
217211

218212
// Based on the threshold of good/bad return nil or a problem.
219213
if good >= required {
220-
state = "success"
221214
return nil
222215
} else if bad > va.maxRemoteFailures {
223216
modifiedProblem := *firstProb

va/caa_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"testing"
1010

1111
"github.com/miekg/dns"
12+
"github.com/prometheus/client_golang/prometheus"
1213

1314
"github.com/letsencrypt/boulder/bdns"
1415
"github.com/letsencrypt/boulder/core"
@@ -676,6 +677,7 @@ func TestMultiCAARechecking(t *testing.T) {
676677
expectedProbSubstring string
677678
expectedProbType probs.ProblemType
678679
expectedDiffLogSubstring string
680+
expectedLabels prometheus.Labels
679681
localDNSClient bdns.Client
680682
}{
681683
{
@@ -687,6 +689,13 @@ func TestMultiCAARechecking(t *testing.T) {
687689
{remoteVA, remoteUA},
688690
{remoteVA, remoteUA},
689691
},
692+
expectedLabels: prometheus.Labels{
693+
"operation": opCAA,
694+
"perspective": allPerspectives,
695+
"challenge_type": string(core.ChallengeTypeDNS01),
696+
"problem_type": "",
697+
"result": pass,
698+
},
690699
},
691700
{
692701
name: "broken localVA, RVAs functional, no CAA records",
@@ -699,6 +708,13 @@ func TestMultiCAARechecking(t *testing.T) {
699708
{remoteVA, remoteUA},
700709
{remoteVA, remoteUA},
701710
},
711+
expectedLabels: prometheus.Labels{
712+
"operation": opCAA,
713+
"perspective": allPerspectives,
714+
"challenge_type": string(core.ChallengeTypeDNS01),
715+
"problem_type": string(probs.DNSProblem),
716+
"result": fail,
717+
},
702718
},
703719
{
704720
name: "functional localVA, 1 broken RVA, no CAA records",
@@ -712,6 +728,13 @@ func TestMultiCAARechecking(t *testing.T) {
712728
{remoteVA, remoteUA},
713729
{remoteVA, remoteUA},
714730
},
731+
expectedLabels: prometheus.Labels{
732+
"operation": opCAA,
733+
"perspective": allPerspectives,
734+
"challenge_type": string(core.ChallengeTypeDNS01),
735+
"problem_type": string(probs.DNSProblem),
736+
"result": fail,
737+
},
715738
},
716739
{
717740
name: "functional localVA, all broken RVAs, no CAA records",
@@ -725,6 +748,13 @@ func TestMultiCAARechecking(t *testing.T) {
725748
{brokenVA, brokenUA},
726749
{brokenVA, brokenUA},
727750
},
751+
expectedLabels: prometheus.Labels{
752+
"operation": opCAA,
753+
"perspective": allPerspectives,
754+
"challenge_type": string(core.ChallengeTypeDNS01),
755+
"problem_type": string(probs.DNSProblem),
756+
"result": fail,
757+
},
728758
},
729759
{
730760
name: "all VAs functional, CAA issue type present",
@@ -735,6 +765,13 @@ func TestMultiCAARechecking(t *testing.T) {
735765
{remoteVA, remoteUA},
736766
{remoteVA, remoteUA},
737767
},
768+
expectedLabels: prometheus.Labels{
769+
"operation": opCAA,
770+
"perspective": allPerspectives,
771+
"challenge_type": string(core.ChallengeTypeDNS01),
772+
"problem_type": "",
773+
"result": pass,
774+
},
738775
},
739776
{
740777
name: "functional localVA, 1 broken RVA, CAA issue type present",
@@ -748,6 +785,13 @@ func TestMultiCAARechecking(t *testing.T) {
748785
{remoteVA, remoteUA},
749786
{remoteVA, remoteUA},
750787
},
788+
expectedLabels: prometheus.Labels{
789+
"operation": opCAA,
790+
"perspective": allPerspectives,
791+
"challenge_type": string(core.ChallengeTypeDNS01),
792+
"problem_type": string(probs.DNSProblem),
793+
"result": fail,
794+
},
751795
},
752796
{
753797
name: "functional localVA, all broken RVAs, CAA issue type present",
@@ -761,6 +805,13 @@ func TestMultiCAARechecking(t *testing.T) {
761805
{brokenVA, brokenUA},
762806
{brokenVA, brokenUA},
763807
},
808+
expectedLabels: prometheus.Labels{
809+
"operation": opCAA,
810+
"perspective": allPerspectives,
811+
"challenge_type": string(core.ChallengeTypeDNS01),
812+
"problem_type": string(probs.DNSProblem),
813+
"result": fail,
814+
},
764815
},
765816
{
766817
// The localVA kicks off the background goroutines before doing its
@@ -954,6 +1005,10 @@ func TestMultiCAARechecking(t *testing.T) {
9541005
} else {
9551006
test.AssertEquals(t, len(gotAnyRemoteFailures), 0)
9561007
}
1008+
1009+
if tc.expectedLabels != nil {
1010+
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, tc.expectedLabels, 1)
1011+
}
9571012
})
9581013
}
9591014
}

va/dns_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,12 @@ func TestDNSValidationEmpty(t *testing.T) {
2828
test.AssertEquals(t, res.Problems.ProblemType, "unauthorized")
2929
test.AssertEquals(t, res.Problems.Detail, "No TXT record found at _acme-challenge.empty-txts.com")
3030

31-
test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
32-
"type": "dns-01",
33-
"result": "invalid",
34-
"problem_type": "unauthorized",
31+
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
32+
"operation": opChallAndCAA,
33+
"perspective": PrimaryPerspective,
34+
"challenge_type": string(core.ChallengeTypeDNS01),
35+
"problem_type": string(probs.UnauthorizedProblem),
36+
"result": fail,
3537
}, 1)
3638
}
3739

0 commit comments

Comments
 (0)