@@ -17,145 +17,20 @@ limitations under the License.
 package metrics
 
 import (
-	"context"
 	"fmt"
 	"math"
-	"sort"
-	"strconv"
-	"strings"
 	"time"
 
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/util/sets"
-	clientset "k8s.io/client-go/kubernetes"
-	"k8s.io/component-base/metrics/testutil"
-	"k8s.io/kubernetes/pkg/master/ports"
-	schedulermetric "k8s.io/kubernetes/pkg/scheduler/metrics"
 	e2elog "k8s.io/kubernetes/test/e2e/framework/log"
-	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
-	"k8s.io/kubernetes/test/e2e/system"
-
-	"github.com/onsi/gomega"
 )
 
 const (
 	// SingleCallTimeout is how long to try single API calls (like 'get' or 'list'). Used to prevent
 	// transient failures from failing tests.
 	// TODO: client should not apply this timeout to Watch calls. Increased from 30s until that is fixed.
 	SingleCallTimeout = 5 * time.Minute
-
-	// nodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
-	nodeStartupThreshold = 4 * time.Second
-
-	// We are setting 1s threshold for apicalls even in small clusters to avoid flakes.
-	// The problem is that if long GC is happening in small clusters (where we have e.g.
-	// 1-core master machines) and tests are pretty short, it may consume significant
-	// portion of CPU and basically stop all the real work.
-	// Increasing threshold to 1s is within our SLO and should solve this problem.
-	apiCallLatencyThreshold time.Duration = 1 * time.Second
-
-	// We use a higher threshold for list apicalls if the cluster is big (i.e having > 500 nodes)
-	// as list response sizes are bigger in general for big clusters. We also use a higher threshold
-	// for list calls at cluster scope (this includes non-namespaced and all-namespaced calls).
-	apiListCallLatencyThreshold      time.Duration = 5 * time.Second
-	apiClusterScopeListCallThreshold time.Duration = 10 * time.Second
-	bigClusterNodeCountThreshold                   = 500
 )
 
-var schedulingLatencyMetricName = schedulermetric.SchedulerSubsystem + "_" + schedulermetric.SchedulingLatencyName
-
-func readLatencyMetrics(c clientset.Interface) (*APIResponsiveness, error) {
-	var a APIResponsiveness
-
-	body, err := getMetrics(c)
-	if err != nil {
-		return nil, err
-	}
-
-	samples, err := testutil.ExtractMetricSamples(body)
-	if err != nil {
-		return nil, err
-	}
-
-	ignoredResources := sets.NewString("events")
-	// TODO: figure out why we're getting non-capitalized proxy and fix this.
-	ignoredVerbs := sets.NewString("WATCH", "WATCHLIST", "PROXY", "proxy", "CONNECT")
-
-	for _, sample := range samples {
-		// Example line:
-		// apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
-		// apiserver_request_total{resource="pods",verb="LIST",client="kubectl",code="200",contentType="json"} 233
-		if sample.Metric[testutil.MetricNameLabel] != "apiserver_request_latencies_summary" &&
-			sample.Metric[testutil.MetricNameLabel] != "apiserver_request_total" {
-			continue
-		}
-
-		resource := string(sample.Metric["resource"])
-		subresource := string(sample.Metric["subresource"])
-		verb := string(sample.Metric["verb"])
-		scope := string(sample.Metric["scope"])
-		if ignoredResources.Has(resource) || ignoredVerbs.Has(verb) {
-			continue
-		}
-
-		switch sample.Metric[testutil.MetricNameLabel] {
-		case "apiserver_request_latencies_summary":
-			latency := sample.Value
-			quantile, err := strconv.ParseFloat(string(sample.Metric[testutil.QuantileLabel]), 64)
-			if err != nil {
-				return nil, err
-			}
-			a.addMetricRequestLatency(resource, subresource, verb, scope, quantile, time.Duration(int64(latency))*time.Microsecond)
-		case "apiserver_request_total":
-			count := sample.Value
-			a.addMetricRequestCount(resource, subresource, verb, scope, int(count))
-
-		}
-	}
-
-	return &a, err
-}
-
-// HighLatencyRequests prints top five summary metrics for request types with latency and returns
-// number of such request types above threshold. We use a higher threshold for
-// list calls if nodeCount is above a given threshold (i.e. cluster is big).
-func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIResponsiveness, error) {
-	isBigCluster := (nodeCount > bigClusterNodeCountThreshold)
-	metrics, err := readLatencyMetrics(c)
-	if err != nil {
-		return 0, metrics, err
-	}
-	sort.Sort(sort.Reverse(metrics))
-	badMetrics := 0
-	top := 5
-	for i := range metrics.APICalls {
-		latency := metrics.APICalls[i].Latency.Perc99
-		isListCall := (metrics.APICalls[i].Verb == "LIST")
-		isClusterScopedCall := (metrics.APICalls[i].Scope == "cluster")
-		isBad := false
-		latencyThreshold := apiCallLatencyThreshold
-		if isListCall && isBigCluster {
-			latencyThreshold = apiListCallLatencyThreshold
-			if isClusterScopedCall {
-				latencyThreshold = apiClusterScopeListCallThreshold
-			}
-		}
-		if latency > latencyThreshold {
-			isBad = true
-			badMetrics++
-		}
-		if top > 0 || isBad {
-			top--
-			prefix := ""
-			if isBad {
-				prefix = "WARNING "
-			}
-			e2elog.Logf("%vTop latency metric: %+v", prefix, metrics.APICalls[i])
-		}
-	}
-	return badMetrics, metrics, nil
-}
-
 // VerifyLatencyWithinThreshold verifies whether 50, 90 and 99th percentiles of a latency metric are
 // within the expected threshold.
 func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName string) error {
@@ -171,147 +46,6 @@ func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName st
 	return nil
 }
 
-// ResetMetrics resets latency metrics in apiserver.
-func ResetMetrics(c clientset.Interface) error {
-	e2elog.Logf("Resetting latency metrics in apiserver...")
-	body, err := c.CoreV1().RESTClient().Delete().AbsPath("/metrics").DoRaw()
-	if err != nil {
-		return err
-	}
-	if string(body) != "metrics reset\n" {
-		return fmt.Errorf("Unexpected response: %q", string(body))
-	}
-	return nil
-}
-
-// Retrieves metrics information.
-func getMetrics(c clientset.Interface) (string, error) {
-	body, err := c.CoreV1().RESTClient().Get().AbsPath("/metrics").DoRaw()
-	if err != nil {
-		return "", err
-	}
-	return string(body), nil
-}
-
-// Sends REST request to kube scheduler metrics
-func sendRestRequestToScheduler(c clientset.Interface, op, provider, cloudMasterName, masterHostname string) (string, error) {
-	opUpper := strings.ToUpper(op)
-	if opUpper != "GET" && opUpper != "DELETE" {
-		return "", fmt.Errorf("Unknown REST request")
-	}
-
-	nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
-	// The following 4 lines are intended to replace framework.ExpectNoError(err)
-	if err != nil {
-		e2elog.Logf("Unexpected error occurred: %v", err)
-	}
-	gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
-
-	var masterRegistered = false
-	for _, node := range nodes.Items {
-		if system.DeprecatedMightBeMasterNode(node.Name) {
-			masterRegistered = true
-		}
-	}
-
-	var responseText string
-	if masterRegistered {
-		ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
-		defer cancel()
-
-		body, err := c.CoreV1().RESTClient().Verb(opUpper).
-			Context(ctx).
-			Namespace(metav1.NamespaceSystem).
-			Resource("pods").
-			Name(fmt.Sprintf("kube-scheduler-%v:%v", cloudMasterName, ports.InsecureSchedulerPort)).
-			SubResource("proxy").
-			Suffix("metrics").
-			Do().Raw()
-
-		// The following 4 lines are intended to replace
-		// framework.ExpectNoError(err).
-		if err != nil {
-			e2elog.Logf("Unexpected error occurred: %v", err)
-		}
-		gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
-		responseText = string(body)
-	} else {
-		// If master is not registered fall back to old method of using SSH.
-		if provider == "gke" || provider == "eks" {
-			e2elog.Logf("Not grabbing scheduler metrics through master SSH: unsupported for %s", provider)
-			return "", nil
-		}
-
-		cmd := "curl -X " + opUpper + " http://localhost:10251/metrics"
-		sshResult, err := e2essh.SSH(cmd, masterHostname+":22", provider)
-		if err != nil || sshResult.Code != 0 {
-			return "", fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
-		}
-		responseText = sshResult.Stdout
-	}
-	return responseText, nil
-}
-
-// Retrieves scheduler latency metrics.
-func getSchedulingLatency(c clientset.Interface, provider, cloudMasterName, masterHostname string) (*SchedulingMetrics, error) {
-	result := SchedulingMetrics{}
-	data, err := sendRestRequestToScheduler(c, "GET", provider, cloudMasterName, masterHostname)
-	if err != nil {
-		return nil, err
-	}
-
-	samples, err := testutil.ExtractMetricSamples(data)
-	if err != nil {
-		return nil, err
-	}
-
-	for _, sample := range samples {
-		if string(sample.Metric[testutil.MetricNameLabel]) != schedulingLatencyMetricName {
-			continue
-		}
-
-		var metric *LatencyMetric
-		switch sample.Metric[schedulermetric.OperationLabel] {
-		case schedulermetric.PredicateEvaluation:
-			metric = &result.PredicateEvaluationLatency
-		case schedulermetric.PriorityEvaluation:
-			metric = &result.PriorityEvaluationLatency
-		case schedulermetric.PreemptionEvaluation:
-			metric = &result.PreemptionEvaluationLatency
-		case schedulermetric.Binding:
-			metric = &result.BindingLatency
-		}
-		if metric == nil {
-			continue
-		}
-
-		quantile, err := strconv.ParseFloat(string(sample.Metric[testutil.QuantileLabel]), 64)
-		if err != nil {
-			return nil, err
-		}
-		setQuantile(metric, quantile, time.Duration(int64(float64(sample.Value)*float64(time.Second))))
-	}
-	return &result, nil
-}
-
-// VerifySchedulerLatency verifies (currently just by logging them) the scheduling latencies.
-func VerifySchedulerLatency(c clientset.Interface, provider, cloudMasterName, masterHostname string) (*SchedulingMetrics, error) {
-	latency, err := getSchedulingLatency(c, provider, cloudMasterName, masterHostname)
-	if err != nil {
-		return nil, err
-	}
-	return latency, nil
-}
-
-// ResetSchedulerMetrics sends a DELETE request to kube-scheduler for resetting metrics.
-func ResetSchedulerMetrics(c clientset.Interface, provider, cloudMasterName, masterHostname string) error {
-	responseText, err := sendRestRequestToScheduler(c, "DELETE", provider, cloudMasterName, masterHostname)
-	if err != nil {
-		return fmt.Errorf("Unexpected response: %q, %v", responseText, err)
-	}
-	return nil
-}
-
 // PodLatencyData encapsulates pod startup latency information.
 type PodLatencyData struct {
 	// Name of the pod
@@ -339,21 +73,6 @@ func ExtractLatencyMetrics(latencies []PodLatencyData) LatencyMetric {
 	return LatencyMetric{Perc50: perc50, Perc90: perc90, Perc99: perc99, Perc100: perc100}
 }
 
-// LogSuspiciousLatency logs metrics/docker errors from all nodes that had slow startup times
-// If latencyDataLag is nil then it will be populated from latencyData
-func LogSuspiciousLatency(latencyData []PodLatencyData, latencyDataLag []PodLatencyData, nodeCount int, c clientset.Interface) {
-	if latencyDataLag == nil {
-		latencyDataLag = latencyData
-	}
-	for _, l := range latencyData {
-		if l.Latency > nodeStartupThreshold {
-			HighLatencyKubeletOperations(c, 1*time.Second, l.Node, e2elog.Logf)
-		}
-	}
-	e2elog.Logf("Approx throughput: %v pods/min",
-		float64(nodeCount)/(latencyDataLag[len(latencyDataLag)-1].Latency.Minutes()))
-}
-
 // PrintLatencies outputs latencies to log with readable format.
 func PrintLatencies(latencies []PodLatencyData, header string) {
 	metrics := ExtractLatencyMetrics(latencies)