Skip to content

Commit 14f8f55

Browse files
authored
Merge pull request kubernetes#121719 from ruiwen-zhao/metric-size
Add image pull duration metric with bucketed image size
2 parents 7ec1a89 + 0f5cf6c commit 14f8f55

File tree

11 files changed

+354
-13
lines changed

11 files changed

+354
-13
lines changed

pkg/kubelet/container/runtime.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,8 @@ type ImageService interface {
162162
ImageStats(ctx context.Context) (*ImageStats, error)
163163
// ImageFsInfo returns a list of file systems for containers/images
164164
ImageFsInfo(ctx context.Context) (*runtimeapi.ImageFsInfoResponse, error)
165+
// GetImageSize returns the size of the image
166+
GetImageSize(ctx context.Context, image ImageSpec) (uint64, error)
165167
}
166168

167169
// Attacher interface allows to attach a container.

pkg/kubelet/container/testing/fake_runtime.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,14 @@ func (f *FakeRuntime) GetImageRef(_ context.Context, image kubecontainer.ImageSp
362362
return "", f.InspectErr
363363
}
364364

365+
func (f *FakeRuntime) GetImageSize(_ context.Context, image kubecontainer.ImageSpec) (uint64, error) {
366+
f.Lock()
367+
defer f.Unlock()
368+
369+
f.CalledFunctions = append(f.CalledFunctions, "GetImageSize")
370+
return 0, f.Err
371+
}
372+
365373
func (f *FakeRuntime) ListImages(_ context.Context) ([]kubecontainer.Image, error) {
366374
f.Lock()
367375
defer f.Unlock()

pkg/kubelet/container/testing/runtime_mock.go

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/kubelet/images/image_manager.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
crierrors "k8s.io/cri-api/pkg/errors"
3333
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
3434
"k8s.io/kubernetes/pkg/kubelet/events"
35+
"k8s.io/kubernetes/pkg/kubelet/metrics"
3536
"k8s.io/kubernetes/pkg/util/parsers"
3637
)
3738

@@ -166,8 +167,10 @@ func (m *imageManager) EnsureImageExists(ctx context.Context, pod *v1.Pod, conta
166167
return "", msg, err
167168
}
168169
m.podPullingTimeRecorder.RecordImageFinishedPulling(pod.UID)
169-
m.logIt(ref, v1.EventTypeNormal, events.PulledImage, logPrefix, fmt.Sprintf("Successfully pulled image %q in %v (%v including waiting)",
170-
container.Image, imagePullResult.pullDuration.Truncate(time.Millisecond), time.Since(startTime).Truncate(time.Millisecond)), klog.Info)
170+
imagePullDuration := time.Since(startTime).Truncate(time.Millisecond)
171+
m.logIt(ref, v1.EventTypeNormal, events.PulledImage, logPrefix, fmt.Sprintf("Successfully pulled image %q in %v (%v including waiting). Image size: %v bytes.",
172+
container.Image, imagePullResult.pullDuration.Truncate(time.Millisecond), imagePullDuration, imagePullResult.imageSize), klog.Info)
173+
metrics.ImagePullDuration.WithLabelValues(metrics.GetImageSizeBucket(imagePullResult.imageSize)).Observe(imagePullDuration.Seconds())
171174
m.backOff.GC()
172175
return imagePullResult.imageRef, "", nil
173176
}

pkg/kubelet/images/image_manager_test.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ func pullerTestCases() []pullerTestCase {
6969
qps: 0.0,
7070
burst: 0,
7171
expected: []pullerExpects{
72-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
72+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
7373
}},
7474

7575
{ // image present, don't pull
@@ -94,9 +94,9 @@ func pullerTestCases() []pullerTestCase {
9494
qps: 0.0,
9595
burst: 0,
9696
expected: []pullerExpects{
97-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
98-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
99-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
97+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
98+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
99+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
100100
}},
101101
// missing image, error PullNever
102102
{containerImage: "missing_image",
@@ -149,9 +149,9 @@ func pullerTestCases() []pullerTestCase {
149149
qps: 400.0,
150150
burst: 600,
151151
expected: []pullerExpects{
152-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
153-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
154-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
152+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
153+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
154+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
155155
}},
156156
// image present, non-zero qps, try to pull when qps exceeded
157157
{containerImage: "present_image",
@@ -356,7 +356,7 @@ func TestPullAndListImageWithPodAnnotations(t *testing.T) {
356356
inspectErr: nil,
357357
pullerErr: nil,
358358
expected: []pullerExpects{
359-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
359+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
360360
}}
361361

362362
useSerializedEnv := true
@@ -412,7 +412,7 @@ func TestPullAndListImageWithRuntimeHandlerInImageCriAPIFeatureGate(t *testing.T
412412
inspectErr: nil,
413413
pullerErr: nil,
414414
expected: []pullerExpects{
415-
{[]string{"GetImageRef", "PullImage"}, nil, true, true},
415+
{[]string{"GetImageRef", "PullImage", "GetImageSize"}, nil, true, true},
416416
}}
417417

418418
useSerializedEnv := true

pkg/kubelet/images/puller.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828

2929
type pullResult struct {
3030
imageRef string
31+
imageSize uint64
3132
err error
3233
pullDuration time.Duration
3334
}
@@ -58,8 +59,14 @@ func (pip *parallelImagePuller) pullImage(ctx context.Context, spec kubecontaine
5859
}
5960
startTime := time.Now()
6061
imageRef, err := pip.imageService.PullImage(ctx, spec, pullSecrets, podSandboxConfig)
62+
var size uint64
63+
if err == nil && imageRef != "" {
64+
// Getting the image size with best effort, ignoring the error.
65+
size, _ = pip.imageService.GetImageSize(ctx, spec)
66+
}
6167
pullChan <- pullResult{
6268
imageRef: imageRef,
69+
imageSize: size,
6370
err: err,
6471
pullDuration: time.Since(startTime),
6572
}
@@ -102,9 +109,16 @@ func (sip *serialImagePuller) processImagePullRequests() {
102109
for pullRequest := range sip.pullRequests {
103110
startTime := time.Now()
104111
imageRef, err := sip.imageService.PullImage(pullRequest.ctx, pullRequest.spec, pullRequest.pullSecrets, pullRequest.podSandboxConfig)
112+
var size uint64
113+
if err == nil && imageRef != "" {
114+
// Getting the image size with best effort, ignoring the error.
115+
size, _ = sip.imageService.GetImageSize(pullRequest.ctx, pullRequest.spec)
116+
}
105117
pullRequest.pullChan <- pullResult{
106-
imageRef: imageRef,
107-
err: err,
118+
imageRef: imageRef,
119+
imageSize: size,
120+
err: err,
121+
// Note: pullDuration includes credential resolution and getting the image size.
108122
pullDuration: time.Since(startTime),
109123
}
110124
}

pkg/kubelet/kuberuntime/kuberuntime_image.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,18 @@ func (m *kubeGenericRuntimeManager) GetImageRef(ctx context.Context, image kubec
9696
return resp.Image.Id, nil
9797
}
9898

99+
func (m *kubeGenericRuntimeManager) GetImageSize(ctx context.Context, image kubecontainer.ImageSpec) (uint64, error) {
100+
resp, err := m.imageService.ImageStatus(ctx, toRuntimeAPIImageSpec(image), false)
101+
if err != nil {
102+
klog.ErrorS(err, "Failed to get image status", "image", image.Image)
103+
return 0, err
104+
}
105+
if resp.Image == nil {
106+
return 0, nil
107+
}
108+
return resp.Image.Size_, nil
109+
}
110+
99111
// ListImages gets all images currently on the machine.
100112
func (m *kubeGenericRuntimeManager) ListImages(ctx context.Context) ([]kubecontainer.Image, error) {
101113
var images []kubecontainer.Image

pkg/kubelet/kuberuntime/kuberuntime_image_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,20 @@ func TestGetImageRef(t *testing.T) {
149149
assert.Equal(t, image, imageRef)
150150
}
151151

152+
func TestImageSize(t *testing.T) {
153+
ctx := context.Background()
154+
_, fakeImageService, fakeManager, err := createTestRuntimeManager()
155+
assert.NoError(t, err)
156+
157+
const imageSize = uint64(64)
158+
fakeImageService.SetFakeImageSize(imageSize)
159+
image := "busybox"
160+
fakeImageService.SetFakeImages([]string{image})
161+
actualSize, err := fakeManager.GetImageSize(ctx, kubecontainer.ImageSpec{Image: image})
162+
assert.NoError(t, err)
163+
assert.Equal(t, imageSize, actualSize)
164+
}
165+
152166
func TestGetImageRefImageNotAvailableLocally(t *testing.T) {
153167
ctx := context.Background()
154168
_, _, fakeManager, err := createTestRuntimeManager()

pkg/kubelet/metrics/metrics.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ const (
7070
WorkingPodCountKey = "working_pods"
7171
OrphanedRuntimePodTotalKey = "orphaned_runtime_pods_total"
7272
RestartedPodTotalKey = "restarted_pods_total"
73+
ImagePullDurationKey = "image_pull_duration_seconds"
7374

7475
// Metrics keys of remote runtime operations
7576
RuntimeOperationsKey = "runtime_operations_total"
@@ -126,8 +127,30 @@ const (
126127
EphemeralContainer = "ephemeral_container"
127128
)
128129

130+
type imageSizeBucket struct {
131+
lowerBoundInBytes uint64
132+
label string
133+
}
134+
129135
var (
130136
podStartupDurationBuckets = []float64{0.5, 1, 2, 3, 4, 5, 6, 8, 10, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}
137+
imagePullDurationBuckets = []float64{1, 5, 10, 20, 30, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}
138+
// imageSizeBuckets has the labels to be associated with image_pull_duration_seconds metric. For example, if the size of
139+
// an image pulled is larger than 1GB and at most 5GB, the label will be "1GB-5GB".
140+
imageSizeBuckets = []imageSizeBucket{
141+
{0, "0-10MB"},
142+
{10 * 1024 * 1024, "10MB-100MB"},
143+
{100 * 1024 * 1024, "100MB-500MB"},
144+
{500 * 1024 * 1024, "500MB-1GB"},
145+
{1 * 1024 * 1024 * 1024, "1GB-5GB"},
146+
{5 * 1024 * 1024 * 1024, "5GB-10GB"},
147+
{10 * 1024 * 1024 * 1024, "10GB-20GB"},
148+
{20 * 1024 * 1024 * 1024, "20GB-30GB"},
149+
{30 * 1024 * 1024 * 1024, "30GB-40GB"},
150+
{40 * 1024 * 1024 * 1024, "40GB-60GB"},
151+
{60 * 1024 * 1024 * 1024, "60GB-100GB"},
152+
{100 * 1024 * 1024 * 1024, "GT100GB"},
153+
}
131154
)
132155

133156
var (
@@ -822,6 +845,20 @@ var (
822845
StabilityLevel: metrics.ALPHA,
823846
},
824847
)
848+
849+
// ImagePullDuration is a Histogram that tracks the duration (in seconds) it takes for an image to be pulled,
850+
// including the time spent in the waiting queue of image puller.
851+
// The metric is broken down by bucketed image size.
852+
ImagePullDuration = metrics.NewHistogramVec(
853+
&metrics.HistogramOpts{
854+
Subsystem: KubeletSubsystem,
855+
Name: ImagePullDurationKey,
856+
Help: "Duration in seconds to pull an image.",
857+
Buckets: imagePullDurationBuckets,
858+
StabilityLevel: metrics.ALPHA,
859+
},
860+
[]string{"image_size_in_bytes"},
861+
)
825862
)
826863

827864
var registerMetrics sync.Once
@@ -835,6 +872,7 @@ func Register(collectors ...metrics.StableCollector) {
835872
legacyregistry.MustRegister(PodStartDuration)
836873
legacyregistry.MustRegister(PodStartSLIDuration)
837874
legacyregistry.MustRegister(PodStartTotalDuration)
875+
legacyregistry.MustRegister(ImagePullDuration)
838876
legacyregistry.MustRegister(NodeStartupPreKubeletDuration)
839877
legacyregistry.MustRegister(NodeStartupPreRegistrationDuration)
840878
legacyregistry.MustRegister(NodeStartupRegistrationDuration)
@@ -921,3 +959,18 @@ func SinceInSeconds(start time.Time) float64 {
921959
func SetNodeName(name types.NodeName) {
922960
NodeName.WithLabelValues(string(name)).Set(1)
923961
}
962+
963+
func GetImageSizeBucket(sizeInBytes uint64) string {
964+
if sizeInBytes == 0 {
965+
return "N/A"
966+
}
967+
968+
for i := len(imageSizeBuckets) - 1; i >= 0; i-- {
969+
if sizeInBytes > imageSizeBuckets[i].lowerBoundInBytes {
970+
return imageSizeBuckets[i].label
971+
}
972+
}
973+
974+
// Not reached: a zero size returns "N/A" above, and any non-zero size exceeds the first bucket's lower bound of 0.
975+
return ""
976+
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"os"
21+
"testing"
22+
23+
"k8s.io/component-base/metrics/testutil"
24+
)
25+
26+
const imagePullDurationKey = "kubelet_" + ImagePullDurationKey
27+
28+
func TestImagePullDurationMetric(t *testing.T) {
29+
t.Run("register image pull duration", func(t *testing.T) {
30+
Register()
31+
defer clearMetrics()
32+
33+
// Pairs of image size in bytes and pull duration in seconds
34+
dataPoints := [][]float64{
35+
// 0 bytes, 0 seconds
36+
{0, 0},
37+
// 5MB, 10 seconds
38+
{5 * 1024 * 1024, 10},
39+
// 15MB, 20 seconds
40+
{15 * 1024 * 1024, 20},
41+
// 500 MB, 200 seconds
42+
{500 * 1024 * 1024, 200},
43+
// 15 GB, 6000 seconds
44+
{15 * 1024 * 1024 * 1024, 6000},
45+
// 200 GB, 10000 seconds
46+
{200 * 1024 * 1024 * 1024, 10000},
47+
}
48+
49+
for _, dp := range dataPoints {
50+
imageSize := int64(dp[0])
51+
duration := dp[1]
52+
t.Log(imageSize, duration)
53+
t.Log(GetImageSizeBucket(uint64(imageSize)))
54+
ImagePullDuration.WithLabelValues(GetImageSizeBucket(uint64(imageSize))).Observe(duration)
55+
}
56+
57+
wants, err := os.Open("testdata/image_pull_duration_metric")
58+
defer func() {
59+
if err := wants.Close(); err != nil {
60+
t.Error(err)
61+
}
62+
}()
63+
64+
if err != nil {
65+
t.Fatal(err)
66+
}
67+
68+
if err := testutil.GatherAndCompare(GetGather(), wants, imagePullDurationKey); err != nil {
69+
t.Error(err)
70+
}
71+
72+
})
73+
}
74+
75+
func clearMetrics() {
76+
ImagePullDuration.Reset()
77+
}

0 commit comments

Comments
 (0)