Skip to content

Commit 1761a8c

Browse files
authored
metrics: collect mount items for detail metrics (#16)
Introduces a new Prometheus metric for tracking detailed information about mounted items and refines the cache manager to export richer metrics. It also adds corresponding tests to ensure correct metric collection. The changes enhance observability of mounted volumes, distinguishing between PVC, inline, and dynamic models. Additionally, it improves the cache size calculation method to avoid double-counting the sizes of hard-linked files. Signed-off-by: imeoer <[email protected]>
1 parent 60f684c commit 1761a8c

File tree

7 files changed

+262
-36
lines changed

7 files changed

+262
-36
lines changed

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ require (
1818
github.com/opencontainers/image-spec v1.1.1
1919
github.com/pkg/errors v0.9.1
2020
github.com/prometheus/client_golang v1.22.0
21+
github.com/prometheus/client_model v0.6.2
2122
github.com/rexray/gocsi v1.2.2
2223
github.com/sirupsen/logrus v1.9.3
2324
github.com/stretchr/testify v1.11.1
@@ -86,6 +87,7 @@ require (
8687
github.com/kevinburke/ssh_config v1.2.0 // indirect
8788
github.com/klauspost/compress v1.18.0 // indirect
8889
github.com/klauspost/cpuid/v2 v2.3.0 // indirect
90+
github.com/kylelemons/godebug v1.1.0 // indirect
8991
github.com/labstack/gommon v0.4.2 // indirect
9092
github.com/libgit2/git2go/v34 v34.0.0 // indirect
9193
github.com/mailru/easyjson v0.7.7 // indirect
@@ -99,7 +101,6 @@ require (
99101
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
100102
github.com/pjbgf/sha1cd v0.3.2 // indirect
101103
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
102-
github.com/prometheus/client_model v0.6.2 // indirect
103104
github.com/prometheus/common v0.63.0 // indirect
104105
github.com/prometheus/procfs v0.16.1 // indirect
105106
github.com/rivo/uniseg v0.4.7 // indirect

pkg/metrics/mount_collector.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package metrics
2+
3+
import (
4+
"sync/atomic"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
)
8+
9+
type MountItem struct {
10+
Reference string
11+
Type string
12+
VolumeName string
13+
MountID string
14+
}
15+
16+
type MountItemCollector struct {
17+
desc *prometheus.Desc
18+
items atomic.Value // stores []MountItem
19+
}
20+
21+
func NewMountItemCollector() *MountItemCollector {
22+
c := &MountItemCollector{
23+
desc: prometheus.NewDesc(
24+
Prefix+"mount_item",
25+
"Mounted item list (pvc, inline, dynamic types), value is always 1 for existing items.",
26+
[]string{"reference", "type", "volume_name", "mount_id"},
27+
nil,
28+
),
29+
}
30+
c.items.Store([]MountItem(nil))
31+
return c
32+
}
33+
34+
func (c *MountItemCollector) Set(items []MountItem) {
35+
c.items.Store(append([]MountItem(nil), items...))
36+
}
37+
38+
func (c *MountItemCollector) Describe(ch chan<- *prometheus.Desc) {
39+
ch <- c.desc
40+
}
41+
42+
func (c *MountItemCollector) Collect(ch chan<- prometheus.Metric) {
43+
v := c.items.Load()
44+
if v == nil {
45+
return
46+
}
47+
items := v.([]MountItem)
48+
for _, it := range items {
49+
ch <- prometheus.MustNewConstMetric(
50+
c.desc,
51+
prometheus.GaugeValue,
52+
1,
53+
it.Reference,
54+
it.Type,
55+
it.VolumeName,
56+
it.MountID,
57+
)
58+
}
59+
}
60+
61+
var MountItems = NewMountItemCollector()

pkg/metrics/registry.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -76,15 +76,21 @@ var (
7676
},
7777
)
7878

79-
NodeMountedStaticImages = prometheus.NewGauge(
79+
NodeMountedPVCModels = prometheus.NewGauge(
8080
prometheus.GaugeOpts{
81-
Name: Prefix + "node_mounted_static_images",
81+
Name: Prefix + "node_mounted_pvc_models",
8282
},
8383
)
8484

85-
NodeMountedDynamicImages = prometheus.NewGauge(
85+
NodeMountedInlineModels = prometheus.NewGauge(
8686
prometheus.GaugeOpts{
87-
Name: Prefix + "node_mounted_dynamic_images",
87+
Name: Prefix + "node_mounted_inline_models",
88+
},
89+
)
90+
91+
NodeMountedDynamicModels = prometheus.NewGauge(
92+
prometheus.GaugeOpts{
93+
Name: Prefix + "node_mounted_dynamic_models",
8894
},
8995
)
9096

@@ -140,7 +146,9 @@ func NodePullOpObserve(op string, size int64, start time.Time, err error) {
140146
func init() {
141147
DummyRegistry.MustRegister()
142148

143-
DetailRegistry.MustRegister()
149+
DetailRegistry.MustRegister(
150+
MountItems,
151+
)
144152

145153
Registry.MustRegister(
146154
NodeNotReady,
@@ -155,8 +163,9 @@ func init() {
155163
ControllerOpLatency,
156164

157165
NodeCacheSizeInBytes,
158-
NodeMountedStaticImages,
159-
NodeMountedDynamicImages,
166+
NodeMountedPVCModels,
167+
NodeMountedInlineModels,
168+
NodeMountedDynamicModels,
160169
NodePullLayerTooLong,
161170
)
162171
}

pkg/server/server_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,7 @@ func TestServer(t *testing.T) {
558558
require.NoError(t, err)
559559
cfg.Get().RootDir = rootDir
560560
cfg.Get().PullConfig.ProxyURL = ""
561-
service.CacheSacnInterval = 1 * time.Second
561+
service.CacheScanInterval = 1 * time.Second
562562

563563
service.NewPuller = func(ctx context.Context, pullCfg *config.PullConfig, hook *status.Hook, diskQuotaChecker *service.DiskQuotaChecker) service.Puller {
564564
return &mockPuller{

pkg/service/cache.go

Lines changed: 72 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,34 +8,35 @@ import (
88
"github.com/modelpack/model-csi-driver/pkg/config"
99
"github.com/modelpack/model-csi-driver/pkg/logger"
1010
"github.com/modelpack/model-csi-driver/pkg/metrics"
11+
"github.com/modelpack/model-csi-driver/pkg/status"
1112
"github.com/pkg/errors"
1213
)
1314

14-
var CacheSacnInterval = 60 * time.Second
15+
var CacheScanInterval = 60 * time.Second
16+
17+
const (
18+
mountTypePVC = "pvc"
19+
mountTypeInline = "inline"
20+
mountTypeDynamic = "dynamic"
21+
)
1522

1623
type CacheManager struct {
1724
cfg *config.Config
25+
sm *status.StatusManager
1826
}
1927

2028
func (cm *CacheManager) getCacheSize() (int64, error) {
21-
var total int64
22-
if err := filepath.Walk(cm.cfg.Get().RootDir, func(path string, info os.FileInfo, err error) error {
23-
if err != nil {
24-
return err
25-
}
26-
if info.IsDir() {
27-
return nil
28-
}
29-
total += info.Size()
30-
return nil
31-
}); err != nil {
32-
return 0, err
29+
size, err := getUsedSize(cm.cfg.Get().RootDir)
30+
if err != nil {
31+
return 0, errors.Wrapf(err, "get used size: %s", cm.cfg.Get().RootDir)
3332
}
34-
return total, nil
33+
34+
return size, nil
3535
}
3636

3737
func (cm *CacheManager) scanModels() error {
38-
staticModels := 0
38+
pvcModels := 0
39+
inlineModels := 0
3940
dynamicModels := 0
4041
volumesDir := cm.cfg.Get().GetVolumesDir()
4142
volumeDirs, err := os.ReadDir(volumesDir)
@@ -45,29 +46,73 @@ func (cm *CacheManager) scanModels() error {
4546
}
4647
return errors.Wrapf(err, "read volume dirs from %s", volumesDir)
4748
}
49+
50+
mountItems := []metrics.MountItem{}
4851
for _, volumeDir := range volumeDirs {
4952
if !volumeDir.IsDir() {
5053
continue
5154
}
52-
if isStaticVolume(volumeDir.Name()) {
53-
staticModels += 1
55+
volumeName := volumeDir.Name()
56+
if isStaticVolume(volumeName) {
57+
statusPath := filepath.Join(volumesDir, volumeName, "status.json")
58+
modelStatus, err := cm.sm.Get(statusPath)
59+
if err == nil {
60+
mountItems = append(mountItems, metrics.MountItem{
61+
Reference: modelStatus.Reference,
62+
Type: mountTypePVC,
63+
VolumeName: volumeName,
64+
MountID: modelStatus.MountID,
65+
})
66+
pvcModels += 1
67+
}
5468
}
55-
if isDynamicVolume(volumeDir.Name()) {
56-
modelsDir := cm.cfg.Get().GetModelsDirForDynamic(volumeDir.Name())
69+
if isDynamicVolume(volumeName) {
70+
modelsDir := cm.cfg.Get().GetModelsDirForDynamic(volumeName)
5771
modelDirs, err := os.ReadDir(modelsDir)
58-
if err != nil {
59-
return errors.Wrapf(err, "read model dirs from %s", modelsDir)
72+
if err != nil {
73+
if os.IsNotExist(err) {
74+
// This is potentially an inline model, the status file is expected
75+
// to be directly under the volume directory.
76+
statusPath := filepath.Join(volumesDir, volumeName, "status.json")
77+
modelStatus, err := cm.sm.Get(statusPath)
78+
if err == nil {
79+
mountItems = append(mountItems, metrics.MountItem{
80+
Reference: modelStatus.Reference,
81+
Type: mountTypeInline,
82+
VolumeName: volumeName,
83+
MountID: modelStatus.MountID,
84+
})
85+
inlineModels += 1
86+
}
87+
continue
88+
}
89+
logger.Logger().WithError(err).Warnf("read model dirs from %s", modelsDir)
90+
continue
6091
}
6192
for _, modelDir := range modelDirs {
6293
if !modelDir.IsDir() {
6394
continue
6495
}
65-
dynamicModels += 1
96+
statusPath := filepath.Join(modelsDir, modelDir.Name(), "status.json")
97+
modelStatus, err := cm.sm.Get(statusPath)
98+
if err == nil {
99+
mountItems = append(mountItems, metrics.MountItem{
100+
Reference: modelStatus.Reference,
101+
Type: mountTypeDynamic,
102+
VolumeName: volumeName,
103+
MountID: modelStatus.MountID,
104+
})
105+
dynamicModels += 1
106+
}
66107
}
67108
}
68109
}
69-
metrics.NodeMountedStaticImages.Set(float64(staticModels))
70-
metrics.NodeMountedDynamicImages.Set(float64(dynamicModels))
110+
111+
metrics.MountItems.Set(mountItems)
112+
metrics.NodeMountedPVCModels.Set(float64(pvcModels))
113+
metrics.NodeMountedInlineModels.Set(float64(inlineModels))
114+
metrics.NodeMountedDynamicModels.Set(float64(dynamicModels))
115+
71116
return nil
72117
}
73118

@@ -87,17 +132,18 @@ func (cm *CacheManager) Scan() error {
87132
return nil
88133
}
89134

90-
func NewCacheManager(cfg *config.Config) (*CacheManager, error) {
135+
func NewCacheManager(cfg *config.Config, sm *status.StatusManager) (*CacheManager, error) {
91136
cm := CacheManager{
92137
cfg: cfg,
138+
sm: sm,
93139
}
94140

95141
go func() {
96142
for {
97143
if err := cm.Scan(); err != nil && !errors.Is(err, os.ErrNotExist) {
98144
logger.Logger().WithError(err).Warnf("scan cache failed")
99145
}
100-
time.Sleep(CacheSacnInterval)
146+
time.Sleep(CacheScanInterval)
101147
}
102148
}()
103149

0 commit comments

Comments
 (0)