Skip to content

Commit 6975b43

Browse files
committed
Optimize metric collection
1 parent 5ca614b commit 6975b43

File tree

4 files changed

+56
-170
lines changed

4 files changed

+56
-170
lines changed

internal/commands/daemon.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ func runDaemon() {
146146
// OTel SDK calls callbacks at CollectionInterval, we collect slightly faster
147147
metricsInterval := time.Duration(cfg.CollectionInterval) * time.Second
148148
if metricsInterval == 0 {
149-
metricsInterval = 15 * time.Second
149+
metricsInterval = 30 * time.Second
150150
}
151151
metricsTicker := time.NewTicker(metricsInterval)
152152
defer metricsTicker.Stop()
@@ -202,7 +202,7 @@ func runDaemon() {
202202
func startMetricsCollection(cfg *config.Config, hostname string) bool {
203203
interval := time.Duration(cfg.CollectionInterval) * time.Second
204204
if interval == 0 {
205-
interval = 15 * time.Second
205+
interval = 30 * time.Second
206206
}
207207

208208
otelCfg := &metrics.OTelConfig{

internal/metrics/collector.go

Lines changed: 18 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ func StartOTelCollector(cfg *OTelConfig) error {
393393

394394
interval := cfg.CollectionInterval
395395
if interval == 0 {
396-
interval = 15 * time.Second
396+
interval = 30 * time.Second
397397
}
398398

399399
meterProvider = sdkmetric.NewMeterProvider(
@@ -1776,10 +1776,12 @@ func collectProcesses(limit int) ([]ProcessInfo, error) {
17761776
continue
17771777
}
17781778

1779-
cpuPercent, _ := p.CPUPercent()
1779+
// Only use MemoryPercent for filtering (non-blocking)
1780+
// Skip CPUPercent - it blocks for 100ms per process!
17801781
memPercent, _ := p.MemoryPercent()
17811782

1782-
if cpuPercent < 0.1 && memPercent < 0.1 {
1783+
// Filter by memory only (processes with < 0.1% memory are not interesting)
1784+
if memPercent < 0.1 {
17831785
continue
17841786
}
17851787

@@ -1788,82 +1790,32 @@ func collectProcesses(limit int) ([]ProcessInfo, error) {
17881790
Name: name,
17891791
}
17901792

1791-
if ppid, err := p.Ppid(); err == nil {
1792-
pi.PPID = int(ppid)
1793-
}
1794-
1795-
pi.CPUPercent = cpuPercent
17961793
pi.MemoryPercent = float64(memPercent)
17971794

1795+
// Minimal syscalls: only cmdline and memory info
17981796
if cmdline, err := p.Cmdline(); err == nil {
17991797
pi.Command = truncateString(cmdline, 200)
18001798
} else {
18011799
pi.Command = name
18021800
}
18031801

1804-
if exe, err := p.Exe(); err == nil {
1805-
pi.Exe = exe
1806-
}
1807-
1808-
if uids, err := p.Uids(); err == nil && len(uids) > 0 {
1809-
pi.UID = uids[0]
1810-
pi.User = fmt.Sprintf("%d", uids[0])
1811-
}
1812-
1813-
if gids, err := p.Gids(); err == nil && len(gids) > 0 {
1814-
pi.GID = gids[0]
1815-
}
1816-
18171802
if memInfo, err := p.MemoryInfo(); err == nil && memInfo != nil {
18181803
pi.MemoryRSS = memInfo.RSS
1819-
pi.MemoryVMS = memInfo.VMS
18201804
}
18211805

18221806
if status, err := p.Status(); err == nil && len(status) > 0 {
18231807
pi.Status = string(status[0])
18241808
}
18251809

1826-
if threads, err := p.NumThreads(); err == nil {
1827-
pi.NumThreads = uint16(threads)
1828-
}
1829-
1830-
if fds, err := p.NumFDs(); err == nil {
1831-
pi.NumFDs = uint32(fds)
1832-
}
1833-
1834-
if ioCounters, err := p.IOCounters(); err == nil && ioCounters != nil {
1835-
pi.IOReadBytes = ioCounters.ReadBytes
1836-
pi.IOWriteBytes = ioCounters.WriteBytes
1837-
}
1838-
1839-
if createTime, err := p.CreateTime(); err == nil {
1840-
pi.CreateTime = createTime / 1000
1841-
}
1842-
1843-
if times, err := p.Times(); err == nil && times != nil {
1844-
pi.CPUTimeUser = times.User
1845-
pi.CPUTimeSystem = times.System
1846-
}
1847-
1848-
if nice, err := p.Nice(); err == nil {
1849-
pi.Nice = int8(nice)
1850-
}
1851-
1852-
if terminal, err := p.Terminal(); err == nil {
1853-
pi.TTY = terminal
1854-
}
1855-
1856-
pi.CPUUsage = pi.CPUPercent
18571810
pi.MemoryUsage = pi.MemoryPercent
18581811
pi.MemoryKB = int64(pi.MemoryRSS / 1024)
1859-
pi.Threads = int(pi.NumThreads)
18601812

18611813
processes = append(processes, pi)
18621814
}
18631815

1864-
// Sort by CPU usage
1816+
// Sort by memory usage (since we don't have CPU anymore)
18651817
sort.Slice(processes, func(i, j int) bool {
1866-
return processes[i].CPUPercent > processes[j].CPUPercent
1818+
return processes[i].MemoryPercent > processes[j].MemoryPercent
18671819
})
18681820

18691821
if len(processes) > limit {
@@ -1890,8 +1842,9 @@ func collectContainers() ([]ContainerMetrics, error) {
18901842
}
18911843

18921844
func collectDockerContainers() ([]ContainerMetrics, error) {
1893-
// Check if docker is available
1894-
cmd := exec.Command("docker", "ps", "-q")
1845+
// Single call to docker stats - gets all running containers at once
1846+
// Skip "docker ps" check - if no containers, stats returns empty
1847+
cmd := exec.Command("docker", "stats", "--no-stream", "--format", "{{json .}}")
18951848
output, err := cmd.Output()
18961849
if err != nil {
18971850
return nil, err
@@ -1901,13 +1854,6 @@ func collectDockerContainers() ([]ContainerMetrics, error) {
19011854
return nil, nil
19021855
}
19031856

1904-
// Get container stats
1905-
cmd = exec.Command("docker", "stats", "--no-stream", "--format", "{{json .}}")
1906-
output, err = cmd.Output()
1907-
if err != nil {
1908-
return nil, err
1909-
}
1910-
19111857
var containers []ContainerMetrics
19121858

19131859
lines := strings.Split(strings.TrimSpace(string(output)), "\n")
@@ -1917,14 +1863,11 @@ func collectDockerContainers() ([]ContainerMetrics, error) {
19171863
}
19181864

19191865
var stats struct {
1920-
ID string `json:"ID"`
1921-
Name string `json:"Name"`
1922-
CPUPerc string `json:"CPUPerc"`
1923-
MemUsage string `json:"MemUsage"`
1924-
MemPerc string `json:"MemPerc"`
1925-
NetIO string `json:"NetIO"`
1926-
BlockIO string `json:"BlockIO"`
1927-
PIDs string `json:"PIDs"`
1866+
ID string `json:"ID"`
1867+
Name string `json:"Name"`
1868+
CPUPerc string `json:"CPUPerc"`
1869+
MemUsage string `json:"MemUsage"`
1870+
MemPerc string `json:"MemPerc"`
19281871
}
19291872

19301873
if err := json.Unmarshal([]byte(line), &stats); err != nil {
@@ -1953,38 +1896,8 @@ func collectDockerContainers() ([]ContainerMetrics, error) {
19531896
containers = append(containers, c)
19541897
}
19551898

1956-
// Get more details with docker inspect
1957-
for i := range containers {
1958-
cmd := exec.Command("docker", "inspect", "--format", "{{json .}}", containers[i].ContainerID)
1959-
output, err := cmd.Output()
1960-
if err != nil {
1961-
continue
1962-
}
1963-
1964-
var inspect struct {
1965-
Config struct {
1966-
Image string `json:"Image"`
1967-
Labels map[string]string `json:"Labels"`
1968-
} `json:"Config"`
1969-
State struct {
1970-
Status string `json:"Status"`
1971-
Health *struct{ Status string } `json:"Health"`
1972-
StartedAt string `json:"StartedAt"`
1973-
ExitCode int16 `json:"ExitCode"`
1974-
} `json:"State"`
1975-
}
1976-
1977-
if err := json.Unmarshal(output, &inspect); err == nil {
1978-
containers[i].ImageName = inspect.Config.Image
1979-
containers[i].Status = inspect.State.Status
1980-
if inspect.State.Health != nil {
1981-
containers[i].Health = inspect.State.Health.Status
1982-
}
1983-
if labels, err := json.Marshal(inspect.Config.Labels); err == nil {
1984-
containers[i].Labels = string(labels)
1985-
}
1986-
}
1987-
}
1899+
// Skip docker inspect for each container - too expensive (N syscalls)
1900+
// Basic stats from "docker stats" are enough for monitoring
19881901

19891902
return containers, nil
19901903
}

internal/metrics/cpu.go

Lines changed: 35 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,13 @@ type CPUMetrics struct {
2121

2222
// CPU state cache for delta-based calculation
2323
var (
24-
lastCpuTimes cpu.TimesStat
25-
lastPerCoreCpuTimes []cpu.TimesStat
26-
lastCpuMetrics CPUMetrics
27-
cpuCacheMu sync.RWMutex
28-
cpuCacheInitialized bool
29-
perCoreCacheInit bool
30-
lastCpuSampleTime time.Time
31-
minCPUSampleInterval = 100 * time.Millisecond // Minimum time between samples for accurate delta
24+
lastCpuTimes cpu.TimesStat
25+
lastPerCoreCpuTimes []cpu.TimesStat
26+
lastCpuMetrics CPUMetrics
27+
cpuCacheMu sync.RWMutex
28+
cpuCacheInitialized bool
29+
perCoreCacheInit bool
30+
lastCpuSampleTime time.Time
3231
)
3332

3433
// init initializes CPU monitoring by storing initial CPU times
@@ -47,81 +46,55 @@ func init() {
4746
}
4847
}
4948

50-
// GetCPUMetrics calculates detailed CPU usage metrics.
51-
// Uses gopsutil's built-in cpu.Percent for accurate cross-platform CPU measurement.
52-
// On first call or when cache is stale, performs a blocking measurement (100ms).
53-
// Subsequent calls within the sample interval return cached values instantly.
49+
// GetCPUMetrics calculates detailed CPU usage metrics using delta-based calculation.
50+
// Uses cpu.Times() which is non-blocking (reads /proc/stat on Linux, sysctl on macOS).
51+
// Calculates CPU usage as delta between current and previous sample.
52+
// First call returns zero values (baseline), subsequent calls return accurate metrics.
5453
func GetCPUMetrics() (CPUMetrics, error) {
5554
cpuCacheMu.Lock()
5655
defer cpuCacheMu.Unlock()
5756

58-
// Check if we have a recent enough measurement WITH valid data
59-
// lastCpuMetrics.Total > 0 ensures we don't return empty cache from init()
60-
if cpuCacheInitialized && lastCpuMetrics.Total > 0 && time.Since(lastCpuSampleTime) < minCPUSampleInterval {
61-
// Return last calculated metrics (avoid too frequent measurements)
62-
return lastCpuMetrics, nil
63-
}
64-
65-
// Use gopsutil's built-in Percent function which handles all platform differences
66-
// This is the most accurate way to measure CPU on macOS/Linux/Windows
67-
percentages, err := cpu.Percent(100*time.Millisecond, false)
68-
if err != nil || len(percentages) == 0 {
57+
// Get current CPU times (non-blocking)
58+
times, err := cpu.Times(false)
59+
if err != nil || len(times) == 0 {
6960
return CPUMetrics{}, err
7061
}
7162

72-
totalCPU := percentages[0]
73-
74-
// Get CPU times for breakdown (user/system/idle/iowait)
75-
times, err := cpu.Times(false)
76-
if err != nil || len(times) == 0 {
77-
// If times fail, at least return total CPU
78-
metrics := CPUMetrics{
79-
Total: totalCPU,
80-
Idle: 100 - totalCPU,
81-
}
82-
lastCpuMetrics = metrics
63+
// First call - just store baseline and return zeros
64+
if !cpuCacheInitialized {
65+
lastCpuTimes = times[0]
8366
lastCpuSampleTime = time.Now()
8467
cpuCacheInitialized = true
85-
return metrics, nil
68+
return CPUMetrics{}, nil
8669
}
8770

88-
// Calculate breakdown if we have previous sample
89-
var metrics CPUMetrics
90-
if cpuCacheInitialized {
91-
t1 := lastCpuTimes
92-
t2 := times[0]
71+
// Calculate delta from previous sample
72+
t1 := lastCpuTimes
73+
t2 := times[0]
9374

94-
t1All, _ := getAllBusy(t1)
95-
t2All, _ := getAllBusy(t2)
96-
totalDelta := t2All - t1All
75+
t1All, t1Busy := getAllBusy(t1)
76+
t2All, t2Busy := getAllBusy(t2)
77+
totalDelta := t2All - t1All
78+
79+
var metrics CPUMetrics
80+
if totalDelta > 0 {
81+
// Calculate total CPU usage
82+
totalCPU := clampPercent((t2Busy - t1Busy) / totalDelta * 100)
9783

98-
if totalDelta > 0 {
99-
metrics = CPUMetrics{
100-
Total: totalCPU, // Use gopsutil's accurate total
101-
User: clampPercent((t2.User - t1.User) / totalDelta * 100),
102-
System: clampPercent((t2.System - t1.System) / totalDelta * 100),
103-
Iowait: clampPercent((t2.Iowait - t1.Iowait) / totalDelta * 100),
104-
Steal: clampPercent((t2.Steal - t1.Steal) / totalDelta * 100),
105-
Idle: clampPercent((t2.Idle - t1.Idle) / totalDelta * 100),
106-
}
107-
} else {
108-
metrics = CPUMetrics{
109-
Total: totalCPU,
110-
Idle: 100 - totalCPU,
111-
}
112-
}
113-
} else {
11484
metrics = CPUMetrics{
115-
Total: totalCPU,
116-
Idle: 100 - totalCPU,
85+
Total: totalCPU,
86+
User: clampPercent((t2.User - t1.User) / totalDelta * 100),
87+
System: clampPercent((t2.System - t1.System) / totalDelta * 100),
88+
Iowait: clampPercent((t2.Iowait - t1.Iowait) / totalDelta * 100),
89+
Steal: clampPercent((t2.Steal - t1.Steal) / totalDelta * 100),
90+
Idle: clampPercent((t2.Idle - t1.Idle) / totalDelta * 100),
11791
}
11892
}
11993

12094
// Update cache
12195
lastCpuTimes = times[0]
12296
lastCpuMetrics = metrics
12397
lastCpuSampleTime = time.Now()
124-
cpuCacheInitialized = true
12598

12699
return metrics, nil
127100
}

internal/otelcol/manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func (m *Manager) extractBinary(r io.Reader) error {
199199
func (m *Manager) GenerateConfig(cfg *Config) error {
200200
interval := cfg.CollectionInterval
201201
if interval == 0 {
202-
interval = 15
202+
interval = 30
203203
}
204204

205205
config := fmt.Sprintf(`# CatOps OTel Collector Configuration

0 commit comments

Comments
 (0)