Skip to content

Commit e9d5462

Browse files
authored
fix(snmp): fix data race in health status map (#1365)
* fix(snmp): fix data race in health status map * perf(snmp): add load-first --------- Co-authored-by: peter <>
1 parent 0ff3ecd commit e9d5462

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

inputs/snmp/health_check.go

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,21 +33,24 @@ func (ins *Instance) StartHealthMonitor() {
3333
}
3434

3535
func (ins *Instance) checkAgentHealth(i int, agent string) {
36-
status, exists := ins.targetStatus[agent]
36+
val, exists := ins.targetStatus.Load(agent)
3737
if !exists {
38-
ins.targetStatus[agent] = &TargetStatus{healthy: true, lastSeen: time.Now()}
39-
status = ins.targetStatus[agent]
38+
newStatus := &TargetStatus{
39+
healthy: true,
40+
lastSeen: time.Now(),
41+
}
42+
val, _ = ins.targetStatus.LoadOrStore(agent, newStatus)
4043
}
44+
status := val.(*TargetStatus)
4145

4246
// Don't check too frequently if already marked as unhealthy
43-
if !status.healthy {
44-
status.mu.RLock()
45-
timeSinceLastCheck := time.Since(status.lastSeen)
46-
status.mu.RUnlock()
47+
status.mu.RLock()
48+
healthy := status.healthy
49+
timeSinceLastCheck := time.Since(status.lastSeen)
50+
status.mu.RUnlock()
4751

48-
if timeSinceLastCheck < time.Duration(ins.RecoveryInterval) {
49-
return
50-
}
52+
if !healthy && timeSinceLastCheck < time.Duration(ins.RecoveryInterval) {
53+
return
5154
}
5255

5356
// Create a connection with shorter timeout for health checking
@@ -111,12 +114,22 @@ func (ins *Instance) checkAgentHealth(i int, agent string) {
111114
}
112115

113116
func (ins *Instance) markAgentUnhealthy(agent string) {
114-
status, exists := ins.targetStatus[agent]
117+
val, exists := ins.targetStatus.Load(agent)
115118
if !exists {
116-
ins.targetStatus[agent] = &TargetStatus{healthy: false, lastSeen: time.Now(), failCount: ins.MaxFailCount}
117-
return
119+
newStatus := &TargetStatus{
120+
healthy: false,
121+
lastSeen: time.Now(),
122+
failCount: ins.MaxFailCount,
123+
}
124+
var loaded bool
125+
val, loaded = ins.targetStatus.LoadOrStore(agent, newStatus)
126+
if !loaded {
127+
return
128+
}
118129
}
119130

131+
// Existing status found, update it
132+
status := val.(*TargetStatus)
120133
status.mu.Lock()
121134
defer status.mu.Unlock()
122135

@@ -131,11 +144,13 @@ func (ins *Instance) markAgentUnhealthy(agent string) {
131144
}
132145

133146
func (ins *Instance) isAgentHealthy(agent string) bool {
134-
status, exists := ins.targetStatus[agent]
147+
val, exists := ins.targetStatus.Load(agent)
135148
if !exists {
136149
return true // Default to considering it healthy if no status exists
137150
}
138151

152+
// Existing status found, update it
153+
status := val.(*TargetStatus)
139154
status.mu.RLock()
140155
defer status.mu.RUnlock()
141156
return status.healthy

inputs/snmp/instances.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ type Instance struct {
5454
Mappings map[string]map[string]string `toml:"mappings"`
5555

5656
// Track health status of each agent
57-
targetStatus map[string]*TargetStatus
57+
targetStatus sync.Map // key: agent string, value: *TargetStatus
5858
targetStatusInit sync.Once
5959
healthMonitorStarted bool
6060

@@ -120,12 +120,11 @@ func (ins *Instance) Init() error {
120120

121121
// Initialize target status tracking
122122
ins.targetStatusInit.Do(func() {
123-
ins.targetStatus = make(map[string]*TargetStatus)
124123
for _, agent := range ins.Agents {
125-
ins.targetStatus[agent] = &TargetStatus{
124+
ins.targetStatus.Store(agent, &TargetStatus{
126125
healthy: true,
127126
lastSeen: time.Now(),
128-
}
127+
})
129128
}
130129
})
131130

@@ -215,7 +214,8 @@ func (ins *Instance) Gather(slist *types.SampleList) {
215214
if m, ok := ins.Mappings[agent]; ok {
216215
extraTags = m
217216
}
218-
if status, exists := ins.targetStatus[agent]; exists {
217+
if val, exists := ins.targetStatus.Load(agent); exists {
218+
status := val.(*TargetStatus)
219219
status.mu.Lock()
220220
status.healthy = true
221221
status.failCount = 0

0 commit comments

Comments
 (0)