Skip to content

Commit 8f0fcd6

Browse files
Thomas StrombergThomas Stromberg
authored andcommitted
Reliability improvements - logging, races
1 parent f070ef4 commit 8f0fcd6

File tree

7 files changed

+80
-63
lines changed

7 files changed

+80
-63
lines changed

cmd/server/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ func run(ctx context.Context, cancel context.CancelFunc, cfg *config.ServerConfi
145145
"startup_message", "Starting Slacker server...")
146146

147147
// Initialize config manager for repo configs.
148-
configManager := config.New(ctx)
148+
configManager := config.New()
149149

150150
// Initialize GitHub installation manager.
151151
githubManager, err := github.NewManager(ctx, cfg.GitHubAppID, cfg.GitHubPrivateKey, cfg.AllowPersonalAccounts)

go.mod

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@ module github.com/codeGROOVE-dev/slacker
33
go 1.25.1
44

55
require (
6-
cloud.google.com/go/datastore v1.20.0
6+
cloud.google.com/go/datastore v1.21.0
77
github.com/codeGROOVE-dev/gh-mailto v0.0.0-20251019162917-c3412c017b1f
88
github.com/codeGROOVE-dev/gsm v0.0.0-20251019065141-833fe2363d22
99
github.com/codeGROOVE-dev/retry v1.2.0
10-
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020140418-efb533e2ff51
10+
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020171924-1aac68f58e14
1111
github.com/codeGROOVE-dev/turnclient v0.0.0-20251018202306-7cdc0d51856e
1212
github.com/golang-jwt/jwt/v5 v5.3.0
1313
github.com/google/go-github/v50 v50.2.0
@@ -48,9 +48,9 @@ require (
4848
golang.org/x/sys v0.37.0 // indirect
4949
golang.org/x/text v0.30.0 // indirect
5050
google.golang.org/api v0.252.0 // indirect
51-
google.golang.org/genproto v0.0.0-20251014184007-4626949a642f // indirect
52-
google.golang.org/genproto/googleapis/api v0.0.0-20251014184007-4626949a642f // indirect
53-
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f // indirect
51+
google.golang.org/genproto v0.0.0-20251020155222-88f65dc88635 // indirect
52+
google.golang.org/genproto/googleapis/api v0.0.0-20251020155222-88f65dc88635 // indirect
53+
google.golang.org/genproto/googleapis/rpc v0.0.0-20251020155222-88f65dc88635 // indirect
5454
google.golang.org/grpc v1.76.0 // indirect
5555
google.golang.org/protobuf v1.36.10 // indirect
5656
)

go.sum

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIi
66
cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c=
77
cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
88
cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
9-
cloud.google.com/go/datastore v1.20.0 h1:NNpXoyEqIJmZFc0ACcwBEaXnmscUpcG4NkKnbCePmiM=
10-
cloud.google.com/go/datastore v1.20.0/go.mod h1:uFo3e+aEpRfHgtp5pp0+6M0o147KoPaYNaPAKpfh8Ew=
9+
cloud.google.com/go/datastore v1.21.0 h1:dUrYq47ysCA4nM7u8kRT0WnbfXc6TzX49cP3TCwIiA0=
10+
cloud.google.com/go/datastore v1.21.0/go.mod h1:9l+KyAHO+YVVcdBbNQZJu8svF17Nw5sMKuFR0LYf1nY=
1111
github.com/ProtonMail/go-crypto v1.3.0 h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw=
1212
github.com/ProtonMail/go-crypto v1.3.0/go.mod h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE=
1313
github.com/cloudflare/circl v1.6.1 h1:zqIqSPIndyBh1bjLVVDHMPpVKqp8Su/V+6MeDzzQBQ0=
@@ -22,8 +22,8 @@ github.com/codeGROOVE-dev/prx v0.0.0-20251016165946-00c6c6e90c29 h1:MSBy3Ywr3ky/
2222
github.com/codeGROOVE-dev/prx v0.0.0-20251016165946-00c6c6e90c29/go.mod h1:7qLbi18baOyS8yO/6/64SBIqtyzSzLFdsDST15NPH3w=
2323
github.com/codeGROOVE-dev/retry v1.2.0 h1:xYpYPX2PQZmdHwuiQAGGzsBm392xIMl4nfMEFApQnu8=
2424
github.com/codeGROOVE-dev/retry v1.2.0/go.mod h1:8OgefgV1XP7lzX2PdKlCXILsYKuz6b4ZpHa/20iLi8E=
25-
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020140418-efb533e2ff51 h1:oPVbUoZ1jxgmrqybgRCfhwdT8KaXE/hzQ4vAswRybt0=
26-
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020140418-efb533e2ff51/go.mod h1:/kd3ncsRNldD0MUpbtp5ojIzfCkyeXB7JdOrpuqG7Gg=
25+
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020171924-1aac68f58e14 h1:AKcULaDrbhKDkf6vpWGo36iyLoiOVhLu1MFcnNmDbWg=
26+
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251020171924-1aac68f58e14/go.mod h1:/kd3ncsRNldD0MUpbtp5ojIzfCkyeXB7JdOrpuqG7Gg=
2727
github.com/codeGROOVE-dev/turnclient v0.0.0-20251018202306-7cdc0d51856e h1:3qoY6h8SgoeNsIYRM7P6PegTXAHPo8OSOapUunVP/Gs=
2828
github.com/codeGROOVE-dev/turnclient v0.0.0-20251018202306-7cdc0d51856e/go.mod h1:fYwtN9Ql6lY8t2WvCfENx+mP5FUwjlqwXCLx9CVLY20=
2929
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
@@ -118,12 +118,12 @@ gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
118118
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
119119
google.golang.org/api v0.252.0 h1:xfKJeAJaMwb8OC9fesr369rjciQ704AjU/psjkKURSI=
120120
google.golang.org/api v0.252.0/go.mod h1:dnHOv81x5RAmumZ7BWLShB/u7JZNeyalImxHmtTHxqw=
121-
google.golang.org/genproto v0.0.0-20251014184007-4626949a642f h1:vLd1CJuJOUgV6qijD7KT5Y2ZtC97ll4dxjTUappMnbo=
122-
google.golang.org/genproto v0.0.0-20251014184007-4626949a642f/go.mod h1:PI3KrSadr00yqfv6UDvgZGFsmLqeRIwt8x4p5Oo7CdM=
123-
google.golang.org/genproto/googleapis/api v0.0.0-20251014184007-4626949a642f h1:OiFuztEyBivVKDvguQJYWq1yDcfAHIID/FVrPR4oiI0=
124-
google.golang.org/genproto/googleapis/api v0.0.0-20251014184007-4626949a642f/go.mod h1:kprOiu9Tr0JYyD6DORrc4Hfyk3RFXqkQ3ctHEum3ZbM=
125-
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f h1:1FTH6cpXFsENbPR5Bu8NQddPSaUUE6NA2XdZdDSAJK4=
126-
google.golang.org/genproto/googleapis/rpc v0.0.0-20251014184007-4626949a642f/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
121+
google.golang.org/genproto v0.0.0-20251020155222-88f65dc88635 h1:I5FLgnlmGA5voD3BZp9Rc17FGiius/DlMB3WsJ1C4Xw=
122+
google.golang.org/genproto v0.0.0-20251020155222-88f65dc88635/go.mod h1:1Ic78BnpzY8OaTCmzxJDP4qC9INZPbGZl+54RKjtyeI=
123+
google.golang.org/genproto/googleapis/api v0.0.0-20251020155222-88f65dc88635 h1:1wvBeYv+A2zfEbxROscJl69OP0m74S8wGEO+Syat26o=
124+
google.golang.org/genproto/googleapis/api v0.0.0-20251020155222-88f65dc88635/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
125+
google.golang.org/genproto/googleapis/rpc v0.0.0-20251020155222-88f65dc88635 h1:3uycTxukehWrxH4HtPRtn1PDABTU331ViDjyqrUbaog=
126+
google.golang.org/genproto/googleapis/rpc v0.0.0-20251020155222-88f65dc88635/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
127127
google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A=
128128
google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c=
129129
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=

internal/bot/bot_sprinkler.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -278,19 +278,26 @@ func (c *Coordinator) RunWithSprinklerClient(ctx context.Context) error {
278278
NoReconnect: false, // Enable automatic reconnection
279279
PingInterval: 0, // Use default (30 seconds)
280280
OnConnect: func() {
281-
slog.Info("sprinkler client connected",
281+
slog.Warn("🟢 SPRINKLER CONNECTED",
282282
"organization", organization,
283-
"url", c.sprinklerURL)
283+
"url", c.sprinklerURL,
284+
"subscribed_events", "*",
285+
"critical", "now receiving real-time webhook events")
284286
},
285287
OnDisconnect: func(err error) {
286288
if err != nil {
287-
slog.Error("sprinkler client disconnected",
289+
slog.Error("🔴 SPRINKLER DISCONNECTED - WILL MISS EVENTS UNTIL RECONNECTED",
288290
"organization", organization,
289-
"error", err)
291+
"error", err,
292+
"impact", "real-time webhook events will be missed",
293+
"fallback", "5-minute polling will catch missed events",
294+
"action_required", "investigate why connection dropped")
290295
return
291296
}
292-
slog.Info("sprinkler client disconnected normally",
293-
"organization", organization)
297+
slog.Warn("🟡 SPRINKLER DISCONNECTED (graceful)",
298+
"organization", organization,
299+
"reason", "clean shutdown or reconnection attempt",
300+
"impact", "may miss events during reconnection window")
294301
},
295302
OnEvent: func(event client.Event) {
296303
// SECURITY NOTE: Use detached context for event processing to prevent webhook
@@ -319,6 +326,10 @@ func (c *Coordinator) RunWithSprinklerClient(ctx context.Context) error {
319326
cleanupTicker := time.NewTicker(6 * time.Hour)
320327
defer cleanupTicker.Stop()
321328

329+
// Connection health monitoring - log every minute to detect silent disconnections
330+
healthTicker := time.NewTicker(1 * time.Minute)
331+
defer healthTicker.Stop()
332+
322333
go func() {
323334
for {
324335
select {
@@ -327,6 +338,12 @@ func (c *Coordinator) RunWithSprinklerClient(ctx context.Context) error {
327338
case <-cleanupTicker.C:
328339
c.threadCache.Cleanup(30 * 24 * time.Hour)
329340
slog.Debug("cleaned up old thread cache entries", "organization", organization)
341+
case <-healthTicker.C:
342+
// Log connection health every minute
343+
// This helps detect if we're silently disconnected for long periods
344+
slog.Debug("sprinkler connection health check",
345+
"organization", organization,
346+
"note", "if you see this log but no events for >2min during active PR work, connection may be broken")
330347
}
331348
}
332349
}()

internal/bot/polling.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -412,11 +412,12 @@ func (c *Coordinator) StartupReconciliation(ctx context.Context) {
412412

413413
// Determine if we should notify
414414
var reason string
415-
if lastNotified.IsZero() {
415+
switch {
416+
case lastNotified.IsZero():
416417
reason = "never_notified"
417-
} else if pr.UpdatedAt.After(lastNotified) {
418+
case pr.UpdatedAt.After(lastNotified):
418419
reason = "updated_since_last_notification"
419-
} else {
420+
default:
420421
skippedCount++
421422
slog.Debug("skipping PR - already notified and not updated",
422423
"pr", fmt.Sprintf("%s/%s#%d", pr.Owner, pr.Repo, pr.Number),

internal/config/config.go

Lines changed: 35 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"net/http"
1010
"strings"
1111
"sync"
12+
"sync/atomic"
1213
"time"
1314

1415
"github.com/codeGROOVE-dev/retry"
@@ -59,12 +60,13 @@ type configCacheEntry struct {
5960
}
6061

6162
// configCache manages configuration caching with TTL and thread safety.
63+
// Statistics counters use atomic operations to avoid races during concurrent reads.
6264
type configCache struct {
6365
entries map[string]configCacheEntry
6466
ttl time.Duration
6567
mu sync.RWMutex
66-
hits int64
67-
misses int64
68+
hits atomic.Int64 // Atomic to avoid race during concurrent get() calls
69+
misses atomic.Int64 // Atomic to avoid race during concurrent get() calls
6870
}
6971

7072
// get retrieves a cached configuration if it exists and is not expired.
@@ -74,16 +76,16 @@ func (c *configCache) get(org string) (*RepoConfig, bool) {
7476

7577
entry, exists := c.entries[org]
7678
if !exists {
77-
c.misses++
79+
c.misses.Add(1)
7880
return nil, false
7981
}
8082

8183
if time.Since(entry.timestamp) > c.ttl {
82-
c.misses++
84+
c.misses.Add(1)
8385
return nil, false
8486
}
8587

86-
c.hits++
88+
c.hits.Add(1)
8789
return entry.config, true
8890
}
8991

@@ -117,10 +119,9 @@ func (c *configCache) invalidateAll() {
117119
}
118120

119121
// stats returns cache statistics.
122+
// No lock needed - atomic reads are inherently thread-safe.
120123
func (c *configCache) stats() (hits, misses int64) {
121-
c.mu.RLock()
122-
defer c.mu.RUnlock()
123-
return c.hits, c.misses
124+
return c.hits.Load(), c.misses.Load()
124125
}
125126

126127
// Manager manages repository configurations.
@@ -133,7 +134,7 @@ type Manager struct {
133134
}
134135

135136
// New creates a new config manager.
136-
func New(ctx context.Context) *Manager {
137+
func New() *Manager {
137138
return &Manager{
138139
configs: make(map[string]*RepoConfig),
139140
clients: make(map[string]*github.Client),
@@ -183,7 +184,7 @@ func createDefaultConfig() *RepoConfig {
183184
// LoadConfig loads the configuration for a GitHub org with retry logic.
184185
func (m *Manager) LoadConfig(ctx context.Context, org string) error {
185186
// Check cache first
186-
if cachedConfig, found := m.cache.get(org); found {
187+
if cfg, found := m.cache.get(org); found {
187188
hits, misses := m.cache.stats()
188189
slog.Debug("using cached config for organization",
189190
logFieldOrg, org,
@@ -192,7 +193,7 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error {
192193
"cache_hit_ratio", float64(hits)/float64(hits+misses))
193194

194195
m.mu.Lock()
195-
m.configs[org] = cachedConfig
196+
m.configs[org] = cfg
196197
m.mu.Unlock()
197198
return nil
198199
}
@@ -284,9 +285,9 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error {
284285
)
285286
if err != nil {
286287
// Use default empty config if not found
287-
defaultConfig := createDefaultConfig()
288-
m.configs[org] = defaultConfig
289-
m.cache.set(org, defaultConfig)
288+
cfg := createDefaultConfig()
289+
m.configs[org] = cfg
290+
m.cache.set(org, cfg)
290291

291292
hits, misses := m.cache.stats()
292293
slog.Info("using default configuration for org",
@@ -307,9 +308,9 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error {
307308

308309
var config RepoConfig
309310
if err := yaml.Unmarshal([]byte(configContent), &config); err != nil {
310-
defaultConfig := createDefaultConfig()
311-
m.configs[org] = defaultConfig
312-
m.cache.set(org, defaultConfig)
311+
cfg := createDefaultConfig()
312+
m.configs[org] = cfg
313+
m.cache.set(org, cfg)
313314

314315
hits, misses := m.cache.stats()
315316
slog.Error("failed to parse YAML configuration - using defaults",
@@ -331,31 +332,29 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error {
331332
"email_domain", config.Global.EmailDomain)
332333

333334
// Count channel configurations
334-
mutedChannels := 0
335-
totalRepos := 0
336-
wildcardChannels := 0
337-
for channelName, channelConfig := range config.Channels {
338-
if channelConfig.Mute {
339-
mutedChannels++
335+
var muted, repos, wildcard int
336+
for name, ch := range config.Channels {
337+
if ch.Mute {
338+
muted++
340339
}
341-
totalRepos += len(channelConfig.Repos)
340+
repos += len(ch.Repos)
342341

343-
hasWildcard := false
344-
for _, repo := range channelConfig.Repos {
342+
hasWild := false
343+
for _, repo := range ch.Repos {
345344
if repo == "*" {
346-
wildcardChannels++
347-
hasWildcard = true
345+
wildcard++
346+
hasWild = true
348347
break
349348
}
350349
}
351350

352351
slog.Debug("channel configuration loaded",
353352
logFieldOrg, org,
354-
"channel", channelName,
355-
"repos_count", len(channelConfig.Repos),
356-
"repos", channelConfig.Repos,
357-
"muted", channelConfig.Mute,
358-
"has_wildcard", hasWildcard)
353+
"channel", name,
354+
"repos_count", len(ch.Repos),
355+
"repos", ch.Repos,
356+
"muted", ch.Mute,
357+
"has_wildcard", hasWild)
359358
}
360359

361360
m.configs[org] = &config
@@ -371,9 +370,9 @@ func (m *Manager) LoadConfig(ctx context.Context, org string) error {
371370
"email_domain": config.Global.EmailDomain,
372371
"daily_reminders": config.Global.DailyReminders,
373372
"total_channels": len(config.Channels),
374-
"muted_channels": mutedChannels,
375-
"wildcard_channels": wildcardChannels,
376-
"total_repo_mappings": totalRepos,
373+
"muted_channels": muted,
374+
"wildcard_channels": wildcard,
375+
"total_repo_mappings": repos,
377376
},
378377
"cached", true,
379378
"cache_hits", hits,

pkg/home/fetcher.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,9 @@ func (f *Fetcher) enrichPRs(ctx context.Context, prs []PR, githubUsername string
232232
checkResult, err = turnClient.Check(ctx, pr.URL, f.botUsername, pr.LastEventTime)
233233
return err
234234
},
235-
retry.Attempts(3), // Fewer attempts for per-PR enrichment
235+
retry.Attempts(5),
236236
retry.Delay(500*time.Millisecond),
237-
retry.MaxDelay(30*time.Second),
237+
retry.MaxDelay(2*time.Minute),
238238
retry.DelayType(retry.BackOffDelay),
239239
retry.MaxJitter(time.Second),
240240
retry.Context(ctx),

0 commit comments

Comments
 (0)