Skip to content

Commit 9f83bb5

Browse files
Thomas StrombergThomas Stromberg
authored andcommitted
lint fixes
1 parent 0453320 commit 9f83bb5

File tree

8 files changed

+144
-134
lines changed

8 files changed

+144
-134
lines changed

cmd/server/main.go

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ import (
3434
func detectGCPProjectID(ctx context.Context) string {
3535
// Try metadata service (works on Cloud Run, GCE, GKE, Cloud Functions)
3636
client := &http.Client{Timeout: 2 * time.Second}
37-
req, err := http.NewRequestWithContext(ctx, "GET",
38-
"http://metadata.google.internal/computeMetadata/v1/project/project-id", nil)
37+
req, err := http.NewRequestWithContext(ctx, http.MethodGet,
38+
"http://metadata.google.internal/computeMetadata/v1/project/project-id", http.NoBody)
3939
if err != nil {
4040
return ""
4141
}
@@ -46,7 +46,11 @@ func detectGCPProjectID(ctx context.Context) string {
4646
slog.Debug("metadata service not available (not running on GCP?)", "error", err)
4747
return ""
4848
}
49-
defer resp.Body.Close()
49+
defer func() {
50+
if err := resp.Body.Close(); err != nil {
51+
slog.Debug("failed to close metadata response body", "error", err)
52+
}
53+
}()
5054

5155
if resp.StatusCode != http.StatusOK {
5256
slog.Debug("metadata service returned non-200", "status", resp.StatusCode)
@@ -75,8 +79,11 @@ const (
7579
)
7680

7781
func main() {
78-
// Configure logging with source locations and PID for better debugging
79-
pid := os.Getpid()
82+
// Configure logging with source locations and instance ID for better debugging
83+
hostname, err := os.Hostname()
84+
if err != nil {
85+
hostname = "unknown"
86+
}
8087
logHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
8188
AddSource: true,
8289
Level: slog.LevelInfo,
@@ -93,8 +100,10 @@ func main() {
93100
return a
94101
},
95102
})
96-
// Create logger with PID as a default attribute
97-
logger := slog.New(logHandler).With("pid", pid)
103+
// Create logger with hostname as a default attribute
104+
// In Cloud Run, hostname uniquely identifies each instance (e.g., slacker-abc123-xyz789)
105+
// This is critical for disambiguating instances during rolling deployments
106+
logger := slog.New(logHandler).With("instance", hostname)
98107
slog.SetDefault(logger)
99108

100109
// Load configuration from environment.
@@ -189,20 +198,23 @@ func run(ctx context.Context, cancel context.CancelFunc, cfg *config.ServerConfi
189198
if datastoreDB != "" && projectID != "" {
190199
slog.Info("initializing Cloud Datastore for persistent state",
191200
"project_id", projectID,
192-
"database", datastoreDB,
193-
"fallback", "JSON files")
201+
"database", datastoreDB)
194202
var err error
195203
stateStore, err = state.NewDatastoreStore(ctx, projectID, datastoreDB)
196204
if err != nil {
197-
slog.Error("failed to initialize Datastore, using JSON only",
205+
// FATAL: If DATASTORE is explicitly configured, fail startup on initialization errors.
206+
// This prevents silent fallback to JSON-only mode which causes duplicate messages
207+
// during rolling deployments (no cross-instance event deduplication).
208+
slog.Error("FATAL: failed to initialize Cloud Datastore - DATASTORE variable is set but initialization failed",
209+
"project_id", projectID,
210+
"database", datastoreDB,
198211
"error", err)
199-
stateStore, err = state.NewJSONStore()
200-
if err != nil {
201-
slog.Error("failed to initialize JSON store", "error", err)
202-
cancel()
203-
return 1
204-
}
212+
cancel()
213+
return 1
205214
}
215+
slog.Info("successfully initialized Cloud Datastore",
216+
"project_id", projectID,
217+
"database", datastoreDB)
206218
} else {
207219
var reason string
208220
if datastoreDB == "" {

internal/bot/bot.go

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -83,20 +83,16 @@ func (tc *ThreadCache) Cleanup(maxAge time.Duration) {
8383

8484
// Coordinator coordinates between GitHub, Slack, and notifications for a single org.
8585
type Coordinator struct {
86-
slack *slackpkg.Client
87-
github *github.Client
88-
configManager *config.Manager
89-
notifier *notify.Manager
90-
userMapper *usermapping.Service
91-
sprinklerURL string
92-
threadCache *ThreadCache // In-memory cache for fast lookups
93-
stateStore StateStore // Persistent state across restarts
94-
workspaceName string // Track workspace name for better logging
95-
processedEvents map[string]time.Time // In-memory event deduplication: "timestamp:url:type" -> processed time
96-
processedEventMu sync.RWMutex
97-
processingEvents map[string]bool // Track events currently being processed (prevents concurrent duplicates)
98-
processingEventMu sync.Mutex
99-
eventSemaphore chan struct{} // Limits concurrent event processing (prevents overwhelming APIs)
86+
slack *slackpkg.Client
87+
github *github.Client
88+
configManager *config.Manager
89+
notifier *notify.Manager
90+
userMapper *usermapping.Service
91+
sprinklerURL string
92+
threadCache *ThreadCache // In-memory cache for fast lookups
93+
stateStore StateStore // Persistent state across restarts
94+
workspaceName string // Track workspace name for better logging
95+
eventSemaphore chan struct{} // Limits concurrent event processing (prevents overwhelming APIs)
10096
}
10197

10298
// StateStore interface for persistent state - allows dependency injection for testing.
@@ -134,9 +130,7 @@ func New(
134130
prThreads: make(map[string]ThreadInfo),
135131
creating: make(map[string]bool),
136132
},
137-
processedEvents: make(map[string]time.Time),
138-
processingEvents: make(map[string]bool),
139-
eventSemaphore: make(chan struct{}, 10), // Allow 10 concurrent events per org
133+
eventSemaphore: make(chan struct{}, 10), // Allow 10 concurrent events per org
140134
}
141135

142136
// Set GitHub client in config manager for this org.
@@ -621,10 +615,13 @@ func (c *Coordinator) handlePullRequestEventWithData(ctx context.Context, owner,
621615
"warning", "DMs are sent async - if instance crashes before completion, another instance may retry and send duplicates")
622616

623617
// Send DMs asynchronously to avoid blocking event processing
624-
// Use a detached context with timeout to allow graceful completion even if parent context is cancelled
618+
// SECURITY NOTE: Use detached context to allow graceful completion of DM notifications
619+
// even if parent context is cancelled during shutdown. Operations are still bounded by
620+
// explicit 15-second timeout, ensuring reasonably fast shutdown while handling slow API calls.
621+
// This pattern prevents incomplete DM delivery while maintaining shutdown security.
625622
// Note: No panic recovery - we want panics to propagate and restart the service (Cloud Run will handle it)
626623
// A quiet failure is worse than a visible crash that triggers automatic recovery
627-
dmCtx, dmCancel := context.WithTimeout(context.WithoutCancel(ctx), 2*time.Minute)
624+
dmCtx, dmCancel := context.WithTimeout(context.WithoutCancel(ctx), 15*time.Second)
628625
go func() {
629626
defer dmCancel()
630627
c.sendDMNotifications(dmCtx, workspaceID, owner, repo, prNumber, uniqueUsers, event, prState)
@@ -1028,8 +1025,11 @@ func (c *Coordinator) processPRForChannel(
10281025
domain := c.configManager.Domain(owner)
10291026
if len(blockedUsers) > 0 {
10301027
// Run email lookups in background to avoid blocking message delivery
1031-
// Use detached context to allow completion even if parent context is cancelled
1032-
lookupCtx, lookupCancel := context.WithTimeout(context.WithoutCancel(ctx), 2*time.Minute)
1028+
// SECURITY NOTE: Use detached context to complete email lookups even during shutdown.
1029+
// Operations bounded by 15-second timeout. This ensures reasonably fast shutdown while
1030+
// completing active lookups for accurate DM delivery (most lookups hit cache instantly,
1031+
// but occasional cold lookups can take 10+ seconds).
1032+
lookupCtx, lookupCancel := context.WithTimeout(context.WithoutCancel(ctx), 15*time.Second)
10331033
go func() {
10341034
defer lookupCancel()
10351035
for _, githubUser := range blockedUsers {
@@ -1310,8 +1310,11 @@ func (c *Coordinator) createPRThread(ctx context.Context, channel, owner, repo s
13101310
// This avoids blocking thread creation on slow email lookups (13-20 seconds each)
13111311
domain := c.configManager.Domain(owner)
13121312
if checkResult != nil && len(checkResult.Analysis.NextAction) > 0 {
1313-
// Use detached context to allow completion even if parent context is cancelled
1314-
enrichCtx, enrichCancel := context.WithTimeout(context.WithoutCancel(ctx), 2*time.Minute)
1313+
// SECURITY NOTE: Use detached context to complete message enrichment even during shutdown.
1314+
// Operations bounded by 15-second timeout. This ensures reasonably fast shutdown while
1315+
// completing active message updates (most lookups hit cache instantly, but occasional
1316+
// cold lookups can take 10+ seconds).
1317+
enrichCtx, enrichCancel := context.WithTimeout(context.WithoutCancel(ctx), 15*time.Second)
13151318
// Capture variables to avoid data race
13161319
capturedThreadTS := threadTS
13171320
capturedOwner := owner

internal/bot/bot_sprinkler.go

Lines changed: 32 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -36,87 +36,35 @@ func parsePRNumberFromURL(url string) (int, error) {
3636
return num, nil
3737
}
3838

39-
// handleSprinklerEvent processes a single event from sprinkler.
40-
func (c *Coordinator) handleSprinklerEvent(ctx context.Context, event client.Event, organization string) {
41-
// Deduplicate events using delivery_id if available, otherwise fall back to timestamp + URL + type
42-
// delivery_id is unique per GitHub webhook and is the same across all instances receiving the event
43-
var eventKey string
39+
// eventKey generates a unique key for event deduplication.
40+
// Uses delivery_id if available (GitHub's unique webhook ID),
41+
// otherwise falls back to timestamp + URL + type.
42+
func eventKey(event client.Event) string {
4443
if event.Raw != nil {
45-
if deliveryID, ok := event.Raw["delivery_id"].(string); ok && deliveryID != "" {
46-
eventKey = deliveryID
44+
if id, ok := event.Raw["delivery_id"].(string); ok && id != "" {
45+
return id
4746
}
4847
}
49-
if eventKey == "" {
50-
// Fallback to timestamp-based key if delivery_id not available
51-
eventKey = fmt.Sprintf("%s:%s:%s", event.Timestamp.Format(time.RFC3339Nano), event.URL, event.Type)
52-
}
53-
54-
// Check persistent state first (survives restarts)
55-
if c.stateStore.WasProcessed(eventKey) {
56-
slog.Info("skipping duplicate event (persistent check)",
57-
"organization", organization,
58-
"type", event.Type,
59-
"url", event.URL,
60-
"timestamp", event.Timestamp,
61-
"event_key", eventKey)
62-
return
63-
}
64-
65-
// Check if this event is currently being processed (prevents concurrent duplicates)
66-
// This is critical when sprinkler delivers the same event twice in quick succession
67-
c.processingEventMu.Lock()
68-
if c.processingEvents[eventKey] {
69-
c.processingEventMu.Unlock()
70-
slog.Info("skipping duplicate event (currently processing)",
71-
"organization", organization,
72-
"type", event.Type,
73-
"url", event.URL,
74-
"timestamp", event.Timestamp,
75-
"event_key", eventKey)
76-
return
77-
}
78-
// Mark as currently processing
79-
c.processingEvents[eventKey] = true
80-
c.processingEventMu.Unlock()
81-
82-
// Ensure we clean up the processing flag when done
83-
defer func() {
84-
c.processingEventMu.Lock()
85-
delete(c.processingEvents, eventKey)
86-
c.processingEventMu.Unlock()
87-
}()
48+
return fmt.Sprintf("%s:%s:%s", event.Timestamp.Format(time.RFC3339Nano), event.URL, event.Type)
49+
}
8850

89-
// Also check in-memory for fast deduplication during normal operation
90-
c.processedEventMu.Lock()
91-
if processedTime, exists := c.processedEvents[eventKey]; exists {
92-
c.processedEventMu.Unlock()
93-
slog.Info("skipping duplicate event (memory check)",
51+
// handleSprinklerEvent processes a single event from sprinkler.
52+
func (c *Coordinator) handleSprinklerEvent(ctx context.Context, event client.Event, organization string) {
53+
// Generate event key using delivery_id if available, otherwise timestamp + URL + type.
54+
// delivery_id is unique per GitHub webhook and is the same across all instances.
55+
eventKey := eventKey(event)
56+
57+
// Try to claim this event atomically using persistent store (Datastore transaction).
58+
// This is the single source of truth for cross-instance deduplication.
59+
if err := c.stateStore.MarkProcessed(eventKey, 24*time.Hour); err != nil {
60+
slog.Info("skipping duplicate event",
9461
"organization", organization,
9562
"type", event.Type,
9663
"url", event.URL,
97-
"timestamp", event.Timestamp,
98-
"first_processed", processedTime,
99-
"event_key", eventKey)
64+
"event_key", eventKey,
65+
"reason", "already_processed")
10066
return
10167
}
102-
c.processedEvents[eventKey] = time.Now()
103-
104-
// Cleanup old in-memory events (older than 1 hour - persistent store handles long-term)
105-
cutoff := time.Now().Add(-1 * time.Hour)
106-
cleanedCount := 0
107-
for key, processedTime := range c.processedEvents {
108-
if processedTime.Before(cutoff) {
109-
delete(c.processedEvents, key)
110-
cleanedCount++
111-
}
112-
}
113-
if cleanedCount > 0 {
114-
slog.Debug("cleaned up old in-memory processed events",
115-
"organization", organization,
116-
"removed_count", cleanedCount,
117-
"remaining_count", len(c.processedEvents))
118-
}
119-
c.processedEventMu.Unlock()
12068

12169
slog.Info("accepted event for async processing",
12270
"organization", organization,
@@ -204,18 +152,17 @@ func (c *Coordinator) handleSprinklerEvent(ctx context.Context, event client.Eve
204152
"type", event.Type,
205153
"url", event.URL,
206154
"repo", repo)
207-
// Don't mark as processed if processing failed - allow retry
155+
// Event already marked as processed before goroutine started.
156+
// Failed processing won't be retried automatically.
157+
// This is intentional - we don't want infinite retries of broken events.
208158
return
209159
}
210160

211-
// Mark event as processed in persistent state (survives restarts)
212-
if err := c.stateStore.MarkProcessed(eventKey, 24*time.Hour); err != nil {
213-
slog.Warn("failed to mark event as processed",
214-
"organization", organization,
215-
"event_key", eventKey,
216-
"error", err)
217-
// Continue anyway - in-memory dedup will prevent immediate duplicates
218-
}
161+
slog.Info("successfully processed sprinkler event",
162+
"organization", organization,
163+
"type", event.Type,
164+
"url", event.URL,
165+
"event_key", eventKey)
219166
}() // Close the goroutine
220167
}
221168

@@ -289,7 +236,10 @@ func (c *Coordinator) RunWithSprinklerClient(ctx context.Context) error {
289236
"organization", organization)
290237
},
291238
OnEvent: func(event client.Event) {
292-
// Use background context for event processing to avoid losing events during shutdown.
239+
// SECURITY NOTE: Use detached context for event processing to prevent webhook
240+
// events from being lost during shutdown. Event processing has internal timeouts
241+
// (30s for turnclient, semaphore limits) to prevent resource exhaustion.
242+
// This ensures all GitHub events are processed reliably while maintaining security.
293243
// Note: No panic recovery - we want panics to propagate and restart the service.
294244
eventCtx := context.WithoutCancel(ctx)
295245
c.handleSprinklerEvent(eventCtx, event, organization)

internal/github/github.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,10 +215,16 @@ func (c *Client) authenticate(ctx context.Context) error {
215215
slog.Debug("token validated successfully (repo list check)")
216216
}
217217

218+
// Log minimal token info to reduce exposure in logs (security best practice)
219+
tokenStr := token.GetToken()
220+
tokenSuffix := "..."
221+
if len(tokenStr) >= 4 {
222+
tokenSuffix = "..." + tokenStr[len(tokenStr)-4:]
223+
}
218224
slog.Info("successfully authenticated GitHub App",
219225
"app_id", c.appID,
220-
"token_length", len(token.GetToken()),
221-
"token_prefix", token.GetToken()[:min(10, len(token.GetToken()))]+"...",
226+
"token_length", len(tokenStr),
227+
"token_suffix", tokenSuffix,
222228
"token_expires_at", token.GetExpiresAt())
223229
return nil
224230
}

internal/notify/daily.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,4 @@ func (d *DailyDigestScheduler) CheckAndSend(ctx context.Context) {
8686
// - User mapping from GitHub to Slack
8787
// - PR analysis with turnclient
8888
// - Timezone-aware message delivery
89-
// - Deduplication with existing DM notifications
89+
// - Deduplication with existing DM notifications.

internal/state/datastore.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ const (
2727
kindNotify = "SlackerNotification"
2828
)
2929

30+
// ErrAlreadyProcessed indicates an event was already processed by another instance.
31+
// This is used for cross-instance deduplication during rolling deployments.
32+
var ErrAlreadyProcessed = errors.New("event already processed by another instance")
33+
3034
// Thread entity for Datastore.
3135
type threadEntity struct {
3236
ThreadTS string `datastore:"thread_ts"`
@@ -386,9 +390,9 @@ func (s *DatastoreStore) WasProcessed(eventKey string) bool {
386390
}
387391

388392
// MarkProcessed marks an event as processed (distributed coordination).
389-
// Returns true if successfully marked, false if already marked by another instance.
393+
// Returns error if already processed by another instance (enables race detection).
390394
func (s *DatastoreStore) MarkProcessed(eventKey string, ttl time.Duration) error {
391-
// Mark in JSON
395+
// Mark in JSON first for fast local lookups
392396
if err := s.json.MarkProcessed(eventKey, ttl); err != nil {
393397
slog.Warn("failed to mark event in JSON", "error", err)
394398
}
@@ -412,7 +416,7 @@ func (s *DatastoreStore) MarkProcessed(eventKey string, ttl time.Duration) error
412416

413417
// Already exists - another instance processed it
414418
if err == nil {
415-
return errors.New("event already processed")
419+
return ErrAlreadyProcessed
416420
}
417421

418422
// Not found - safe to insert
@@ -428,14 +432,19 @@ func (s *DatastoreStore) MarkProcessed(eventKey string, ttl time.Duration) error
428432
// Other error
429433
return err
430434
})
431-
432-
if err != nil && err.Error() != "event already processed" {
435+
// Return the error to caller so they can detect race condition
436+
if err != nil {
437+
if errors.Is(err, ErrAlreadyProcessed) {
438+
// This is expected during rolling deployments - return error to caller
439+
return err
440+
}
441+
// Unexpected error - log but don't fail processing
433442
slog.Warn("failed to mark event in Datastore",
434443
"event", eventKey,
435444
"error", err)
436445
}
437446

438-
return nil
447+
return err
439448
}
440449

441450
// GetLastNotification retrieves when a PR was last notified about.

0 commit comments

Comments
 (0)