Skip to content

Commit 457048e

Browse files
committed
Critical fix: Add panic recovery and reduce batch size to prevent OOM
Problem:
- Archive worker silently died after a context timeout at 02:39 on 2025-10-11
- Batch size had been increased to 100 stories (from the original 10)
- Processing 100 stories with large datasets caused OOM
- No panic recovery meant workers could crash silently
- Worker stopped running for days, and the database grew unchecked

Root cause:
- Batch size of 100 stories × 10 parallel workers × large JSON = memory exhaustion
- Context deadline exceeded during JSON generation for large datasets
- No panic recovery meant any panic would kill the worker goroutine
- Worker died silently with no indication in the logs

Fixes:
1. Reduced batch size from 100 to 5 stories (below even the original 10)
   - Processes fewer stories per cycle, but much safer
   - Prevents memory exhaustion
   - Each cycle completes faster, reducing timeout risk
2. Added panic recovery to archiveWorker
   - Logs panic details before exiting
   - Prevents silent failures
3. Added panic recovery to purgeWorker
   - Same protection, for consistency
4. Added panic recovery to pool tasks
   - Prevents one story's panic from crashing the entire batch
   - Failed stories are logged and skipped; others continue

Expected behavior after fix:
- Archive worker processes 5 stories every 5 minutes (safer)
- If a panic occurs, it is logged and visible
- Worker crashes are visible in the logs
- Memory usage stays under control
1 parent 6a17462 commit 457048e

File tree

2 files changed

+29
-3
lines changed

2 files changed

+29
-3
lines changed

archive.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,21 +211,30 @@ func (app app) processArchivingOperations(ctx context.Context) error {
211211
for _, storyID := range storyIDs {
212212
sid := storyID
213213
pool.Submit(func() {
214+
// Recover from panics in worker tasks
215+
defer func() {
216+
if r := recover(); r != nil {
217+
archiveErrorsTotal.Inc()
218+
logger.Error("Archive task panic", fmt.Errorf("panic in story %d: %v", sid, r), "storyID", sid)
219+
results <- archiveResult{storyID: sid, err: fmt.Errorf("panic: %v", r)}
220+
}
221+
}()
222+
214223
// Check context
215224
if err := timeoutCtx.Err(); err != nil {
216225
archiveErrorsTotal.Inc()
217226
results <- archiveResult{storyID: sid, err: errors.Wrap(err, "context cancelled")}
218227
return
219228
}
220-
229+
221230
// Get max score to decide whether to upload to S3
222231
maxScore, err := app.ndb.getMaxScore(timeoutCtx, sid)
223232
if err != nil {
224233
archiveErrorsTotal.Inc()
225234
results <- archiveResult{storyID: sid, err: errors.Wrap(err, "failed to get max score")}
226235
return
227236
}
228-
237+
229238
if maxScore > 2 {
230239
// High-score story: upload to S3 for backup
231240
logger.Debug("Archiving story to S3", "storyID", sid, "maxScore", maxScore)
@@ -260,6 +269,14 @@ func (app app) processArchivingOperations(ctx context.Context) error {
260269
func (app app) archiveWorker(ctx context.Context) {
261270
logger := app.logger
262271

272+
// Recover from panics to prevent worker from dying
273+
defer func() {
274+
if r := recover(); r != nil {
275+
logger.Error("Archive worker panic recovered", fmt.Errorf("panic: %v", r))
276+
// Worker will exit but at least we'll know why
277+
}
278+
}()
279+
263280
logger.Info("Archive worker started")
264281

265282
// Calculate initial delay until next 1-minute mark + 30 seconds
@@ -316,6 +333,14 @@ func (app app) archiveWorker(ctx context.Context) {
316333
func (app app) purgeWorker(ctx context.Context) {
317334
logger := app.logger
318335

336+
// Recover from panics to prevent worker from dying
337+
defer func() {
338+
if r := recover(); r != nil {
339+
logger.Error("Purge worker panic recovered", fmt.Errorf("panic: %v", r))
340+
// Worker will exit but at least we'll know why
341+
}
342+
}()
343+
319344
logger.Info("Purge worker started")
320345

321346
for {

database.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,13 +385,14 @@ func (ndb newsDatabase) selectStoriesToArchive(ctx context.Context) ([]int, erro
385385

386386
// Select old stories regardless of score
387387
// High-score stories will be backed up to S3, low-score just marked for deletion
388+
// Keep batch size small to avoid memory exhaustion
388389
sqlStatement := `
389390
select distinct stories.id
390391
from stories
391392
join dataset on stories.id = dataset.id
392393
where stories.archived = 0
393394
and dataset.sampleTime <= strftime('%s', 'now') - 21*24*60*60
394-
limit 100
395+
limit 5
395396
`
396397

397398
// Check context before query

0 commit comments

Comments
 (0)