Skip to content

Commit 457048e

Browse files
committed
Critical fix: Add panic recovery and reduce batch size to prevent OOM
Problem:
- Archive worker silently died after a context timeout at 02:39 on 2025-10-11
- Batch size had been increased to 100 stories (from the original 10)
- Processing 100 stories with large datasets caused OOM
- No panic recovery meant workers could crash silently
- Worker stopped running for days, and the database grew unchecked

Root cause:
- Batch size of 100 stories × 10 parallel workers × large JSON = memory exhaustion
- Context deadline exceeded during JSON generation for large datasets
- No panic recovery meant any panic would kill the worker goroutine
- Worker died silently with no indication in the logs

Fixes:
1. Reduced batch size from 100 to 5 stories (below even the original 10)
   - Processes fewer stories per cycle, but much safer
   - Prevents memory exhaustion
   - Each cycle completes faster, reducing timeout risk
2. Added panic recovery to archiveWorker
   - Logs panic details before exiting
   - Prevents silent failures
3. Added panic recovery to purgeWorker
   - Same protection, for consistency
4. Added panic recovery to pool tasks
   - Prevents one story's panic from crashing the entire batch
   - Failed stories are logged and skipped; others continue

Expected behavior after fix:
- Archive worker processes 5 stories every 5 minutes (safer)
- If a panic occurs, it is logged and visible
- Worker crashes are visible in the logs
- Memory usage stays under control
1 parent 6a17462 commit 457048e

File tree

2 files changed

+29
-3
lines changed

2 files changed

+29
-3
lines changed

archive.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,21 +211,30 @@ func (app app) processArchivingOperations(ctx context.Context) error {
211211
for _, storyID := range storyIDs {
212212
sid := storyID
213213
pool.Submit(func() {
214+
// Recover from panics in worker tasks
215+
defer func() {
216+
if r := recover(); r != nil {
217+
archiveErrorsTotal.Inc()
218+
logger.Error("Archive task panic", fmt.Errorf("panic in story %d: %v", sid, r), "storyID", sid)
219+
results <- archiveResult{storyID: sid, err: fmt.Errorf("panic: %v", r)}
220+
}
221+
}()
222+
214223
// Check context
215224
if err := timeoutCtx.Err(); err != nil {
216225
archiveErrorsTotal.Inc()
217226
results <- archiveResult{storyID: sid, err: errors.Wrap(err, "context cancelled")}
218227
return
219228
}
220-
229+
221230
// Get max score to decide whether to upload to S3
222231
maxScore, err := app.ndb.getMaxScore(timeoutCtx, sid)
223232
if err != nil {
224233
archiveErrorsTotal.Inc()
225234
results <- archiveResult{storyID: sid, err: errors.Wrap(err, "failed to get max score")}
226235
return
227236
}
228-
237+
229238
if maxScore > 2 {
230239
// High-score story: upload to S3 for backup
231240
logger.Debug("Archiving story to S3", "storyID", sid, "maxScore", maxScore)
@@ -260,6 +269,14 @@ func (app app) processArchivingOperations(ctx context.Context) error {
260269
func (app app) archiveWorker(ctx context.Context) {
261270
logger := app.logger
262271

272+
// Recover from panics to prevent worker from dying
273+
defer func() {
274+
if r := recover(); r != nil {
275+
logger.Error("Archive worker panic recovered", fmt.Errorf("panic: %v", r))
276+
// Worker will exit but at least we'll know why
277+
}
278+
}()
279+
263280
logger.Info("Archive worker started")
264281

265282
// Calculate initial delay until next 1-minute mark + 30 seconds
@@ -316,6 +333,14 @@ func (app app) archiveWorker(ctx context.Context) {
316333
func (app app) purgeWorker(ctx context.Context) {
317334
logger := app.logger
318335

336+
// Recover from panics to prevent worker from dying
337+
defer func() {
338+
if r := recover(); r != nil {
339+
logger.Error("Purge worker panic recovered", fmt.Errorf("panic: %v", r))
340+
// Worker will exit but at least we'll know why
341+
}
342+
}()
343+
319344
logger.Info("Purge worker started")
320345

321346
for {

database.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,13 +385,14 @@ func (ndb newsDatabase) selectStoriesToArchive(ctx context.Context) ([]int, erro
385385

386386
// Select old stories regardless of score
387387
// High-score stories will be backed up to S3, low-score just marked for deletion
388+
// Keep batch size small to avoid memory exhaustion
388389
sqlStatement := `
389390
select distinct stories.id
390391
from stories
391392
join dataset on stories.id = dataset.id
392393
where stories.archived = 0
393394
and dataset.sampleTime <= strftime('%s', 'now') - 21*24*60*60
394-
limit 100
395+
limit 5
395396
`
396397

397398
// Check context before query

0 commit comments

Comments
 (0)