Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
2469a16
feat(async): Add daily task to archive stale Slack channels
rgibert May 22, 2026
c57a49f
fix(async): Handle archive failures and history API errors
rgibert May 22, 2026
331af6c
fix(async): Add rate-limit delay and guard against silent archive
rgibert May 22, 2026
4bc34ba
fix(async): Filter bot messages from history check and fix cleanup paths
rgibert May 22, 2026
eb6b6d6
fix(async): Filter only own bot messages from history check
rgibert May 22, 2026
04dfff0
Delete docs/superpowers/specs/2026-05-22-archive-stale-channels-desig…
rgibert May 22, 2026
8d232a9
Merge branch 'main' into rgibert/archive-stale-channels
rgibert May 25, 2026
2ecd031
Merge branch 'main' into rgibert/archive-stale-channels
rgibert May 25, 2026
f6272da
fix(slack): Prevent premature archival of active incident channels
rgibert May 25, 2026
2fae4e2
fix(slack): Abort archive run when own bot ID is unresolvable
rgibert May 25, 2026
b95e846
Merge branch 'main' into rgibert/archive-stale-channels
rgibert Jul 3, 2026
91f6c2c
Merge remote-tracking branch 'origin/main' into rgibert/archive-stale…
rgibert Jul 3, 2026
3968031
fix(slack): Propagate exceptions from paginated get_channel_history a…
rgibert Jul 3, 2026
e5db3cd
fix(migrations): Add merge migration for conflicting leaf nodes
rgibert Jul 3, 2026
877ad2b
fix(archive): Use IncidentStatus.CANCELED (single L) to match enum de…
rgibert Jul 3, 2026
033c52d
fix(slack): Add ok check to limited get_channel_history path
rgibert Jul 3, 2026
f36375b
perf(archive): Short-circuit channel history scan on first non-own me…
rgibert Jul 3, 2026
6f30d54
fix(slack): Add type annotation for page variable to satisfy mypy
rgibert Jul 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from django.db import migrations

from firetower.incidents.tasks import SCHEDULES


def create_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "archive_stale_channels"
Schedule.objects.get_or_create(
name=schedule_name, defaults=SCHEDULES[schedule_name]
)


def delete_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "archive_stale_channels"
Schedule.objects.filter(name=schedule_name).delete()


class Migration(migrations.Migration):
dependencies = [
("incidents", "0018_add_action_item_model"),
("django_q", "0018_task_success_index"),
]

operations = [
migrations.RunPython(create_schedule, delete_schedule),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from django.db import migrations


class Migration(migrations.Migration):
dependencies = [
("incidents", "0019_schedule_archive_stale_channels"),
("incidents", "0022_actionitem_last_nag"),
]

operations = []
11 changes: 11 additions & 0 deletions src/firetower/incidents/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

from firetower.incidents.models import Incident
from firetower.incidents.tasks.action_items import send_action_item_reminder
from firetower.incidents.tasks.archive import (
ARCHIVE_NOTICE,
archive_stale_channels,
)
from firetower.incidents.tasks.decorators import datadog_log
from firetower.incidents.tasks.statuspage import (
STATUSPAGE_FOLLOWUP_REMINDER_MESSAGE,
Expand All @@ -13,9 +17,11 @@
)

__all__ = [
"ARCHIVE_NOTICE",
"SCHEDULES",
"STATUSPAGE_FOLLOWUP_REMINDER_MESSAGE",
"STATUSPAGE_REMINDER_MESSAGE",
"archive_stale_channels",
"datadog_log",
"schedule_demo",
"send_action_item_reminder",
Expand All @@ -35,9 +41,14 @@
"send_action_item_reminder": {
"func": "firetower.incidents.tasks.send_action_item_reminder",
"schedule_type": Schedule.MINUTES,
"minutes": 30,
"repeats": -1,
},
"archive_stale_channels": {
"func": "firetower.incidents.tasks.archive_stale_channels",
"schedule_type": Schedule.DAILY,
"repeats": -1,

Check warning on line 50 in src/firetower/incidents/tasks/__init__.py

View check run for this annotation

@sentry/warden / warden: code-review

Daily archive task iterates unbounded queryset with 2-second sleep plus multiple Slack API calls per channel, risking worker starvation

The `archive_stale_channels` task in `archive.py` queries all `ExternalLink` records for terminal-status incidents (DONE/CANCELED) with no `.limit()`, batch size, or `.iterator()`, then processes each sequentially with a `time.sleep(2)` plus several blocking Slack API calls (get_channel_info, iter_channel_history, optional get_thread_replies, post_message, archive_channel). As resolved incidents accumulate, this scales linearly and unbounded — at ~1,000 terminal incidents the run exceeds 30+ minutes, potentially exceeding the django-q2 task timeout and holding a worker for the full window. The schedule is DAILY with repeats=-1, so runtime grows over time. Consider a per-run cap, chunked `.iterator()`, or excluding already-processed/archived channels.
},
}


Expand Down
136 changes: 136 additions & 0 deletions src/firetower/incidents/tasks/archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import logging
import time

from django_q.tasks import Schedule

from firetower.incidents.models import (
ExternalLink,
ExternalLinkType,
IncidentStatus,
)
from firetower.incidents.tasks.decorators import datadog_log
from firetower.integrations.services.slack import SlackService

logger = logging.getLogger(__name__)

ARCHIVE_NOTICE = (
"This channel is being archived by Firetower because all message history "
"has been removed by the workspace retention policy and there doesn't "
"appear to be any active discussions."
)

ARCHIVE_CHANNEL_DELAY_SECONDS = 2


@datadog_log
def archive_stale_channels() -> None:
slack = SlackService()
if not slack.client:
logger.error(
"Slack client not initialized -- disabling archive_stale_channels schedule"
)
Schedule.objects.filter(name="archive_stale_channels").update(repeats=0)
return

own_bot_id = slack.bot_id
if not own_bot_id:
logger.error("Could not determine own bot ID, aborting archive run")
return

terminal_statuses = [IncidentStatus.DONE, IncidentStatus.CANCELED]
links = ExternalLink.objects.filter(
type=ExternalLinkType.SLACK,
incident__status__in=terminal_statuses,
).select_related("incident")

scanned = 0

Check warning on line 46 in src/firetower/incidents/tasks/archive.py

View check run for this annotation

@sentry/warden / warden: django-perf-review

Unbounded queryset loads all stale Slack links into memory without iterator()

In `archive_stale_slack_channels`, the `links` queryset fetches all matching `ExternalLink` records at once before the loop. Since this is a daily batch job that sleeps 2 seconds per item, the full materialized result set stays resident in worker memory for the entire (potentially long) run. Use `.iterator(chunk_size=500)` to stream rows instead of loading the full result set into memory, as the `ExternalLink` table grows unbounded as incidents reach terminal status.
archived = 0
skipped = 0
errored = 0

for i, link in enumerate(links):
if i > 0:
time.sleep(ARCHIVE_CHANNEL_DELAY_SECONDS)

Check warning on line 53 in src/firetower/incidents/tasks/archive.py

View check run for this annotation

@sentry/warden / warden: code-review

[WLZ-XEE] Daily archive task iterates unbounded queryset with 2-second sleep plus multiple Slack API calls per channel, risking worker starvation (additional location)

The `archive_stale_channels` task in `archive.py` queries all `ExternalLink` records for terminal-status incidents (DONE/CANCELED) with no `.limit()`, batch size, or `.iterator()`, then processes each sequentially with a `time.sleep(2)` plus several blocking Slack API calls (get_channel_info, iter_channel_history, optional get_thread_replies, post_message, archive_channel). As resolved incidents accumulate, this scales linearly and unbounded — at ~1,000 terminal incidents the run exceeds 30+ minutes, potentially exceeding the django-q2 task timeout and holding a worker for the full window. The schedule is DAILY with repeats=-1, so runtime grows over time. Consider a per-run cap, chunked `.iterator()`, or excluding already-processed/archived channels.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Archiver exceeds worker timeout

Medium Severity

The daily archiver sleeps two seconds before each Slack link after the first, while Q_CLUSTER uses a 180-second task timeout. With dozens of terminal-incident Slack ExternalLink rows, sleep alone can exceed the limit, so django-q may terminate the run before every channel is scanned.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 3968031. Configure here.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Acknowledged. The 2s delay is intentional to stay under Slack's Tier 3 rate limit (~50 req/min). The task is idempotent -- if it times out, the next daily run picks up where it left off since already-archived channels are skipped via is_archived check. If channel volume grows enough to make this a real problem, we can batch into smaller chunks or move to an async queue, but that's premature for the current scale.

— Claude Code


scanned += 1
channel_id = slack.parse_channel_id_from_url(link.url)
if not channel_id:
skipped += 1
continue

try:
info = slack.get_channel_info(channel_id)
if info is None:
logger.warning(
f"Could not fetch info for channel {channel_id} "
f"(incident {link.incident.incident_number}), skipping"
)
skipped += 1
continue

if info.get("is_archived"):
skipped += 1
continue

Comment thread
github-actions[bot] marked this conversation as resolved.
has_activity = False
own_messages: list[dict] = []
for page in slack.iter_channel_history(channel_id):
for msg in page:
if msg.get("bot_id") != own_bot_id:
has_activity = True
break
own_messages.append(msg)
if has_activity:
break
if has_activity:
skipped += 1
continue

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

System messages block stale archival

Medium Severity

The stale-channel check treats any history item whose bot_id is not Firetower’s as blocking archival. Slack system events (e.g. channel_join) usually have no bot_id, so they are counted as foreign activity and the channel is skipped even when only Firetower bot posts remain besides those events.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 033c52d. Configure here.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentional. System messages (channel_join, channel_purpose, etc.) lack bot_id and are treated as non-own activity. This is the safer direction -- we'd rather skip a channel than wrongly archive one with real activity obscured by system events. If this causes too many false-skips in practice, we can add a subtype filter in a follow-up.

— Claude Code


for msg in own_messages:
if msg.get("reply_count", 0) > 0:
replies = slack.get_thread_replies(channel_id, msg["ts"])
if replies:
has_activity = True
break

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thread bot replies ignored

Medium Severity

When deciding if a channel is stale, top-level history treats any message whose bot_id is not Firetower’s as activity, but thread follow-up uses get_thread_replies, which drops replies that have a bot_id. A channel whose visible history is only Firetower bot posts can still be archived even when another bot has active thread replies.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit f36375b. Configure here.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentional asymmetry. The top-level check is conservative: any non-Firetower message (including other bots) counts as activity to avoid wrongful archival. The thread check uses get_thread_replies which filters to human-only replies -- its purpose is to detect human discussion in threads, not bot-to-bot chatter. A channel with only Firetower top-level messages and only other-bot thread replies has no human activity and is a valid archive candidate.

— Claude Code

if has_activity:
skipped += 1
continue
Comment thread
rgibert marked this conversation as resolved.
Comment thread
rgibert marked this conversation as resolved.

notice_ts = slack.post_message(channel_id, ARCHIVE_NOTICE)
Comment thread
github-actions[bot] marked this conversation as resolved.
if not notice_ts:
logger.error(
f"Failed to post archive notice to channel {channel_id} "
f"(incident {link.incident.incident_number}), skipping archive"
)
errored += 1
continue

try:
if not slack.archive_channel(channel_id):
raise RuntimeError(
f"archive_channel returned False for {channel_id}"
)
archived += 1
Comment thread
cursor[bot] marked this conversation as resolved.
logger.info(
f"Archived stale channel {channel_id} "
f"(incident {link.incident.incident_number})"
)
except Exception:
errored += 1
logger.exception(
f"Failed to archive channel {channel_id} "
f"(incident {link.incident.incident_number}), "
Comment on lines +118 to +122

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: A SlackApiError during channel archival is logged twice: once in the archive_channel function and again in the calling task, leading to redundant log entries.
Severity: LOW

Suggested Fix

Remove the redundant logging. Either the archive_channel function should not log the error and only return False, letting the caller handle all logging, or the caller should not log the exception again after archive_channel has already logged it.

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent. Verify if this is a real issue. If it is, propose a fix; if not, explain why it's
not valid.

Location: src/firetower/incidents/tasks/archive.py#L118-L122

Potential issue: When the `slack.archive_channel` function fails with a `SlackApiError`,
it logs the error and returns `False`. The calling code in `archive.py` detects this
`False` return value, raises a `RuntimeError`, and then immediately catches that
exception in an outer block. This `except Exception` block then logs the exception a
second time, resulting in redundant logging for the same failure event. While this does
not affect functionality, it creates unnecessary noise in the logs.

Also affects:

  • src/firetower/integrations/services/slack.py:580~590

f"deleting notice"
)
slack.delete_message(channel_id, notice_ts)
except Exception:
errored += 1
logger.exception(
f"Error processing channel {channel_id} "
f"(incident {link.incident.incident_number})"
)

logger.info(
f"archive_stale_channels complete: "
f"scanned={scanned} archived={archived} skipped={skipped} errored={errored}"
)
Loading
Loading