pytorch · clee2000 · Oct 6, 2025 · Oct 6, 2025 · Oct 6, 2025
diff --git a/.github/workflows/update_test_file_report.yml b/.github/workflows/update_test_file_report.yml
@@ -41,6 +41,8 @@ jobs:
           CLICKHOUSE_USERNAME: ${{ secrets.CLICKHOUSE_HUD_USER_USERNAME }}
           CLICKHOUSE_PASSWORD: ${{ secrets.CLICKHOUSE_HUD_USER_PASSWORD }}
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          AWS_INFRA_ALERTS_LAMBDA_URL: ${{ secrets.AWS_INFRA_ALERTS_LAMBDA_URL }}
+          TEST_REPORT_AWS_LAMBDA_TOKEN: ${{ secrets.TEST_REPORT_AWS_LAMBDA_TOKEN }}
         run: |
           python3 tools/torchci/test_insights/daily_regression.py
 
@@ -61,6 +63,8 @@ jobs:
 
       - name: Run notifier
         env:
-            GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          AWS_INFRA_ALERTS_LAMBDA_URL: ${{ secrets.AWS_INFRA_ALERTS_LAMBDA_URL }}
+          TEST_REPORT_AWS_LAMBDA_TOKEN: ${{ secrets.TEST_REPORT_AWS_LAMBDA_TOKEN }}
         run: |
           python3 tools/torchci/test_insights/weekly_notification.py
diff --git a/tools/torchci/test_insights/daily_regression.py b/tools/torchci/test_insights/daily_regression.py
@@ -1,17 +1,18 @@
+import datetime
 import json
 from collections import defaultdict
 from functools import lru_cache
 from typing import Any
 
 from torchci.test_insights.file_report_generator import FileReportGenerator
-from torchci.test_insights.weekly_notification import create_comment
+from torchci.test_insights.weekly_notification import send_to_aws_alerting_lambda
 
 
 FILE_REPORT_URL = "https://hud.pytorch.org/tests/fileReport"
 
-CONFIG = [
+CONFIG: list[dict[str, Any]] = [
     {
-        "team": "dev-infra",
+        "team": "pytorch-dev-infra",
         "condition": lambda _: True,
         "link": FILE_REPORT_URL,
     },
@@ -176,6 +177,35 @@ def _get_change(field: str, additional_processing) -> str:
             + f"\\# skipped change: {_get_change('skipped', additional_processing=lambda x: round(x, 2))}\n"
         )
 
+    def generate_alert_json(
+        self, team: str, report_url: str, regression_str: str
+    ) -> dict[str, Any]:
+        """Build the FIRING alert payload for a regression detected for `team`."""
+        now = datetime.datetime.now(datetime.timezone.utc).strftime(
+            "%Y-%m-%dT%H:%M:%S.%fZ"
+        )
+        # Regression details plus the notice that the alert should close at once.
+        body = (
+            f"{regression_str}\n"
+            "This issue is a notification and should close immediately after creation to avoid clutter."
+        )
+        return {
+            "schema_version": 1,
+            "source": "test-infra-test-file-reports",
+            "state": "FIRING",
+            "title": f"Regression Detected in Test Reports for {team}",
+            "description": body,
+            "summary": body,
+            "priority": "P2",
+            "occurred_at": now,
+            "teams": [team],
+            # The timestamp suffix keeps alarm_ids from different runs distinct.
+            "identity": {
+                "alarm_id": f"test-file-reports-daily-regression-{team}-{now}"
+            },
+            "links": {"dashboard_url": report_url},
+        }
+
     def determine_regressions(self) -> None:
         """
         Determine regressions in the test data based on the provided filter.
@@ -214,7 +244,6 @@ def _s3_to_json(bucket: str, key: str) -> Any:
             f"additional_info/weekly_file_report/data_{current_sha}.json.gz",
         )
 
-        regressions = []
         for team in CONFIG:
             change = self.gen_regression_for_team(
                 team=team,
@@ -223,19 +252,17 @@ def _s3_to_json(bucket: str, key: str) -> Any:
                 status_changes=status_changes,
             )
             if self.filter_thresholds(change):
-                regressions.append(
-                    {
-                        "team": team["team"],
-                        "regression": change,
-                        "link": team["link"],
-                    }
-                )
-                create_comment(
-                    {
-                        # "title": f"Regression detected for {team['team']}",
-                        "body": self.format_regression_string(team, change),
-                    }
+                print(f"Regression detected for team: {team['team']}")
+                print(self.format_regression_string(team, change))
+
+                alert = self.generate_alert_json(
+                    team=team["team"],
+                    report_url=team["link"],
+                    regression_str=self.format_regression_string(team, change),
                 )
+                send_to_aws_alerting_lambda(alert)
+            else:
+                print(f"No significant regression for team: {team['team']}")
 
 
 if __name__ == "__main__":

diff --git a/tools/torchci/test_insights/weekly_notification.py b/tools/torchci/test_insights/weekly_notification.py
@@ -1,3 +1,4 @@
+import datetime
 import json
 import os
 import urllib.request
@@ -8,52 +9,76 @@
 CONFIG: list[dict[str, Any]] = [
     {
         "team": "Inductor",
-        "condition": lambda info: info["labels"].includes("module: inductor"),
         "link": f"{HUD_URL_ROOT}?label=module:%20inductor",
     }
 ]
 
-TITLE = "New Test Report is Available for {module_name}"
-REASON = "A new test report has been generated for Team:{team}.  Please go to the following link to view the report: {report_url}"
-LABELS = ["area:alerting", "Pri:P3", "Source:custom"]
+TITLE = "New Test Report is Available for {team}"
+REASON = (
+    "A new test report has been generated for Team:{team}.  "
+    "Please go to the following link to view the report: {report_url}.  "
+    "This issue is a notification and should close immediately after creation to avoid clutter."
+)
 
 
-def generate_notification(
-    module_name: str, team: str, report_url: str
-) -> dict[str, str]:
-    title = TITLE.format(module_name=module_name)
-    reason = REASON.format(team=team, report_url=report_url)
+def generate_alert_json(team: str, report_url: str) -> dict[str, Any]:
+    """Build the FIRING alert payload announcing `team`'s new test report."""
+    # UTC timestamp, reused below for "occurred_at" and the alarm_id.
+    now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+    reason = REASON.format(team=team, report_url=report_url)
     return {
-        # "title": title,
-        "body": reason,
-        # "labels": LABELS + [f"Team:{team}"],
+        "schema_version": 1,
+        "source": "test-infra-test-file-reports",
+        "state": "FIRING",
+        "title": TITLE.format(team=team),
+        # The same rendered REASON text fills both free-text fields.
+        "description": reason,
+        "summary": reason,
+        "priority": "P2",
+        "occurred_at": now,
+        "teams": [team],
+        "identity": {"alarm_id": f"test-file-reports-weekly-notification-{team}-{now}"},
+        "links": {"dashboard_url": report_url},
     }
 
 
-# Using a specific issue for the time being while the alerting system is set up
-# to handle custom webhooks
-GITHUB_ISSUE_URL = (
-    "https://api.github.com/repos/pytorch/test-infra/issues/7296/comments"
-)
+def send_to_aws_alerting_lambda(alert: dict[str, Any]) -> None:
+    """POST `alert` to the alerting Lambda, then POST a RESOLVED copy of it."""
+
+    def _send(payload: dict[str, Any]) -> None:
+        headers = {
+            "Content-Type": "application/json",
+            "x-test-reports-normalized-signature": os.environ[
+                "TEST_REPORT_AWS_LAMBDA_TOKEN"
+            ],
+        }
+        req = urllib.request.Request(
+            os.environ["AWS_INFRA_ALERTS_LAMBDA_URL"],
+            data=json.dumps(payload).encode(),
+            headers=headers,
+            method="POST",
+        )
+        # Bounded wait so an unresponsive endpoint cannot stall the job forever.
+        with urllib.request.urlopen(req, timeout=60) as f:
+            print(f.read())
 
+    _send(alert)
 
-def create_comment(issue: dict[str, str]) -> dict[str, str]:
-    # Create issue in github repo
-    auth = {"Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}"}
-    data = json.dumps(issue).encode()
-    req = urllib.request.Request(
-        GITHUB_ISSUE_URL, data=data, headers=auth, method="POST"
+    # The current setup doesn't handle notification-style alerts well, so the
+    # alert is resolved right away; only top-level keys change, so a shallow
+    # copy is enough.
+    close_alert = {**alert, "state": "RESOLVED"}
+    close_alert["occurred_at"] = datetime.datetime.now(datetime.timezone.utc).strftime(
+        "%Y-%m-%dT%H:%M:%S.%fZ"
     )
-    with urllib.request.urlopen(req) as f:
-        response = f.read()
-        print(f"Created issue: {response}")
-    return issue
+    _send(close_alert)
 
 
 if __name__ == "__main__":
     for config in CONFIG:
-        issue = generate_notification(
-            module_name=config["team"], team=config["team"], report_url=config["link"]
+        alert = generate_alert_json(
+            team=config["team"],
+            report_url=config["link"],
         )
-
-        create_comment(issue)
+        send_to_aws_alerting_lambda(alert)