From a67a9ce15feb858d78023b583400a794496c5f50 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 6 Oct 2025 14:36:29 -0700 Subject: [PATCH 1/3] tc --- .github/workflows/update_test_file_report.yml | 9 ++- .../torchci/test_insights/daily_regression.py | 51 +++++++++----- .../test_insights/weekly_notification.py | 66 +++++++++++-------- 3 files changed, 81 insertions(+), 45 deletions(-) diff --git a/.github/workflows/update_test_file_report.yml b/.github/workflows/update_test_file_report.yml index 6052de7821..0aee20e8ae 100644 --- a/.github/workflows/update_test_file_report.yml +++ b/.github/workflows/update_test_file_report.yml @@ -7,11 +7,13 @@ on: # Run every tuesday at 3am UTC - cron: '0 3 * * 2' workflow_dispatch: + pull_request: jobs: file-report: if: > (github.event_name == 'schedule' && github.event.schedule.cron == '0 2 * * *') || + (github.event_name == 'pull_request') || (github.event_name == 'workflow_dispatch') permissions: issues: write @@ -41,12 +43,15 @@ jobs: CLICKHOUSE_USERNAME: ${{ secrets.CLICKHOUSE_HUD_USER_USERNAME }} CLICKHOUSE_PASSWORD: ${{ secrets.CLICKHOUSE_HUD_USER_PASSWORD }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AWS_INFRA_ALERTS_LAMBDA_URL: ${{ secrets.AWS_INFRA_ALERTS_LAMBDA_URL }} + TEST_REPORT_AWS_LAMBDA_TOKEN: ${{ secrets.TEST_REPORT_AWS_LAMBDA_TOKEN }} run: | python3 tools/torchci/test_insights/daily_regression.py weekly-file-report-notification: if: > (github.event_name == 'schedule' && github.event.schedule.cron == '0 3 * * 2') || + (github.event_name == 'pull_request') || (github.event_name == 'workflow_dispatch') permissions: issues: write @@ -61,6 +66,8 @@ jobs: - name: Run notifier env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AWS_INFRA_ALERTS_LAMBDA_URL: ${{ secrets.AWS_INFRA_ALERTS_LAMBDA_URL }} + TEST_REPORT_AWS_LAMBDA_TOKEN: ${{ secrets.TEST_REPORT_AWS_LAMBDA_TOKEN }} run: | python3 tools/torchci/test_insights/weekly_notification.py diff --git 
a/tools/torchci/test_insights/daily_regression.py b/tools/torchci/test_insights/daily_regression.py index cea6d8bea0..de8d35f54f 100644 --- a/tools/torchci/test_insights/daily_regression.py +++ b/tools/torchci/test_insights/daily_regression.py @@ -1,15 +1,16 @@ +import datetime import json from collections import defaultdict from functools import lru_cache from typing import Any from torchci.test_insights.file_report_generator import FileReportGenerator -from torchci.test_insights.weekly_notification import create_comment +from torchci.test_insights.weekly_notification import send_to_aws_alerting_lambda FILE_REPORT_URL = "https://hud.pytorch.org/tests/fileReport" -CONFIG = [ +CONFIG: list[dict[str, Any]] = [ { "team": "dev-infra", "condition": lambda _: True, @@ -176,6 +177,29 @@ def _get_change(field: str, additional_processing) -> str: + f"\\# skipped change: {_get_change('skipped', additional_processing=lambda x: round(x, 2))}\n" ) + def generate_alert_json( + self, team: str, report_url: str, regression_str: str + ) -> dict[str, Any]: + title = f"Regression Detected in Test Reports for {team}" + now = datetime.datetime.now(datetime.timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%S.%fZ" + ) + return { + "schema_version": 1, + "source": "test-infra-test-file-reports", + "state": "FIRING", + "title": title, + "description": regression_str, + "summary": regression_str, + "priority": "P2", + "occurred_at": now, + "teams": [team], + "identity": {"alarm_id": f"test-file-reports-daily-regression-{team}"}, + "links": { + "dashboard_url": report_url, + }, + } + def determine_regressions(self) -> None: """ Determine regressions in the test data based on the provided filter. 
@@ -214,7 +238,6 @@ def _s3_to_json(bucket: str, key: str) -> Any: f"additional_info/weekly_file_report/data_{current_sha}.json.gz", ) - regressions = [] for team in CONFIG: change = self.gen_regression_for_team( team=team, @@ -223,19 +246,17 @@ def _s3_to_json(bucket: str, key: str) -> Any: status_changes=status_changes, ) if self.filter_thresholds(change): - regressions.append( - { - "team": team["team"], - "regression": change, - "link": team["link"], - } - ) - create_comment( - { - # "title": f"Regression detected for {team['team']}", - "body": self.format_regression_string(team, change), - } + print(f"Regression detected for team: {team['team']}") + print(self.format_regression_string(team, change)) + + alert = self.generate_alert_json( + team=team["team"], + report_url=team["link"], + regression_str=self.format_regression_string(team, change), ) + send_to_aws_alerting_lambda(alert) + else: + print(f"No significant regression for team: {team['team']}") if __name__ == "__main__": diff --git a/tools/torchci/test_insights/weekly_notification.py b/tools/torchci/test_insights/weekly_notification.py index aeafd88b09..66b8a010be 100644 --- a/tools/torchci/test_insights/weekly_notification.py +++ b/tools/torchci/test_insights/weekly_notification.py @@ -1,3 +1,4 @@ +import datetime import json import os import urllib.request @@ -8,52 +9,59 @@ CONFIG: list[dict[str, Any]] = [ { "team": "Inductor", - "condition": lambda info: info["labels"].includes("module: inductor"), "link": f"{HUD_URL_ROOT}?label=module:%20inductor", } ] -TITLE = "New Test Report is Available for {module_name}" +TITLE = "New Test Report is Available for {team}" REASON = "A new test report has been generated for Team:{team}. 
Please go to the following link to view the report: {report_url}" -LABELS = ["area:alerting", "Pri:P3", "Source:custom"] -def generate_notification( - module_name: str, team: str, report_url: str -) -> dict[str, str]: - title = TITLE.format(module_name=module_name) - reason = REASON.format(team=team, report_url=report_url) +def generate_alert_json( + team: str, + report_url: str, +) -> dict[str, Any]: + now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ") return { - # "title": title, - "body": reason, - # "labels": LABELS + [f"Team:{team}"], + "schema_version": 1, + "source": "test-infra-test-file-reports", + "state": "FIRING", + "title": TITLE.format(team=team), + "description": REASON.format(team=team, report_url=report_url), + "summary": REASON.format(team=team, report_url=report_url), + "priority": "P2", + "occurred_at": now, + "teams": [team], + "identity": {"alarm_id": f"test-file-reports-weekly-notification-{team}"}, + "links": { + "dashboard_url": report_url, + }, } -# Using a specific issue for the time being while the alerting system is set up -# to handle custom webhooks -GITHUB_ISSUE_URL = ( - "https://api.github.com/repos/pytorch/test-infra/issues/7296/comments" -) - - -def create_comment(issue: dict[str, str]) -> dict[str, str]: - # Create issue in github repo - auth = {"Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}"} - data = json.dumps(issue).encode() +def send_to_aws_alerting_lambda(alert: dict[str, Any]) -> None: + headers = { + "Content-Type": "application/json", + "x-test-reports-normalized-signature": os.environ[ + "TEST_REPORT_AWS_LAMBDA_TOKEN" + ], + } + data = json.dumps(alert).encode() req = urllib.request.Request( - GITHUB_ISSUE_URL, data=data, headers=auth, method="POST" + os.environ["AWS_INFRA_ALERTS_LAMBDA_URL"], + data=data, + headers=headers, + method="POST", ) with urllib.request.urlopen(req) as f: response = f.read() - print(f"Created issue: {response}") - return issue + print(response) if 
__name__ == "__main__": for config in CONFIG: - issue = generate_notification( - module_name=config["team"], team=config["team"], report_url=config["link"] + alert = generate_alert_json( + team=config["team"], + report_url=config["link"], ) - - create_comment(issue) + send_to_aws_alerting_lambda(alert) From 53fd37962cd875766343c1e037e80349e4309291 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 6 Oct 2025 15:01:59 -0700 Subject: [PATCH 2/3] tc --- .../torchci/test_insights/daily_regression.py | 10 +++- .../test_insights/weekly_notification.py | 51 ++++++++++++------- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/tools/torchci/test_insights/daily_regression.py b/tools/torchci/test_insights/daily_regression.py index de8d35f54f..bd02ca8c01 100644 --- a/tools/torchci/test_insights/daily_regression.py +++ b/tools/torchci/test_insights/daily_regression.py @@ -12,7 +12,7 @@ CONFIG: list[dict[str, Any]] = [ { - "team": "dev-infra", + "team": "pytorch-dev-infra", "condition": lambda _: True, "link": FILE_REPORT_URL, }, @@ -184,6 +184,10 @@ def generate_alert_json( now = datetime.datetime.now(datetime.timezone.utc).strftime( "%Y-%m-%dT%H:%M:%S.%fZ" ) + body = ( + f"{regression_str}\n" + "This issue is a notification and should close immediately after creation to avoid clutter." 
+ ) return { "schema_version": 1, "source": "test-infra-test-file-reports", @@ -194,7 +198,9 @@ def generate_alert_json( "priority": "P2", "occurred_at": now, "teams": [team], - "identity": {"alarm_id": f"test-file-reports-daily-regression-{team}"}, + "identity": { + "alarm_id": f"test-file-reports-daily-regression-{team}-{now}" + }, "links": { "dashboard_url": report_url, }, diff --git a/tools/torchci/test_insights/weekly_notification.py b/tools/torchci/test_insights/weekly_notification.py index 66b8a010be..2ed63b933c 100644 --- a/tools/torchci/test_insights/weekly_notification.py +++ b/tools/torchci/test_insights/weekly_notification.py @@ -14,7 +14,11 @@ ] TITLE = "New Test Report is Available for {team}" -REASON = "A new test report has been generated for Team:{team}. Please go to the following link to view the report: {report_url}" +REASON = ( + "A new test report has been generated for Team:{team}. " + "Please go to the following link to view the report: {report_url}. " + "This issue is a notification and should close immediately after creation to avoid clutter." 
+) def generate_alert_json( @@ -32,7 +36,7 @@ def generate_alert_json( "priority": "P2", "occurred_at": now, "teams": [team], - "identity": {"alarm_id": f"test-file-reports-weekly-notification-{team}"}, + "identity": {"alarm_id": f"test-file-reports-weekly-notification-{team}-{now}"}, "links": { "dashboard_url": report_url, }, @@ -40,22 +44,35 @@ def generate_alert_json( def send_to_aws_alerting_lambda(alert: dict[str, Any]) -> None: - headers = { - "Content-Type": "application/json", - "x-test-reports-normalized-signature": os.environ[ - "TEST_REPORT_AWS_LAMBDA_TOKEN" - ], - } - data = json.dumps(alert).encode() - req = urllib.request.Request( - os.environ["AWS_INFRA_ALERTS_LAMBDA_URL"], - data=data, - headers=headers, - method="POST", + def _send(alert: dict[str, Any]) -> None: + data = json.dumps(alert).encode() + headers = { + "Content-Type": "application/json", + "x-test-reports-normalized-signature": os.environ[ + "TEST_REPORT_AWS_LAMBDA_TOKEN" + ], + } + req = urllib.request.Request( + os.environ["AWS_INFRA_ALERTS_LAMBDA_URL"], + data=data, + headers=headers, + method="POST", + ) + with urllib.request.urlopen(req) as f: + response = f.read() + print(response) + return + + _send(alert) + + # Weird but the current setup doesn't handle notification style alerts very + # well, so we close it manually immediately + close_alert = alert.copy() + close_alert["state"] = "RESOLVED" + close_alert["occurred_at"] = datetime.datetime.now(datetime.timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%S.%fZ" ) - with urllib.request.urlopen(req) as f: - response = f.read() - print(response) + _send(close_alert) if __name__ == "__main__": From c9ed63658f690f8e60079b0b1f243aea08ae7be6 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 6 Oct 2025 15:03:53 -0700 Subject: [PATCH 3/3] remove pull request trigger --- .github/workflows/update_test_file_report.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/update_test_file_report.yml 
b/.github/workflows/update_test_file_report.yml index 0aee20e8ae..b3c999af24 100644 --- a/.github/workflows/update_test_file_report.yml +++ b/.github/workflows/update_test_file_report.yml @@ -7,13 +7,11 @@ on: # Run every tuesday at 3am UTC - cron: '0 3 * * 2' workflow_dispatch: - pull_request: jobs: file-report: if: > (github.event_name == 'schedule' && github.event.schedule.cron == '0 2 * * *') || - (github.event_name == 'pull_request') || (github.event_name == 'workflow_dispatch') permissions: issues: write @@ -51,7 +49,6 @@ jobs: weekly-file-report-notification: if: > (github.event_name == 'schedule' && github.event.schedule.cron == '0 3 * * 2') || - (github.event_name == 'pull_request') || (github.event_name == 'workflow_dispatch') permissions: issues: write