Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/update_test_file_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ jobs:
CLICKHOUSE_USERNAME: ${{ secrets.CLICKHOUSE_HUD_USER_USERNAME }}
CLICKHOUSE_PASSWORD: ${{ secrets.CLICKHOUSE_HUD_USER_PASSWORD }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_INFRA_ALERTS_LAMBDA_URL: ${{ secrets.AWS_INFRA_ALERTS_LAMBDA_URL }}
TEST_REPORT_AWS_LAMBDA_TOKEN: ${{ secrets.TEST_REPORT_AWS_LAMBDA_TOKEN }}
run: |
python3 tools/torchci/test_insights/daily_regression.py

Expand All @@ -61,6 +63,8 @@ jobs:

- name: Run notifier
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_INFRA_ALERTS_LAMBDA_URL: ${{ secrets.AWS_INFRA_ALERTS_LAMBDA_URL }}
TEST_REPORT_AWS_LAMBDA_TOKEN: ${{ secrets.TEST_REPORT_AWS_LAMBDA_TOKEN }}
run: |
python3 tools/torchci/test_insights/weekly_notification.py
59 changes: 43 additions & 16 deletions tools/torchci/test_insights/daily_regression.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import datetime
import json
from collections import defaultdict
from functools import lru_cache
from typing import Any

from torchci.test_insights.file_report_generator import FileReportGenerator
from torchci.test_insights.weekly_notification import create_comment
from torchci.test_insights.weekly_notification import send_to_aws_alerting_lambda


FILE_REPORT_URL = "https://hud.pytorch.org/tests/fileReport"

CONFIG = [
CONFIG: list[dict[str, Any]] = [
{
"team": "dev-infra",
"team": "pytorch-dev-infra",
"condition": lambda _: True,
"link": FILE_REPORT_URL,
},
Expand Down Expand Up @@ -176,6 +177,35 @@ def _get_change(field: str, additional_processing) -> str:
+ f"\\# skipped change: {_get_change('skipped', additional_processing=lambda x: round(x, 2))}\n"
)

def generate_alert_json(
    self, team: str, report_url: str, regression_str: str
) -> dict[str, Any]:
    """Build the alert payload announcing a test-report regression for a team.

    Args:
        team: Team identifier; used in the title, the ``teams`` list and the
            alarm id.
        report_url: Link stored under ``links.dashboard_url``.
        regression_str: Human-readable description of the regression.

    Returns:
        A dict in ``FIRING`` state whose ``description`` and ``summary`` are
        the regression text followed by a notice that the resulting issue is
        a notification and is expected to be closed right away.
    """
    title = f"Regression Detected in Test Reports for {team}"
    # UTC timestamp with microseconds. It is also embedded in the alarm id,
    # so every generated alert carries a distinct identity.
    now = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%dT%H:%M:%S.%fZ"
    )
    # Previously this text was assembled and then discarded; it is now what
    # the alert actually carries, matching the weekly notification payload.
    body = (
        f"{regression_str}\n"
        "This issue is a notification and should close immediately after creation to avoid clutter."
    )
    return {
        "schema_version": 1,
        "source": "test-infra-test-file-reports",
        "state": "FIRING",
        "title": title,
        "description": body,
        "summary": body,
        "priority": "P2",
        "occurred_at": now,
        "teams": [team],
        "identity": {
            "alarm_id": f"test-file-reports-daily-regression-{team}-{now}"
        },
        "links": {
            "dashboard_url": report_url,
        },
    }

def determine_regressions(self) -> None:
"""
Determine regressions in the test data based on the provided filter.
Expand Down Expand Up @@ -214,7 +244,6 @@ def _s3_to_json(bucket: str, key: str) -> Any:
f"additional_info/weekly_file_report/data_{current_sha}.json.gz",
)

regressions = []
for team in CONFIG:
change = self.gen_regression_for_team(
team=team,
Expand All @@ -223,19 +252,17 @@ def _s3_to_json(bucket: str, key: str) -> Any:
status_changes=status_changes,
)
if self.filter_thresholds(change):
regressions.append(
{
"team": team["team"],
"regression": change,
"link": team["link"],
}
)
create_comment(
{
# "title": f"Regression detected for {team['team']}",
"body": self.format_regression_string(team, change),
}
print(f"Regression detected for team: {team['team']}")
print(self.format_regression_string(team, change))

alert = self.generate_alert_json(
team=team["team"],
report_url=team["link"],
regression_str=self.format_regression_string(team, change),
)
send_to_aws_alerting_lambda(alert)
else:
print(f"No significant regression for team: {team['team']}")


if __name__ == "__main__":
Expand Down
87 changes: 56 additions & 31 deletions tools/torchci/test_insights/weekly_notification.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import json
import os
import urllib.request
Expand All @@ -8,52 +9,76 @@
# Per-team notification settings. "team" and "link" are read by the
# __main__ driver to build each alert; "condition" is a predicate over a
# record carrying a "labels" entry (not called in the visible part of this
# module -- presumably kept parallel to the daily-regression config).
CONFIG: list[dict[str, Any]] = [
    {
        "team": "Inductor",
        # Python membership test; the previous `.includes(...)` is a
        # JavaScript method and would raise AttributeError when called.
        "condition": lambda info: "module: inductor" in info["labels"],
        "link": f"{HUD_URL_ROOT}?label=module:%20inductor",
    }
]

TITLE = "New Test Report is Available for {module_name}"
REASON = "A new test report has been generated for Team:{team}. Please go to the following link to view the report: {report_url}"
LABELS = ["area:alerting", "Pri:P3", "Source:custom"]
TITLE = "New Test Report is Available for {team}"
REASON = (
"A new test report has been generated for Team:{team}. "
"Please go to the following link to view the report: {report_url}. "
"This issue is a notification and should close immediately after creation to avoid clutter."
)


def generate_notification(
module_name: str, team: str, report_url: str
) -> dict[str, str]:
title = TITLE.format(module_name=module_name)
reason = REASON.format(team=team, report_url=report_url)
def generate_alert_json(
    team: str,
    report_url: str,
) -> dict[str, Any]:
    """Assemble the weekly "new report available" alert payload for a team.

    Args:
        team: Team name substituted into the title, message and alarm id.
        report_url: Report link substituted into the message and stored as
            the dashboard URL.

    Returns:
        A ``FIRING`` alert dict; ``description`` and ``summary`` carry the
        same formatted message.
    """
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%dT%H:%M:%S.%fZ"
    )
    headline = TITLE.format(team=team)
    # One formatted message feeds both free-text fields.
    message = REASON.format(team=team, report_url=report_url)
    alarm_identity = {
        "alarm_id": f"test-file-reports-weekly-notification-{team}-{timestamp}"
    }
    dashboard_links = {"dashboard_url": report_url}
    return {
        "schema_version": 1,
        "source": "test-infra-test-file-reports",
        "state": "FIRING",
        "title": headline,
        "description": message,
        "summary": message,
        "priority": "P2",
        "occurred_at": timestamp,
        "teams": [team],
        "identity": alarm_identity,
        "links": dashboard_links,
    }


# Using a specific issue for the time being while the alerting system is set up
# to handle custom webhooks
GITHUB_ISSUE_URL = (
"https://api.github.com/repos/pytorch/test-infra/issues/7296/comments"
)
def send_to_aws_alerting_lambda(
    alert: dict[str, Any], *, timeout: float = 30.0
) -> None:
    """POST an alert to the alerting lambda, then immediately resolve it.

    The alert is sent as given, followed by a copy with ``state`` set to
    ``RESOLVED`` and a fresh ``occurred_at`` timestamp.

    Args:
        alert: JSON-serializable alert payload. It is not mutated.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled endpoint cannot hang the calling job indefinitely.

    Raises:
        KeyError: If ``AWS_INFRA_ALERTS_LAMBDA_URL`` or
            ``TEST_REPORT_AWS_LAMBDA_TOKEN`` is missing from the environment.
        urllib.error.URLError: If a request fails or times out.
    """
    # Read the configuration once, before any request is made, so a missing
    # variable fails fast rather than between the two sends.
    url = os.environ["AWS_INFRA_ALERTS_LAMBDA_URL"]
    headers = {
        "Content-Type": "application/json",
        "x-test-reports-normalized-signature": os.environ[
            "TEST_REPORT_AWS_LAMBDA_TOKEN"
        ],
    }

    def _send(payload: dict[str, Any]) -> None:
        req = urllib.request.Request(
            url,
            data=json.dumps(payload).encode(),
            headers=headers,
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout) as f:
            response = f.read()
            print(response)

    _send(alert)

    # Weird but the current setup doesn't handle notification style alerts very
    # well, so we close it manually immediately
    close_alert = alert.copy()  # shallow copy: only top-level keys are replaced
    close_alert["state"] = "RESOLVED"
    close_alert["occurred_at"] = datetime.datetime.now(
        datetime.timezone.utc
    ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    _send(close_alert)


if __name__ == "__main__":
    # Emit one notification alert per configured team.
    for entry in CONFIG:
        send_to_aws_alerting_lambda(
            generate_alert_json(
                team=entry["team"],
                report_url=entry["link"],
            )
        )