Skip to content

Commit 09967d3

Browse files
authored
[testing] file report: comment notification generator (#7303)
AFAICT the current alerting system isn't set up for custom webhooks, so it's just writing to an issue in test-infra for the time being. Every day, check if there are regressions >10% for certain configs. If there is one, make a comment on the issue (to be replaced by webhook -> alerting infra after that gets set up). Every week, make a simple comment with a link to the file report on HUD for certain configs.
1 parent c3c1dd9 commit 09967d3

File tree

3 files changed

+340
-0
lines changed

3 files changed

+340
-0
lines changed

.github/workflows/update_test_file_report.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,17 @@ on:
44
schedule:
55
# Run every day at 2am UTC
66
- cron: '0 2 * * *'
7+
# Run every Tuesday at 3am UTC
8+
- cron: '0 3 * * 2'
79
workflow_dispatch:
810

911
jobs:
1012
file-report:
13+
if: >
14+
(github.event_name == 'schedule' && github.event.schedule.cron == '0 2 * * *') ||
15+
(github.event_name == 'workflow_dispatch')
16+
permissions:
17+
issues: write
1118
runs-on: linux.2xlarge
1219
steps:
1320
- name: Checkout repository
@@ -18,6 +25,7 @@ jobs:
1825
cd tools/torchci && python3 -mpip install -e .
1926
2027
- name: Run file report generator
28+
if: false
2129
env:
2230
CLICKHOUSE_ENDPOINT: ${{ secrets.CLICKHOUSE_HUD_USER_URL }}
2331
CLICKHOUSE_USERNAME: ${{ secrets.CLICKHOUSE_HUD_USER_USERNAME }}
@@ -26,3 +34,33 @@ jobs:
2634
YESTERDAY=$(date -u -d "yesterday" +%Y-%m-%d)
2735
python3 tools/torchci/test_insights/file_report_generator.py \
2836
--add-dates "$YESTERDAY"
37+
38+
- name: Generate regression notification
39+
env:
40+
CLICKHOUSE_ENDPOINT: ${{ secrets.CLICKHOUSE_HUD_USER_URL }}
41+
CLICKHOUSE_USERNAME: ${{ secrets.CLICKHOUSE_HUD_USER_USERNAME }}
42+
CLICKHOUSE_PASSWORD: ${{ secrets.CLICKHOUSE_HUD_USER_PASSWORD }}
43+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
44+
run: |
45+
python3 tools/torchci/test_insights/daily_regression.py
46+
47+
weekly-file-report-notification:
48+
if: >
49+
(github.event_name == 'schedule' && github.event.schedule.cron == '0 3 * * 2') ||
50+
(github.event_name == 'workflow_dispatch')
51+
permissions:
52+
issues: write
53+
runs-on: linux.2xlarge
54+
steps:
55+
- name: Checkout repository
56+
uses: actions/checkout@v3
57+
- name: Install dependencies
58+
run: |
59+
python3 -mpip install -r tools/torchci/requirements.txt
60+
cd tools/torchci && python3 -mpip install -e .
61+
62+
- name: Run notifier
63+
env:
64+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
65+
run: |
66+
python3 tools/torchci/test_insights/weekly_notification.py
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
import json
2+
from collections import defaultdict
3+
from functools import lru_cache
4+
from typing import Any
5+
6+
from torchci.test_insights.file_report_generator import FileReportGenerator
7+
from torchci.test_insights.weekly_notification import create_comment
8+
9+
10+
# Base HUD page for browsing test file reports.
FILE_REPORT_URL = "https://hud.pytorch.org/tests/fileReport"

# Per-team regression config: `condition` is a predicate over a report row
# (status change or invoking-file info dict) selecting the rows that belong
# to the team; `link` is the filtered HUD view included in the notification.
CONFIG: list[dict[str, Any]] = [
    {
        # dev-infra watches everything.
        "team": "dev-infra",
        "condition": lambda _: True,
        "link": FILE_REPORT_URL,
    },
    {
        # optim rows are identified by the "module: optimizer" label.
        "team": "optim",
        "condition": lambda info: "module: optimizer" in info.get("labels", []),
        "link": f"{FILE_REPORT_URL}?label=module:%20optimizer",
    },
    {
        # mps rows are identified by "mac" appearing in the short job name.
        "team": "mps",
        "condition": lambda info: "mac" in info.get("short_job_name", ""),
        "link": f"{FILE_REPORT_URL}?job=mac&jobRegex=true",
    },
]
29+
30+
31+
class RegressionNotification:
    """Detect day-over-day regressions in test file reports and post a GitHub
    comment for each team whose metrics moved past a threshold.

    Report data is read from the weekly file report artifacts in S3 via
    FileReportGenerator; notifications are delivered through
    weekly_notification.create_comment.
    """

    # Minimum relative change (10%) before a metric counts as a regression.
    PERCENT_THRESHOLD = 0.1

    def __init__(self) -> None:
        # dry_run=True: the generator is only used for reads here.
        self.file_report_generator = FileReportGenerator(dry_run=True)

    def _previous_regression_sha(self) -> str:
        """Return the sha the last regression report was generated against.

        Cached per instance.  The original used ``@lru_cache`` on the method,
        which keys the cache on ``self`` and keeps the instance alive for the
        cache's lifetime (ruff B019); a plain instance attribute avoids that.
        """
        if not hasattr(self, "_prev_sha_cache"):
            text = self.file_report_generator._fetch_from_s3(
                "ossci-raw-job-status",
                "additional_info/weekly_file_report/regression_metadata.json.gz",
            )
            # NOTE(review): assumes the metadata object holds a single JSON
            # string (the sha) — confirm against whatever writes this key.
            self._prev_sha_cache = json.loads(text)
        return self._prev_sha_cache

    def gen_regression_for_team(
        self,
        team: dict[str, Any],
        prev_invoking_file_info: list[dict[str, Any]],
        curr_invoking_file_info: list[dict[str, Any]],
        status_changes: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Aggregate status changes and file-info deltas for one team.

        ``team["condition"]`` is a predicate selecting the rows that belong
        to the team.  Returns a dict of the shape::

            {
                "status_changes": {status: count, ...},
                "invoking_file_info": {
                    metric: {"previous": x, "current": y}, ...
                },
            }
        """
        condition = team["condition"]

        # Tally how many status changes of each kind apply to this team.
        aggregated_status_changes: defaultdict[str, int] = defaultdict(int)
        for change in status_changes:
            if condition(change):
                aggregated_status_changes[change["status"]] += 1

        # Restrict the current rows to the team, then restrict the previous
        # rows to the same (job, file) keys so the two sides are comparable.
        relevant_curr = [info for info in curr_invoking_file_info if condition(info)]
        relevant_keys = {
            (info["short_job_name"], info["file"]) for info in relevant_curr
        }
        relevant_prev = [
            info
            for info in prev_invoking_file_info
            if (info["short_job_name"], info["file"]) in relevant_keys
        ]

        metrics = ("count", "cost", "time", "skipped")

        def _totals(rows: list[dict[str, Any]]) -> dict[str, Any]:
            # Sum each tracked metric across the selected rows.
            # (The original took an extra `field` argument it never used.)
            return {m: sum(row[m] for row in rows) for m in metrics}

        prev_totals = _totals(relevant_prev)
        curr_totals = _totals(relevant_curr)

        invoking_file_info_diff = {
            m: {"previous": prev_totals[m], "current": curr_totals[m]}
            for m in metrics
        }

        return {
            "status_changes": aggregated_status_changes,
            "invoking_file_info": invoking_file_info_diff,
        }

    def filter_thresholds(self, regression: dict[str, Any]) -> bool:
        """Return True if any metric in ``regression`` crosses the threshold.

        Status-change counts are compared against the previous total of a
        related metric (added/removed vs. test count, skip transitions vs.
        skipped count); metric deltas are compared against their own previous
        value.
        """

        def _exceeds_threshold(value: float, total: float) -> bool:
            # A zero baseline cannot express a percentage change; treat it
            # as "no regression" rather than dividing by zero.
            if total == 0:
                return False
            return (value / total) >= self.PERCENT_THRESHOLD

        def _status_change_exceeds_threshold(field: str, total_field: str) -> bool:
            return _exceeds_threshold(
                regression["status_changes"].get(field, 0),
                regression["invoking_file_info"][total_field]["previous"],
            )

        def _diff_exceeds_threshold(field: str) -> bool:
            info = regression["invoking_file_info"][field]
            return _exceeds_threshold(
                abs(info["current"] - info["previous"]), info["previous"]
            )

        return (
            _status_change_exceeds_threshold("removed", "count")
            or _status_change_exceeds_threshold("added", "count")
            or _status_change_exceeds_threshold("started_skipping", "skipped")
            or _status_change_exceeds_threshold("stopped_skipping", "skipped")
            or any(
                _diff_exceeds_threshold(key)
                for key in ("cost", "count", "skipped", "time")
            )
        )

    def format_regression_string(
        self, team: dict[str, Any], regression: dict[str, Any]
    ) -> str:
        """Render a human-readable summary of ``regression`` for ``team``."""

        def _get_change(field: str, additional_processing) -> str:
            # additional_processing rounds/rescales values for display only;
            # the percent change is computed on the raw numbers.
            current = regression["invoking_file_info"][field]["current"]
            previous = regression["invoking_file_info"][field]["previous"]
            change = current - previous
            percent_change = (change / previous) * 100 if previous != 0 else 0
            percent_change = round(percent_change, 2)

            current = additional_processing(current)
            previous = additional_processing(previous)
            change = additional_processing(change)
            return f"{change} ({percent_change}%), from {previous} to {current}"

        return (
            f"Regression detected for Team:{team['team']}:\n"
            + f"Link: {team['link']}\n"
            + f"New tests: {regression['status_changes'].get('added', 0)}\n"
            + f"Removed tests: {regression['status_changes'].get('removed', 0)}\n"
            + f"Started skipping: {regression['status_changes'].get('started_skipping', 0)}\n"
            + f"Stopped skipping: {regression['status_changes'].get('stopped_skipping', 0)}\n"
            + f"Cost ($) change: {_get_change('cost', additional_processing=lambda x: round(x, 2))}\n"
            + f"Time (min) change: {_get_change('time', additional_processing=lambda x: round(x / 60, 2))}\n"
            + f"Test count change: {_get_change('count', additional_processing=lambda x: round(x, 2))}\n"
            + f"\\# skipped change: {_get_change('skipped', additional_processing=lambda x: round(x, 2))}\n"
        )

    def determine_regressions(self) -> None:
        """Compare the newest report against the previous one and post a
        comment for every team whose metrics regressed past the threshold.

        Does nothing if no new report has landed since the last run.
        (Original docstring claimed a list was returned; this returns None.)
        """
        previous_regression_sha = self._previous_regression_sha()
        metadata = self.file_report_generator.fetch_existing_metadata()
        curr_entry = metadata[-1]

        current_sha = curr_entry["sha"]
        if previous_regression_sha == current_sha:
            print(f"No new reports since last report: {previous_regression_sha}")
            return
        prev_sha = metadata[-2]["sha"]

        status_changes = self.file_report_generator.get_status_changes(
            sha1=prev_sha,
            sha2=current_sha,
            sha2_push_date=curr_entry["push_date"],
        )

        def _s3_to_json(bucket: str, key: str) -> list[dict[str, Any]]:
            # The artifacts are JSON Lines: one JSON object per line.
            text = self.file_report_generator._fetch_from_s3(bucket, key)
            return [json.loads(line) for line in text.splitlines()]

        previous_file_info = _s3_to_json(
            "ossci-raw-job-status",
            f"additional_info/weekly_file_report/data_{prev_sha}.json.gz",
        )
        current_file_info = _s3_to_json(
            "ossci-raw-job-status",
            f"additional_info/weekly_file_report/data_{current_sha}.json.gz",
        )

        regressions = []
        for team in CONFIG:
            change = self.gen_regression_for_team(
                team=team,
                prev_invoking_file_info=previous_file_info,
                curr_invoking_file_info=current_file_info,
                status_changes=status_changes,
            )
            if self.filter_thresholds(change):
                # Collected for debugging/future use; only the comment is sent.
                regressions.append(
                    {
                        "team": team["team"],
                        "regression": change,
                        "link": team["link"],
                    }
                )
                create_comment(
                    {
                        # "title": f"Regression detected for {team['team']}",
                        "body": self.format_regression_string(team, change),
                    }
                )
        # TODO(review): regression_metadata.json.gz is never advanced to
        # current_sha here, so the next run re-reports the same diff —
        # confirm whether another job writes that metadata.
239+
240+
241+
if __name__ == "__main__":
    # Entry point for the daily CI job: check the newest report and notify.
    RegressionNotification().determine_regressions()
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import json
2+
import os
3+
import urllib.request
4+
from typing import Any
5+
6+
7+
# Base HUD page for browsing test file reports.
HUD_URL_ROOT = "https://hud.pytorch.org/tests/fileReport"

# Each entry maps a team to a predicate selecting its file-report rows and
# the filtered HUD link included in the weekly notification.
CONFIG: list[dict[str, Any]] = [
    {
        "team": "Inductor",
        # Fixed: the original used `info["labels"].includes(...)`, which is
        # the JavaScript Array API and raises AttributeError on a Python
        # list — membership tests use the `in` operator.
        "condition": lambda info: "module: inductor" in info["labels"],
        "link": f"{HUD_URL_ROOT}?label=module:%20inductor",
    }
]

# Templates for the notification payload.
TITLE = "New Test Report is Available for {module_name}"
REASON = "A new test report has been generated for Team:{team}. Please go to the following link to view the report: {report_url}"
LABELS = ["area:alerting", "Pri:P3", "Source:custom"]


def generate_notification(
    module_name: str, team: str, report_url: str
) -> dict[str, str]:
    """Build the GitHub comment payload announcing a new file report.

    Only "body" is populated for now: the notification is posted as a
    comment on a fixed tracking issue, so "title" and "labels" stay
    disabled until the alerting infra supports custom webhooks.
    """
    title = TITLE.format(module_name=module_name)  # reserved for issue creation
    reason = REASON.format(team=team, report_url=report_url)
    return {
        # "title": title,
        "body": reason,
        # "labels": LABELS + [f"Team:{team}"],
    }
31+
32+
33+
# Comments are posted to one fixed tracking issue for now; once the alerting
# system supports custom webhooks, this should become a real alert destination.
GITHUB_ISSUE_URL = (
    "https://api.github.com/repos/pytorch/test-infra/issues/7296/comments"
)
38+
39+
40+
def create_comment(issue: dict[str, str]) -> dict[str, str]:
    """POST ``issue`` as a comment on the fixed tracking issue.

    ``issue`` is the JSON payload for the GitHub "create a comment" API;
    only its "body" field is used by that endpoint.  Authenticates with the
    GITHUB_TOKEN environment variable.  Returns the payload unchanged.
    Raises urllib.error.HTTPError/URLError on failure.
    """
    headers = {"Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}"}
    data = json.dumps(issue).encode()
    req = urllib.request.Request(
        GITHUB_ISSUE_URL, data=data, headers=headers, method="POST"
    )
    # timeout added: urlopen with no timeout can hang a CI job indefinitely.
    with urllib.request.urlopen(req, timeout=30) as f:
        response = f.read()
    # Fixed log message: this creates a comment, not an issue.
    print(f"Created comment: {response}")
    return issue
51+
52+
53+
if __name__ == "__main__":
    # Post one weekly notification comment per configured team.
    for team_config in CONFIG:
        payload = generate_notification(
            module_name=team_config["team"],
            team=team_config["team"],
            report_url=team_config["link"],
        )
        create_comment(payload)

0 commit comments

Comments
 (0)