Skip to content

Commit 5445937

Browse files
authored
[TRTLLM-10248][feat] Support Bot to Send Perf Regression Msg to Slack Channel (#10489)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 3a9a00b commit 5445937

File tree

7 files changed

+530
-142
lines changed

7 files changed

+530
-142
lines changed

jenkins/runPerfSanityTriage.groovy

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
2+
3+
import java.lang.InterruptedException
4+
5+
// Image passed to createKubernetesPodConfig for the agent pod's "trt-llm" container.
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202510291120-8621"

// LLM repository configuration: use env.gitlabSourceRepoHttpUrl when it is set,
// otherwise fall back to the URL stored in the 'default-llm-repo' credential.
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ?: "${DEFAULT_LLM_REPO}"
}
// Passed to trtllm_utils.checkoutSource and used as the root of the script path.
LLM_ROOT = "llm"
12+
13+
def createKubernetesPodConfig(image, arch = "amd64")
{
    // Build the Kubernetes agent configuration map (cloud, namespace, pod YAML).
    //   image: container image used for the "trt-llm" container.
    //   arch:  "arm64" selects the arm JNLP image; any other value selects the amd one.
    def jnlpArch = (arch == "arm64") ? "arm" : "amd"
    def agentImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${jnlpArch}_linux:jdk17"

    // Requests equal limits for both containers. "\$" keeps Groovy from
    // interpolating the $(JENKINS_...) placeholders in the jnlp args.
    def podYaml = """
        apiVersion: v1
        kind: Pod
        spec:
            nodeSelector:
                nvidia.com/node_type: builder
                kubernetes.io/os: linux
            containers:
              - name: trt-llm
                image: ${image}
                command: ['cat']
                volumeMounts:
                - name: sw-tensorrt-pvc
                  mountPath: "/mnt/sw-tensorrt-pvc"
                  readOnly: false
                tty: true
                resources:
                  requests:
                    cpu: 2
                    memory: 5Gi
                    ephemeral-storage: 25Gi
                  limits:
                    cpu: 2
                    memory: 5Gi
                    ephemeral-storage: 25Gi
                imagePullPolicy: Always
              - name: jnlp
                image: ${agentImage}
                args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
                resources:
                  requests:
                    cpu: '2'
                    memory: 5Gi
                    ephemeral-storage: 25Gi
                  limits:
                    cpu: '2'
                    memory: 5Gi
                    ephemeral-storage: 25Gi
            qosClass: Guaranteed
            volumes:
            - name: sw-tensorrt-pvc
              persistentVolumeClaim:
                claimName: sw-tensorrt-pvc
        """.stripIndent()

    return [
        cloud: "kubernetes-cpu",
        namespace: "sw-tensorrt",
        yaml: podYaml,
    ]
}
69+
70+
// Declarative pipeline that checks out the LLM repository and runs
// jenkins/scripts/perf/perf_sanity_triage.py inside the "trt-llm" container.
pipeline {
    agent {
        kubernetes createKubernetesPodConfig(DOCKER_IMAGE)
    }
    options {
        timestamps()
    }
    environment {
        OPEN_SEARCH_DB_BASE_URL=credentials("open_search_db_base_url")
        OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
    }
    parameters {
        string(name: "BRANCH", defaultValue: "main", description: "Branch to checkout.")
        string(name: "OPEN_SEARCH_PROJECT_NAME", defaultValue: "swdl-trtllm-infra-ci-prod-perf_sanity_info", description: "OpenSearch project name.")
        string(name: "OPERATION", defaultValue: "SLACK BOT SENDS MESSAGE", description: "Operation to perform.")
        string(name: "QUERY_JOB_NUMBER", defaultValue: "1", description: "Number of latest jobs to query.")
        string(name: "SLACK_CHANNEL_ID", defaultValue: "C0A7D0LCA1F", description: "Slack channel IDs to send messages to.")
        string(name: "SLACK_BOT_TOKEN", defaultValue: "", description: "Slack bot token for authentication.")
    }
    stages {
        stage("Run Perf Sanity Script") {
            steps {
                container("trt-llm") {
                    script {
                        sh "pwd && ls -alh"
                        // SLACK_BOT_TOKEN is a plain string parameter, so it is not masked
                        // in the log; keep it out of the environment dump.
                        sh "env | grep -v '^SLACK_BOT_TOKEN=' | sort"
                        trtllm_utils.checkoutSource(LLM_REPO, params.BRANCH, LLM_ROOT, false, false)
                        sh "pip install slack_sdk"
                        // Build parameters are exposed to sh as environment variables.
                        // The script is single-quoted so the shell, not Groovy, expands
                        // them: parameter values cannot inject shell syntax, and the
                        // token is not embedded in the generated script text.
                        withEnv(["LLM_ROOT=${LLM_ROOT}"]) {
                            sh '''
                                set +x  # do not trace the expanded bot token into the log
                                cd "$LLM_ROOT/jenkins/scripts/perf" && ls -alh && python3 perf_sanity_triage.py \
                                --project_name "$OPEN_SEARCH_PROJECT_NAME" \
                                --operation "$OPERATION" \
                                --channel_id "$SLACK_CHANNEL_ID" \
                                --bot_token "$SLACK_BOT_TOKEN" \
                                --query_job_number "$QUERY_JOB_NUMBER"
                            '''
                        }
                    }
                }
            }
        } // stage Run Perf Sanity Script
    } // stages
} // pipeline
jenkins/scripts/perf/perf_sanity_triage.py

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import json
5+
import sys
6+
import time
7+
8+
from slack_sdk import WebClient
9+
from slack_sdk.errors import SlackApiError
10+
11+
sys.path.insert(0, sys.path[0] + "/..")
12+
from open_search_db import OpenSearchDB
13+
14+
# Size, in days, of the ts_created lookback window used when querying regressions.
QUERY_LOOKBACK_DAYS = 90
# Value of the "size" field of the OpenSearch query (maximum hits requested).
MAX_QUERY_SIZE = 3000
# Maximum number of regression test cases packed into one Slack message.
MAX_TEST_CASES_PER_MSG = 5
# Number of attempts made to post one Slack message before giving up.
POST_SLACK_MSG_RETRY_TIMES = 5
18+
19+
20+
def query_regression_data(project_name):
    """Query post-merge regression entries from the OpenSearch database.

    The query selects documents flagged valid, post-merge, regression and
    non-baseline whose ``ts_created`` falls within the last
    ``QUERY_LOOKBACK_DAYS`` days. Hits are requested newest first so that the
    ``MAX_QUERY_SIZE`` cap drops the oldest entries rather than an arbitrary
    subset.

    Args:
        project_name: OpenSearch project name passed to
            ``OpenSearchDB.queryFromOpenSearchDB``.

    Returns:
        A list of ``_source`` dicts, each extended with the hit's ``_id``; an
        empty list when nothing matched; ``None`` when the query failed or a
        hit carried no ``_id``.
    """
    seconds_per_day = 24 * 3600
    # Floor to a whole-day boundary, then convert seconds to milliseconds.
    earliest_day = int(time.time()) // seconds_per_day - QUERY_LOOKBACK_DAYS
    earliest_ts_ms = earliest_day * seconds_per_day * 1000

    must_clauses = [
        {"term": {"b_is_valid": True}},
        {"term": {"b_is_post_merge": True}},
        {"term": {"b_is_regression": True}},
        {"term": {"b_is_baseline": False}},
        {"range": {"ts_created": {"gte": earliest_ts_ms}}},
    ]
    json_data = json.dumps(
        {
            "query": {"bool": {"must": must_clauses}},
            # Without an explicit sort the size cap would keep an arbitrary
            # subset; newest first guarantees the latest jobs are included.
            "sort": [{"ts_created": {"order": "desc"}}],
            "size": MAX_QUERY_SIZE,
        }
    )

    data_list = []
    try:
        res = OpenSearchDB.queryFromOpenSearchDB(json_data, project_name)
        if res is None:
            print(f"Failed to query from {project_name}, returned no response")
            return None
        payload = res.json().get("hits", {}).get("hits", [])
        if not payload:
            print(f"No regression data found in {project_name}, returned empty list")
            return []
        for hit in payload:
            data_dict = hit.get("_source", {})
            data_dict["_id"] = hit.get("_id", "")
            if data_dict["_id"] == "":
                # A hit without an id invalidates the whole result.
                print(f"Failed to query from {project_name}, returned data with no _id")
                return None
            data_list.append(data_dict)
        print(f"Successfully queried from {project_name}, queried {len(data_list)} entries")
        return data_list
    except Exception as e:
        # Boundary handler: any query or decoding error is reported as a failed query.
        print(f"Failed to query from {project_name}, returned error: {e}")
        return None
72+
73+
74+
def get_regression_data_by_job_id(data_list, query_job_number):
    """Group regression entries by job id and keep only the latest jobs.

    Args:
        data_list: Iterable of regression dicts; entries without a usable
            ``s_job_id`` are skipped. ``None`` or empty yields ``{}``.
        query_job_number: Maximum number of jobs to keep. Zero or a negative
            value yields ``{}``; ``None`` keeps every job.

    Returns:
        A dict mapping job id to its list of regression dicts, ordered from
        the job with the newest ``ts_created`` to the oldest.
    """
    if not data_list:
        return {}
    # A negative count would otherwise slice from the end and silently return
    # "all but the N oldest" jobs instead of nothing.
    if query_job_number is not None and query_job_number <= 0:
        return {}

    # Group data by job_id, preserving the input order inside each group.
    job_data_dict = {}
    for data in data_list:
        job_id = data.get("s_job_id", "")
        if not job_id:
            continue
        job_data_dict.setdefault(job_id, []).append(data)

    def get_latest_timestamp(job_id):
        # A missing or None ts_created counts as 0 so it cannot break the
        # comparison inside max()/sorted().
        return max((d.get("ts_created") or 0) for d in job_data_dict[job_id])

    # Newest job first; the stable sort keeps first-seen order among ties.
    sorted_job_ids = sorted(job_data_dict, key=get_latest_timestamp, reverse=True)

    return {job_id: job_data_dict[job_id] for job_id in sorted_job_ids[:query_job_number]}
107+
108+
109+
def process_regression_message(regression_dict):
    """Turn grouped regression data into Slack message texts.

    Test cases are numbered per job (sorted by ``s_test_case_name``), flattened
    across jobs, and packed into messages holding at most
    ``MAX_TEST_CASES_PER_MSG`` test cases each. A job header line is emitted
    whenever the job id changes inside a message.

    Returns a list of message strings; empty input yields an empty list.
    """
    if not regression_dict:
        return []

    # One (job_id, per-job position, entry) tuple per test case.
    flat_cases = []
    for job_id, entries in regression_dict.items():
        ordered = sorted(entries, key=lambda entry: entry.get("s_test_case_name", ""))
        flat_cases.extend(
            (job_id, position, entry) for position, entry in enumerate(ordered, start=1)
        )

    messages = []
    for start in range(0, len(flat_cases), MAX_TEST_CASES_PER_MSG):
        lines = []
        previous_job = None
        for job_id, position, entry in flat_cases[start : start + MAX_TEST_CASES_PER_MSG]:
            if job_id != previous_job:
                # Separate consecutive job sections with a blank line.
                if lines:
                    lines.append("\n")
                lines.append(f"*LLM/main/L0_PostMerge/{job_id}:*\n")
                previous_job = job_id

            name = entry.get("s_test_case_name", "N/A")
            info = entry.get("s_regression_info", "N/A")
            lines.append(f"*REGRESSION TEST CASE {position}: {name}*\n")
            # One line per comma-separated detail, leaving out baseline ids.
            details = (piece.strip() for piece in info.split(","))
            lines.extend(
                f" {piece}\n" for piece in details if piece and "baseline_id" not in piece
            )

        messages.append("".join(lines).strip())

    return messages
155+
156+
157+
def send_regression_message(messages, channel_id, bot_token):
    """Deliver regression messages to one or more Slack channels.

    ``channel_id`` may hold a single ID or several IDs separated by commas.
    When either ``channel_id`` or ``bot_token`` is empty, the messages are
    printed instead of being sent.
    """
    if not messages:
        print("No regression data to send")
        return

    if not (channel_id and bot_token):
        print("Slack channel_id or bot_token not provided, printing message:")
        for number, text in enumerate(messages, start=1):
            print(f"--- Message {number} ---")
            print(text)
        return

    # Blank entries (e.g. from a trailing comma) are ignored.
    targets = [part.strip() for part in channel_id.split(",")]
    for target in targets:
        if not target:
            continue
        for text in messages:
            send_message(text, target, bot_token)
176+
177+
178+
def send_message(msg, channel_id, bot_token):
    """Send one message to a Slack channel using slack_sdk.

    The message is posted as a red attachment titled
    "Perf Sanity Regression Report". Posting is attempted up to
    ``POST_SLACK_MSG_RETRY_TIMES`` times with a one-second pause in between.

    Args:
        msg: Text placed in the attachment body.
        channel_id: ID of the Slack channel to post to.
        bot_token: Token used to authenticate the ``WebClient``.

    Returns:
        ``True`` when the message was posted, ``False`` when every attempt
        failed. Failures are reported on stdout and never raised.
    """
    client = WebClient(token=bot_token)

    attachments = [
        {
            "title": "Perf Sanity Regression Report",
            "color": "#ff0000",
            "text": msg,
        }
    ]

    for attempt in range(1, POST_SLACK_MSG_RETRY_TIMES + 1):
        try:
            result = client.chat_postMessage(
                channel=channel_id,
                attachments=attachments,
            )
            # Explicit check rather than `assert`, which is stripped under -O.
            if result["ok"] is not True:
                raise RuntimeError(json.dumps(result.data))
            print(f"Message sent successfully to channel {channel_id}")
            return True
        except SlackApiError as e:
            print(
                f"Attempt {attempt}/{POST_SLACK_MSG_RETRY_TIMES}: Error sending message to Slack: {e}"
            )
        except Exception as e:
            # Best effort: report unexpected errors and keep retrying.
            print(f"Attempt {attempt}/{POST_SLACK_MSG_RETRY_TIMES}: Unexpected error: {e}")

        if attempt < POST_SLACK_MSG_RETRY_TIMES:
            time.sleep(1)

    print(
        f"Failed to send message to channel {channel_id} after {POST_SLACK_MSG_RETRY_TIMES} attempts"
    )
    return False
212+
213+
214+
def main():
    """Parse the command line and run the requested triage operation.

    The only supported operation is "SLACK BOT SENDS MESSAGE": query the
    regression data, keep the latest ``--query_job_number`` jobs, format them
    and send (or print) the resulting messages.

    Raises:
        SystemExit: With status 1 when the regression query fails or the
            operation is unknown, so the caller can detect the failure
            instead of seeing a successful exit.
    """
    parser = argparse.ArgumentParser(description="Perf Sanity Triage Script")
    parser.add_argument("--project_name", type=str, required=True, help="OpenSearch project name")
    parser.add_argument("--operation", type=str, required=True, help="Operation to perform")
    parser.add_argument(
        "--channel_id",
        type=str,
        default="",
        help="Slack channel ID(s), comma-separated for multiple channels",
    )
    parser.add_argument("--bot_token", type=str, default="", help="Slack bot token")
    parser.add_argument(
        "--query_job_number", type=int, default=1, help="Number of latest jobs to query"
    )

    args = parser.parse_args()

    print(f"Project Name: {args.project_name}")
    print(f"Operation: {args.operation}")
    print(f"Channel ID: {args.channel_id}")
    # Never echo the token itself, only whether one was supplied.
    print(f"Bot Token: {'***' if args.bot_token else 'Not provided'}")
    print(f"Query Job Number: {args.query_job_number}")

    if args.operation != "SLACK BOT SENDS MESSAGE":
        print(f"Unknown operation: {args.operation}")
        sys.exit(1)

    data_list = query_regression_data(args.project_name)
    if data_list is None:
        print("Failed to query regression data")
        sys.exit(1)

    regression_dict = get_regression_data_by_job_id(data_list, args.query_job_number)
    messages = process_regression_message(regression_dict)
    send_regression_message(messages, args.channel_id, args.bot_token)
248+
249+
250+
# Run the triage only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ benchmark:
2222
multi_round: 8
2323
benchmark_ratio: 0.8
2424
streaming: true
25-
concurrency_list: '6144'
25+
concurrency_list: '1024'
2626
input_length: 1024
2727
output_length: 1024
2828
dataset_file: <dataset_file>

0 commit comments

Comments
 (0)