Skip to content

Commit ec05ee7

Browse files
committed
[jenkins_failures.py] Extract NODE_NAME from console output
The node name is the container's name which can be linked to the VM and then to the physical machine. This will extract and print the node's name to link commonly failing tests to the machine they run on. This can be used to deduce if a particular machine may be experiencing slow hardware issues. Change-Id: I1d2132fa953e4e8d214ea438a80042e5c647cfdc Reviewed-on: https://review.couchbase.org/c/kv_engine/+/230557 Tested-by: Faizan Alam <[email protected]> Reviewed-by: Paolo Cocchi <[email protected]>
1 parent c7f2c00 commit ec05ee7

File tree

1 file changed

+63
-3
lines changed

1 file changed

+63
-3
lines changed

scripts/jenkins_failures.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
It attempts to group logically identical failure reasons together, and then
1717
outputs a list of observed failure reasons, ordered by frequency.
1818
19+
The script also extracts NODE_NAME from Jenkins console output to help identify
20+
which Jenkins nodes experienced failures. If no name is found, it will be set
21+
to 'Unknown'.
22+
1923
Note: This is _very_ rough-and-ready - it "works" in that it extracts useful
2024
information from our CV jobs, but it's likely very specialised to the currently
2125
observed test failures - i.e. the filtering in filter_failed_builds() will
@@ -38,6 +42,7 @@
3842
import os
3943
import re
4044
import sys
45+
import requests
4146

4247
try:
4348
import jenkins
@@ -79,6 +84,14 @@ def jenkins_request(
7984
req.url += 'tree=' + self.tree_filter
8085
return super().jenkins_request(req, add_crumb, resolve_auth, stream)
8186

87+
class SessionWithAuth(requests.Session):
    """
    requests.Session preconfigured with a (username, password) pair.

    The pair is stored on the session's `auth` attribute so that it persists
    for the lifetime of the session.
    """

    def __init__(self, username, password):
        super().__init__()
        credentials = (username, password)
        self.auth = credentials
94+
8295

8396
def init_worker(function, url, username, password):
8497
"""Initialise a multiprocessing worker by establishing a connection to
@@ -88,6 +101,42 @@ def init_worker(function, url, username, password):
88101
# We only ever access these fields from these code paths.
89102
function.server.tree_filter = 'url,result,timestamp,actions[parameters[*],foundFailureCauses[*]]'
90103

104+
# Initialize for HTTP requests
105+
function.request_session = SessionWithAuth(username, password)
106+
107+
def get_node_name(server, job, number, session):
    """Return the Jenkins NODE_NAME recorded in a build's console output.

    Builds the consoleText URL for the given job and build number, streams
    the start of the console log via `session`, and searches the first chunk
    for a 'NODE_NAME=<name>' entry.

    :param server: Object whose `server` attribute holds the Jenkins base
        URL. It is concatenated directly with 'job/...', so it presumably
        ends with a '/' (verify against the caller).
    :param job: Job name. Names starting with 'kv_engine-windows-' are used
        as-is; any other name must have the form '<job_name>/<branch>'.
    :param number: Build number.
    :param session: requests-style session providing
        get(url, stream=..., timeout=...).
    :returns: The node name, or None if it could not be determined
        (unexpected job name, failed request, or no NODE_NAME in the first
        chunk of console output).
    """
    if job.startswith('kv_engine-windows-'):
        # Sample console URL: /job/kv_engine-windows-master/63025/consoleText
        console_url = f"{server.server}job/{job}/{number}/consoleText"
    else:
        # We need the branch separate from the job name to construct the
        # console URL.
        # Sample console URL:
        #   /job/kv_engine.ASan-UBSan/job/master/43301/consoleText
        try:
            (job_name, branch) = job.split('/')
        except ValueError:
            # This lookup is best-effort; an unexpected job name must not
            # abort processing of the build.
            logging.warning(
                f"get_node_name: Unexpected job name '{job}' for build {number}")
            return None
        console_url = f"{server.server}job/{job_name}/job/{branch}/{number}/consoleText"

    try:
        response = session.get(console_url, stream=True, timeout=30)
        try:
            response.raise_for_status()

            # Node name in the first ~5000 bytes, however load 10000 bytes to
            # be safe. Only the first chunk is ever inspected.
            for chunk in response.iter_content(decode_unicode=True, chunk_size=10000):
                if isinstance(chunk, bytes):
                    # iter_content() yields raw bytes when the response does
                    # not declare an encoding, even with decode_unicode=True.
                    chunk = chunk.decode('utf-8', errors='replace')
                match = re.search(r'NODE_NAME=([^\s]+)', chunk)
                if match:
                    return match.group(1)
                break
        finally:
            # With stream=True the body is not fully consumed, so the
            # connection must be released explicitly.
            response.close()

        logging.warning(f"Failed to find NODE_NAME for {job}-{number}")

    except requests.exceptions.RequestException as e:
        logging.debug(f"get_node_name: Failed to fetch from {console_url}: {e}")
        return None
    except Exception as e:
        logging.warning(f"get_node_name: Failed to fetch console output for {job}-{number}: {e}")
        return None

    return None
91140

92141
def get_build_info(build):
93142
"""For the given build job and number, download the information for
@@ -105,6 +154,15 @@ def get_build_info(build):
105154
if result in ('SUCCESS', 'ABORTED'):
106155
# Job succeeded or explicitly aborted - skip
107156
return
157+
158+
session = get_build_info.request_session
159+
node_name = get_node_name(get_build_info.server, job, number, session)
160+
if node_name:
161+
info['node_name'] = node_name
162+
logging.debug("Build: {}-{}: Node: {}".format(job, number, node_name))
163+
else:
164+
info['node_name'] = 'Unknown'
165+
108166
key = job + "-" + str(number)
109167
return (key, info)
110168

@@ -189,7 +247,8 @@ def extract_failed_builds(details):
189247
failures[description].append({'description': description,
190248
'gerrit_patch': gerrit_patch,
191249
'timestamp': timestamp,
192-
'url': info['url']})
250+
'url': info['url'],
251+
'node_name': info['node_name']})
193252
if not description:
194253
logging.warning(
195254
"extract_failed_builds: Did not find failure cause for " +
@@ -459,8 +518,9 @@ def include(elem):
459518
(num_failures * 100.0) / total_failures))
460519
for d_idx, d in enumerate(details[:100]):
461520
human_time = d['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
462-
print("* Time: {}, Jenkins job: {}, patch: {}".format(human_time,
463-
d['url'], d['gerrit_patch']))
521+
node_name = d['node_name']
522+
print("* Time: {}, Jenkins job: {}, patch: {}, node: {}".format(human_time,
523+
d['url'], d['gerrit_patch'], node_name))
464524
if len(d['variables']) > 0:
465525
print(' `- where ', end='')
466526
for name, value in d['variables'].items():

0 commit comments

Comments
 (0)