Commit 1acf9f4

Add jenkins_failures.py script
Script to summarise the details of intermittent CV test failures from
Jenkins. It attempts to group logically identical failure reasons together,
and then outputs a list of observed failure reasons, ordered by frequency.

Note: This is _very_ rough-and-ready - it "works" in that it extracts useful
information from our CV jobs, but it's likely very specialised to the
currently observed test failures - i.e. the filtering in
filter_failed_builds() will likely need enhancing as and when tests change.

Usage: ./jenkins_failures.py <USERNAME> <JENKINS_API_TOKEN> [--mode=MODE]

The Jenkins API token can be set up by logging into cv.jenkins.couchbase.com
and clicking "Add New Token" from your user page
(e.g. http://cv.jenkins.couchbase.com/user/daverigby/configure).

Change-Id: I9262c1c8b0b3b24d740278b2dceda19bfe1480df
Reviewed-on: http://review.couchbase.org/c/kv_engine/+/141148
Reviewed-by: James Harrison <[email protected]>
Tested-by: Dave Rigby <[email protected]>
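For example, build details can be downloaded once and then re-analysed
offline by combining the 'download' and 'parse' modes (builds.json here is
an arbitrary filename):

./jenkins_failures.py <USERNAME> <JENKINS_API_TOKEN> --mode=download > builds.json
./jenkins_failures.py <USERNAME> <JENKINS_API_TOKEN> --mode=parse < builds.json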
1 parent 908a0de commit 1acf9f4

1 file changed: +284 −0 lines changed

scripts/jenkins_failures.py

Lines changed: 284 additions & 0 deletions
@@ -0,0 +1,284 @@
#!/usr/bin/env python3

"""Script to summarise the details of intermittent CV test failures from
Jenkins.

It attempts to group logically identical failure reasons together, and then
outputs a list of observed failure reasons, ordered by frequency.

Note: This is _very_ rough-and-ready - it "works" in that it extracts useful
information from our CV jobs, but it's likely very specialised to the currently
observed test failures - i.e. the filtering in filter_failed_builds() will
likely need enhancing as and when tests change.

Usage: ./jenkins_failures.py <USERNAME> <JENKINS_API_TOKEN> [--mode=MODE]

The Jenkins API Token can be set up by logging into cv.jenkins.couchbase.com
and clicking "Add New Token" from your user page
(e.g. http://cv.jenkins.couchbase.com/user/daverigby/configure)
"""

import argparse
import collections
import datetime
import json
import logging
import multiprocessing
import os
import re
import sys

try:
    import jenkins
except ModuleNotFoundError as e:
    print("{}.\n\nInstall using `python3 -m pip install python-jenkins` "
          "or similar.".format(e), file=sys.stderr)
    sys.exit(1)

logging.basicConfig(level=logging.DEBUG)
logging.getLogger('urllib3').setLevel(logging.ERROR)


def init_worker(function, url, username, password):
    """Initialise a multiprocessing worker by establishing a connection to
    the Jenkins server at url/username/password."""
    function.server = jenkins.Jenkins(url, username=username, password=password)
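    # Note: the connection is stashed on the passed-in function object itself,
    # so each pool worker ends up with its own jenkins.Jenkins connection,
    # which get_build_info() below can reuse via `get_build_info.server`.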


def get_build_info(build):
    """For the given build job and number, download the information for
    that job from Jenkins.
    """
    job = build['job']
    number = build['number']
    depth = 5  # Minimum depth needed to fetch all the details we need.
    info = get_build_info.server.get_build_info(job, number, depth)
    result = info['result']
    logging.debug("Build: {}-{}: {}".format(job, number, result))
    if not result:
        # Job still running - skip.
        return
    if result in ('SUCCESS', 'ABORTED'):
        # Job succeeded or was explicitly aborted - skip.
        return
    key = job + "-" + str(number)
    return (key, info)


def fetch_failed_builds(server_url, username, password):
    """For the given Jenkins server URL & credentials, fetch details of all
    failed builds.
    Returns a dictionary of job+build_number to build details dictionary."""
    server = jenkins.Jenkins(server_url, username=username, password=password)
    user = server.get_whoami()
    version = server.get_version()
    logging.info('Hello {} from Jenkins {}'.format(user['fullName'], version))

    details = dict()

    jobs = ('kv_engine.linux/master', 'kv_engine.linux-CE/master',
            'kv_engine.macos/master', 'kv_engine.ASan-UBSan/master',
            'kv_engine.threadsanitizer/master', 'kv_engine-windows-master')

    with multiprocessing.Pool(os.cpu_count() * 4,
                              initializer=init_worker,
                              initargs=(get_build_info,
                                        server_url,
                                        username,
                                        password)) as pool:
        for job in jobs:
            builds = server.get_job_info(job, depth=1,
                                         fetch_all_builds=True)['builds']
            for b in builds:
                b['job'] = job
            logging.debug(
                "get_job_info({}) returned {} builds".format(job, len(builds)))

            # Constrain to the last 200 builds of each job.
            builds = builds[:200]
            results = pool.map(get_build_info, builds)
            for r in results:
                if r:
                    details[r[0]] = r[1]
    return details


def get_gerrit_patch_from_parameters_action(action):
    """For the given action dictionary, return the Gerrit patch URL if present,
    else return None."""
    for param in action['parameters']:
        if param['name'] == 'GERRIT_CHANGE_URL':
            return param['value']
    return None
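

# Illustrative (hypothetical) shapes of the two 'actions' entries consumed by
# extract_failed_builds() below - only the fields the code reads are shown:
#   {'_class': 'hudson.model.ParametersAction',
#    'parameters': [{'name': 'GERRIT_CHANGE_URL',
#                    'value': 'http://review.couchbase.org/c/kv_engine/+/...'}]}
#   {'_class': 'com.sonyericsson.jenkins.plugins.bfa.model.FailureCauseBuildAction',
#    'foundFailureCauses': [{'description': 'Timeout in test ...'}]}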
def extract_failed_builds(details):
    failures = dict()
    for number, info in details.items():
        actions = info['actions']
        description = None
        gerrit_patch = None
        for a in actions:
            if '_class' in a and a['_class'] == 'hudson.model.ParametersAction':
                gerrit_patch = get_gerrit_patch_from_parameters_action(a)
            if '_class' in a and a['_class'] == 'com.sonyericsson.jenkins.plugins.bfa.model.FailureCauseBuildAction':
                for cause in a['foundFailureCauses']:
                    description = cause['description'].strip()
                    timestamp = datetime.datetime.utcfromtimestamp(
                        info['timestamp'] / 1000.0)
                    if description not in failures:
                        failures[description] = list()
                    assert gerrit_patch
                    failures[description].append({'description': description,
                                                  'gerrit_patch': gerrit_patch,
                                                  'timestamp': timestamp,
                                                  'url': info['url']})
        if not description:
            logging.warning(
                "extract_failed_builds: Did not find failure cause for " +
                info['url'] + " Result:" + info['result'])
        if not gerrit_patch:
            logging.warning(
                "extract_failed_builds: Did not find Gerrit patch for " +
                info['url'] + " Result:" + info['result'])
    return failures


def filter_failed_builds(details):
    """Discard any failures which are not of interest, and collapse effectively
    identical issues:
    - compile errors, which are not transient and hence are not of interest
      when looking for intermittent issues.
    - errors which mention an address which differs but are otherwise
      identical :)
    """

    def include(elem):
        for ignored in (r'Unexpected stat:', r'Compile error at',
                        r'^The warnings threshold for this job was exceeded'):
            if re.search(ignored, elem[0]):
                return False
        return True

    filtered = dict(filter(include, details.items()))

    # Replace all hex addresses with a single sanitized value.
    def mask(matchobj):
        return '0x' + ('X' * len(matchobj.group(1)))

    merged = collections.defaultdict(list)
    for key, value in filtered.items():
        key = re.sub(r'0x([0-9a-fA-F]+)', mask, key)
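        # e.g. a (hypothetical) signature 'Segfault at 0x7f3a9c0b' becomes
        # 'Segfault at 0xXXXXXXXX', so crashes which differ only in address
        # merge into a single key.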
        # Strip out pipeline build timestamps of the form
        # '[2020-11-26T15:30:05.571Z] ' - non-pipeline builds don't include
        # them.
        key = re.sub(r'\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}Z\] ', '',
                     key)
        # Filter timings of the form '2 ms'.
        key = re.sub(r'\d+ ms', 'X ms', key)
        # Filter exact test order / test number - e.g.
        #   13/398 Test #181: ep-engine_ep_unit_tests.
        key = re.sub(r'\s+\d+/\d+ Test\s+#\d+:', ' N/M Test X:', key)
        # Filter engine_testapp exact order / test number - e.g.
        #   Running [0007/0047]: multi se
        key = re.sub(r'Running \[\d+/\d+]:', 'Running [X/Y]:', key)
        # Replace long '....' strings used for padding / alignment.
        key = re.sub(r'\.\.\.+', '...', key)
        # Filter expected / actual values.
        key = re.sub(r'last value:\d+', 'last value:XXX', key)
        key = re.sub(r"Expected `(\d+)', got `\d+'",
                     r"Expected `\1', got `YYY'", key)
        key = re.sub(r"Expected `\d+' to be greater than or equal to `(\d+)'",
                     r"Expected `XXX' to be greater than or equal to `\1'", key)
        # Normalise file paths across Windows & *ix.
        key = key.replace('\\', '/')
        # Merge value-only and full-eviction failures.
        key = key.replace('value_only', '<EVICTION_MODE>')
        key = key.replace('full_eviction', '<EVICTION_MODE>')
        # Merge couchstore filenames.
        key = re.sub(r'\d+\.couch\.\d+', 'VBID.couch.REV', key)
        # Filter local ephemeral port numbers.
        key = re.sub(r'127\.0\.0\.1(.)\d{5}', r'127.0.0.1\1<PORT>', key)
        key = re.sub(r'localhost(.)\d{5}', r'localhost\1<PORT>', key)
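        # e.g. a (hypothetical) 'Connection refused: 127.0.0.1:54321' becomes
        # 'Connection refused: 127.0.0.1:<PORT>', merging failures which
        # differ only in the ephemeral port used by that run.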

        merged[key] += value

    # For each failure, determine the time range it has occurred over, and
    # for which patches.
    # If it only occurred:
    #   a) over a small time range (6 hours), or
    #   b) for a single patch,
    # then that implies it's not a repeated, intermittent issue - e.g. it's
    # just an issue with one particular patch, or a transient issue on some
    # machine - hence discard it.
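    # For example (hypothetical): a signature seen at 09:00 and 17:00 across
    # two different Gerrit patches is kept; one seen three times within an
    # hour, all on the same patch, is discarded.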
    merged2 = dict()
    for key, details in merged.items():
        timestamps = list()
        patches = dict()
        for d in details:
            timestamps.append(d['timestamp'])
            patches[d['gerrit_patch']] = True
        range = max(timestamps) - min(timestamps)

        if range > datetime.timedelta(hours=6) and len(patches) > 1:
            merged2[key] = details
    return merged2


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('username', help='Username for Jenkins REST API')
    parser.add_argument('password', help='Password/token for Jenkins REST API')
    parser.add_argument('--mode',
                        choices=['download', 'parse', 'download_and_parse'],
                        default='download_and_parse',
                        help=('Analysis mode.\n'
                              'download: Download build details, writing to stdout as JSON.\n'
                              'parse: Parse previously-downloaded build details read from stdin.\n'
                              'download_and_parse: Download and parse build details.\n'))
    args = parser.parse_args()

    if args.mode != 'parse':
        raw_builds = fetch_failed_builds('http://cv.jenkins.couchbase.com',
                                         args.username, args.password)
        if args.mode == 'download':
            print(json.dumps(raw_builds))
            sys.exit(0)
    if args.mode == 'parse':
        raw_builds = json.load(sys.stdin)
    logging.info("Number of builds downloaded: {}.".format(len(raw_builds)))
    failures = extract_failed_builds(raw_builds)
    logging.info("Number of builds with at least one failure: {}.".format(
        len(failures)))
    failures = filter_failed_builds(failures)
    logging.info("Number of unique failures: {}".format(len(failures)))

    sorted_failures = sorted(failures.items(), key=lambda i: len(i[1]),
                             reverse=True)

    # Count the total number of failure instances across all signatures.
    total_failures = 0
    for (_, details) in sorted_failures:
        total_failures += len(details)
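
    # The report below appears to use Jira-style wiki markup ('+...+' headings
    # and '{code}' blocks), so the summary can be pasted into an issue
    # tracker; most frequent failure signature first.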
    for (summary, details) in sorted_failures:
        num_failures = len(details)
        print('+Error signature+')
        print('{code}')
        print(summary)
        print('{code}\n')
        print('+Details+')
        print('{} instances of this failure ({:.1f}% of sampled failures):'
              .format(num_failures, (num_failures * 100.0) / total_failures))
        for d in details[:10]:
            human_time = d['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
            print("* Time: {}, Jenkins job: {}, patch: {}".format(
                human_time, d['url'], d['gerrit_patch']))
        if len(details) > 10:
            print("<cut> - only showing details of first 10/{} instances."
                  .format(len(details)))
        print('\n===========\n')
