|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +"""Script to summarise the details of intermittent CV test failures from |
| 4 | +Jenkins. |
| 5 | +
|
| 6 | +It attempts to group logically identical failure reasons together, and then |
| 7 | +outputs a list of observed failure reasons, ordered by frequency. |
| 8 | +
|
| 9 | +Note: This is _very_ rough-and-ready - it "works" in that it extracts useful |
| 10 | +information from our CV jobs, but it's likely very specialised to the currently |
| 11 | +observed test failures - i.e. the filtering in filter_failed_builds() will |
| 12 | +likely need enhancing as and when tests change. |
| 13 | +
|
| 14 | +Usage: ./jenkins_failures.py <USERNAME> <JENKINS_API_TOKEN> [--mode=MODE] |
| 15 | +
|
| 16 | +The Jenkins API Token can be setup by logging into cv.jenkins.couchbase.com |
| 17 | +and clicking "Add New Token" from your user page |
| 18 | + (e.g. http://cv.jenkins.couchbase.com/user/daverigby/configure) |
| 19 | +""" |
| 20 | + |
| 21 | +import argparse |
| 22 | +import collections |
| 23 | +import datetime |
| 24 | +import json |
| 25 | +import logging |
| 26 | +import multiprocessing |
| 27 | +import os |
| 28 | +import re |
| 29 | +import sys |
| 30 | + |
# Fail early with an actionable message if the (third-party) python-jenkins
# module is not installed.  (Fix: removed a stray trailing semicolon after
# the print() call.)
try:
    import jenkins
except ModuleNotFoundError as e:
    print(
        "{}.\n\nInstall using `python3 -m pip install python-jenkins` or similar.".format(
            e), file=sys.stderr)
    sys.exit(1)

logging.basicConfig(level=logging.DEBUG)
# urllib3 (used by python-jenkins for HTTP) is very chatty at DEBUG level;
# silence everything below ERROR from it.
logging.getLogger('urllib3').setLevel(logging.ERROR)
| 41 | + |
| 42 | + |
def init_worker(function, url, username, password):
    """Initialise a multiprocessing worker: open a connection to the Jenkins
    server at url (authenticating with username/password) and attach it to
    *function* as its `server` attribute for later use."""
    connection = jenkins.Jenkins(url, username=username, password=password)
    function.server = connection
| 47 | + |
| 48 | + |
def get_build_info(build):
    """For the given build (a dict with 'job' and 'number' keys), download
    that build's information from Jenkins via get_build_info.server.

    Returns a ("job-number", info) tuple for completed builds which neither
    succeeded nor were aborted; returns None for everything else.
    """
    job, number = build['job'], build['number']
    # Depth 5 is the minimum needed to fetch all the details we need.
    info = get_build_info.server.get_build_info(job, number, 5)
    result = info['result']
    logging.debug("Build: {}-{}: {}".format(job, number, result))
    # No result means the job is still running; SUCCESS/ABORTED builds are
    # not failures - skip all of those.
    if not result or result in ('SUCCESS', 'ABORTED'):
        return None
    return ("{}-{}".format(job, number), info)
| 67 | + |
| 68 | + |
def fetch_failed_builds(server_url, username, password):
    """For the given Jenkins server URL & credentials, fetch details of all
    failed builds.
    Returns a dictionary of job+build_number to build details dictionary."""
    server = jenkins.Jenkins(server_url, username=username, password=password)
    # Sanity-check the connection / credentials before doing any real work.
    user = server.get_whoami()
    version = server.get_version()
    logging.info('Hello {} from Jenkins {}'.format(user['fullName'], version))

    details = dict()

    # The set of CV jobs to scan for failures.
    jobs = ('kv_engine.linux/master', 'kv_engine.linux-CE/master',
            'kv_engine.macos/master', 'kv_engine.ASan-UBSan/master',
            'kv_engine.threadsanitizer/master', 'kv_engine-windows-master')

    # Fetching each build's info is an independent HTTP request, so
    # parallelise with a pool of 4x CPU workers; init_worker() gives each
    # worker its own Jenkins connection (attached to get_build_info).
    with multiprocessing.Pool(os.cpu_count() * 4,
                              initializer=init_worker,
                              initargs=(get_build_info,
                                        server_url,
                                        username,
                                        password)) as pool:
        for job in jobs:
            builds = server.get_job_info(job, depth=1, fetch_all_builds=True)[
                'builds']
            # Tag each build with its job name so get_build_info() knows
            # which job to query.
            for b in builds:
                b['job'] = job
            logging.debug(
                "get_job_info({}) returned {} builds".format(job, len(builds)))

            # Constrain to last 200 builds of each job.
            builds = builds[:200]
            results = pool.map(get_build_info, builds)
            # get_build_info() returns None for builds we skip
            # (in-progress / SUCCESS / ABORTED).
            for r in results:
                if r:
                    details[r[0]] = r[1]
    return details
| 105 | + |
| 106 | + |
def get_gerrit_patch_from_parameters_action(action):
    """Search the given ParametersAction dictionary for the GERRIT_CHANGE_URL
    parameter; return its value if present, else None."""
    return next((param['value']
                 for param in action['parameters']
                 if param['name'] == 'GERRIT_CHANGE_URL'),
                None)
| 114 | + |
| 115 | + |
def extract_failed_builds(details):
    """Extract failure-cause details from each downloaded build.

    :param details: dict of "job-number" -> Jenkins build info dict, as
        returned by fetch_failed_builds().
    :returns: dict of failure-cause description -> list of instance dicts
        (keys: description, gerrit_patch, timestamp, url), one per build
        which failed with that cause.

    Fix: the previous `assert(gerrit_patch)` crashed whenever the Gerrit
    patch was missing (or the ParametersAction followed the failure-cause
    action in the actions list), making the "Did not find Gerrit patch"
    warning below unreachable.  Now the patch is located in a first pass and
    a missing patch is recorded as None and warned about.
    """
    failures = dict()
    for number, info in details.items():
        actions = info['actions']
        description = None
        # First pass: find the Gerrit patch URL - the ParametersAction is
        # not guaranteed to precede the failure-cause action.
        gerrit_patch = None
        for a in actions:
            if a.get('_class') == 'hudson.model.ParametersAction':
                gerrit_patch = get_gerrit_patch_from_parameters_action(a)
        # Second pass: record one instance per identified failure cause.
        for a in actions:
            if a.get('_class') == 'com.sonyericsson.jenkins.plugins.bfa.model.FailureCauseBuildAction':
                for cause in a['foundFailureCauses']:
                    description = cause['description'].strip()
                    # Jenkins timestamps are milliseconds since the epoch.
                    timestamp = datetime.datetime.utcfromtimestamp(
                        info['timestamp'] / 1000.0)
                    failures.setdefault(description, list()).append(
                        {'description': description,
                         'gerrit_patch': gerrit_patch,
                         'timestamp': timestamp,
                         'url': info['url']})
        if not description:
            logging.warning(
                "extract_failed_builds: Did not find failure cause for " +
                info['url'] + " Result:" + info['result'])
        if not gerrit_patch:
            logging.warning(
                "extract_failed_builds: Did not find Gerrit patch for " +
                info['url'] + " Result:" + info['result'])
    return failures
| 146 | + |
| 147 | + |
def filter_failed_builds(details):
    """Discard any failures which are not of interest, and collapse effectively
    identical issues:
    - compile errors which are not transient and hence are not of interest when
      looking for intermittent issues.
    - Errors which mention an address which differs but is otherwise identical :)

    :param details: dict of failure description -> list of failure instance
        dicts (as returned by extract_failed_builds()).
    :returns: dict of the same shape, with signatures normalised/merged and
        restricted to failures seen on more than one patch over a greater
        than 6-hour span.

    (Fix: renamed locals which shadowed the `range` builtin and the
    `details` parameter.)
    """

    def include(elem):
        # Signatures matching any of these are deterministic failures
        # (e.g. genuine compile errors), not intermittent ones - drop them.
        for ignored in (r'Unexpected stat:', r'Compile error at',
                        r'^The warnings threshold for this job was exceeded'):
            if re.search(ignored, elem[0]):
                return False
        return True

    filtered = dict(filter(include, details.items()))

    # Replace all hex addresses with a single sanitized value.
    def mask(matchobj):
        return '0x' + ('X' * len(matchobj.group(1)))

    merged = collections.defaultdict(list)
    for key, value in filtered.items():
        key = re.sub(r'0x([0-9a-fA-F]+)', mask, key)
        # Strip out pipeline build timestamps of the form
        # '[2020-11-26T15:30:05.571Z] ' - non-pipeline builds don't include
        # them.
        key = re.sub(r'\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}Z\] ', '',
                     key)
        # Filter timings of the form '2 ms'
        key = re.sub(r'\d+ ms', 'X ms', key)
        # Filter exact test order / test number - e.g.
        #     13/398 Test #181: ep-engine_ep_unit_tests.
        key = re.sub(r'\s+\d+/\d+ Test\s+#\d+:', ' N/M Test X:', key)
        # Filter engine_testapp exact order / test number - e.g.
        #     Running [0007/0047]: multi se
        key = re.sub(r'Running \[\d+/\d+]:', 'Running [X/Y]:', key)
        # Replace long '....' strings used for padding / alignment
        key = re.sub(r'\.\.\.+', '...', key)
        # Filter expected / actual values
        key = re.sub(r'last value:\d+', 'last value:XXX', key)
        key = re.sub(r"Expected `(\d+)', got `\d+'",
                     r"Expected `\1', got `YYY'", key)
        key = re.sub(r"Expected `\d+' to be greater than or equal to `(\d+)'",
                     r"Expected `XXX' to be greater than or equal to `\1'", key)
        # Normalise file paths across Windows & *ix.
        key = key.replace('\\', '/')
        # Merge value-only and full-eviction failures
        key = key.replace('value_only', '<EVICTION_MODE>')
        key = key.replace('full_eviction', '<EVICTION_MODE>')
        # Merge couchstore filenames
        key = re.sub(r'\d+\.couch\.\d+', 'VBID.couch.REV', key)
        # Filter local ephemeral port numbers
        key = re.sub(r'127\.0\.0\.1(.)\d{5}', r'127.0.0.1\1<PORT>', key)
        key = re.sub(r'localhost(.)\d{5}', r'localhost\1<PORT>', key)

        merged[key] += value

    # For each failure, determine the time range it has occurred over, and
    # for which patches.
    # If it only occurred:
    #   a) Over a small time (6 hours), or
    #   b) For a single patch
    # then that implies it's not a repeated, intermittent issue - e.g. just an
    # issue with a single patch / a transient issue on some machine - hence
    # discard it.
    merged2 = dict()
    for key, instances in merged.items():
        timestamps = [d['timestamp'] for d in instances]
        patches = {d['gerrit_patch'] for d in instances}
        time_span = max(timestamps) - min(timestamps)

        if time_span > datetime.timedelta(hours=6) and len(patches) > 1:
            merged2[key] = instances
    return merged2
| 226 | + |
| 227 | + |
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('username', help='Username for Jenkins REST API')
    parser.add_argument('password', help='Password/token for Jenkins REST API')
    parser.add_argument('--mode',
                        choices=['download', 'parse', 'download_and_parse'],
                        default='download_and_parse',
                        help=('Analysis mode.\n'
                              'download: Download build details, writing to stdout as JSON.\n'
                              'parse: Parse previously-downloaded build details read from stdin.\n'
                              'download_and_parse: Download and parse build details.\n'))
    args = parser.parse_args()

    # 'download' and 'download_and_parse' need to talk to Jenkins; 'parse'
    # reads previously-downloaded JSON from stdin instead.
    if args.mode != 'parse':
        raw_builds = fetch_failed_builds('http://cv.jenkins.couchbase.com',
                                         args.username, args.password)
    if args.mode == 'download':
        print(json.dumps(raw_builds))
        sys.exit(0)
    if args.mode =='parse':
        raw_builds = json.load(sys.stdin)
    logging.info("Number of builds downloaded: {}.".format(len(raw_builds)))
    failures = extract_failed_builds(raw_builds)
    logging.info("Number of builds with at least one failure: {}.".format(len(failures)))
    failures = filter_failed_builds(failures)
    logging.info("Number of unique failures: {}".format(len(failures)))

    # Most frequent failure signature first.
    sorted_failures = sorted(failures.items(), key=lambda i: len(i[1]),
                             reverse=True)

    # Count total number of failures (instances across all signatures, after
    # filtering) - used to report each signature's share below.
    total_failures = 0
    for (_, details) in sorted_failures:
        total_failures += len(details)

    # Print the report - the {code} / '+...+' markup appears to be Jira
    # formatting (NOTE(review): confirm intended destination).
    for (summary, details) in sorted_failures:
        num_failures = len(details)
        print('+Error signature+')
        print('{code}')
        print(summary)
        print('{code}\n')
        print('+Details+')
        print(
            '{} instances of this failure ({:.1f}% of sampled failures):'.format(
                num_failures,
                (num_failures * 100.0) / total_failures))
        # Only show the first 10 instances of each signature.
        for d in details[:10]:
            human_time = d['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
            print("* Time: {}, Jenkins job: {}, patch: {}".format(human_time,
                                                                  d['url'],
                                                                  d[
                                                                      'gerrit_patch']))
        if len(details) > 10:
            print(
                "<cut> - only showing details of first 10/{} instances.".format(
                    len(details)))
        print('\n===========\n')
0 commit comments