|
| 1 | +#!/usr/bin/python3 |
| 2 | +# |
| 3 | +# $ curl -s https://storage.googleapis.com/origin-ci-test/logs/release-openshift-origin-installer-e2e-aws-upgrade/1301652659597479936/artifacts/e2e-aws-upgrade/pods/openshift-cluster-version_cluster-version-operator-6b6bc46c59-dk8bm_cluster-version-operator.log | log-explainer.py |
| 4 | +# $ xdg-open index.html |
| 5 | + |
| 6 | +import datetime |
| 7 | +import logging |
| 8 | +import re |
| 9 | +import sys |
| 10 | + |
| 11 | + |
# klog format parser
# https://github.com/kubernetes/klog/blob/v2.3.0/klog.go#L571-L585
# Matches a single klog header line, capturing severity, month/day/time
# (klog omits the year), thread ID, source file:line, and the message.
# Compiled with re.VERBOSE, so the '#' notes inside the pattern are
# regex comments and the [ ] classes are the literal spaces.
log_regexp = re.compile('''^
    .* # must-gather leading timestamp
    (?P<level>[IWEF]) # I(nfo), W(arning), E(rror), F(atal)
    (?P<month>[0-1][0-9])(?P<day>[0-3][0-9])[ ](?P<time>[0-9][0-9]:[0-9][0-9]:[0-9][0-9][.][0-9]+) # timestamp
    [ ]+
    (?P<thread>[0-9]+) # thread ID as returned by GetTID()
    [ ]
    (?P<file>[^:]*):(?P<line>[0-9]+)]
    [ ]
    (?P<message>.*)$
''', re.VERBOSE)
| 25 | + |
| 26 | + |
| 27 | +levels = { |
| 28 | + 'I': 'Info', |
| 29 | + 'W': 'Warning', |
| 30 | + 'E': 'Error', |
| 31 | + 'F': 'Fatal', |
| 32 | +} |
| 33 | + |
| 34 | + |
# A sync cycle opens with a "Running sync ..." line from the sync worker.
# Raw strings here: the previous plain strings used \( which is an invalid
# escape sequence (DeprecationWarning; SyntaxWarning from Python 3.12).
start_of_sync_cycle_regexp = re.compile(r'^Running sync (?P<version>[^ ]+) \(force=(?P<force>[^)]+)\) on generation (?P<generation>[0-9]+) in state (?P<state>[^ ]+) at attempt (?P<attempt>[0-9]+)$')
# The worker summarizes a finished cycle with this message prefix.
end_of_sync_cycle_prefix = 'Result of work: '
# Per-manifest progress lines emitted during a sync cycle.
manifest_regexp = re.compile(r'^(?P<action>Running sync|Done syncing|error running apply) for (?P<type>[^ ]+) "(?P<name>[^"]+)" \((?P<index>[0-9]+) of (?P<total>[0-9]+)\)(?P<suffix>.*)$')
| 38 | + |
| 39 | + |
def read_klog(stream):
    """Parse klog-formatted text from stream into a list of line dicts.

    Each returned dict has 'level', 'time', 'thread', 'file', 'line', and
    'message' keys.  Text lines that do not match the klog header are
    treated as continuations and folded into the previous line's message.

    Bug fix: the function previously ignored its stream argument and
    always read sys.stdin; it now reads from the stream it is given.
    """
    lines = []
    line = None
    year = datetime.date.today().year  # klog headers carry no year; assume the log is from this calendar year
    for line_string in stream:
        match = log_regexp.match(line_string)
        if not match:
            # continuation of the previous message (e.g. multi-line errors)
            if line and line_string.strip():
                line['message'] += '\n' + line_string.strip()
            continue
        groups = match.groupdict()
        groups['year'] = year
        timestamp = datetime.datetime.strptime('{year} {month} {day} {time}'.format(**groups), '%Y %m %d %H:%M:%S.%f')
        line = {
            'level': levels[groups['level']],
            'time': timestamp,
            'thread': int(groups['thread']),
            'file': groups['file'],
            'line': int(groups['line']),
            'message': groups['message'],
        }
        lines.append(line)
    return lines
| 63 | + |
| 64 | + |
def parse_lines(lines):
    """Classify parsed klog lines into goroutine buckets and sync cycles.

    lines is the output of read_klog.  Returns a metadata dict with
    summary values (version, start/stop time, thread count), per-goroutine
    line lists, leader-election details, the list of sync cycles (each
    with its lines, per-manifest progress history, and failed manifests),
    and any remaining unclassified lines.
    """
    metadata = {
        'version': lines[0]['message'],  # the CVO logs its version string first
        'start_time': lines[0]['time'],
        'stop_time': lines[-1]['time'],
        'threads': len({line['thread'] for line in lines}),

        # goroutines
        'available updates sync': [line for line in lines if line['file'] == 'availableupdates.go' or 'syncing available updates' in line['message']],
        'cluster version sync': [line for line in lines if 'syncing cluster version' in line['message'] or 'Desired version from' in line['message']],
        'metrics': [line for line in lines if line['file'] == 'metrics.go'],
        'sync worker': [line for line in lines if line['file'] in ['sync_worker.go', 'task.go', 'task_graph.go', 'warnings.go']],
        'upgradeable': [line for line in lines if line['file'] == 'upgradeable.go' or 'syncing upgradeable' in line['message']],
    }

    metadata['leader_elections'] = [line for line in lines if line['file'] == 'leaderelection.go']
    acquired_leader_lease = [line for line in metadata['leader_elections'] if 'successfully acquired lease' in line['message']]
    if acquired_leader_lease:
        metadata['time_to_leader_lease'] = acquired_leader_lease[0]['time'] - metadata['start_time']

    sync = None
    metadata['syncs'] = []
    for line in metadata['sync worker']:
        match = start_of_sync_cycle_regexp.match(line['message'])
        if match:
            # a "Running sync ..." line opens a new sync cycle
            sync = match.groupdict()
            sync['start_time'] = line['time']
            sync['lines'] = []
            sync['manifests'] = {}
            metadata['syncs'].append(sync)
        if sync:
            sync['lines'].append(line)
        else:
            logging.warning('sync worker line outside of sync: {}'.format(line))
            # bug fix: with no current sync there is nothing to attach
            # results or manifest progress to; previously falling through
            # here dereferenced None and raised TypeError
            continue
        if line['message'].startswith(end_of_sync_cycle_prefix):
            sync['results'] = {
                'result': line['message'][len(end_of_sync_cycle_prefix):],
            }
            sync['results'].update(line)
        match = manifest_regexp.match(line['message'])
        if match:
            groups = match.groupdict()
            for key in ['index', 'total']:
                groups[key] = int(groups[key])
            groups.update(line)
            manifest_key = (groups['type'], groups['name'])
            if manifest_key not in sync['manifests']:
                sync['manifests'][manifest_key] = []
            sync['manifests'][manifest_key].append(groups)
    for sync in metadata['syncs']:
        # a manifest whose last recorded action is an apply error never completed
        sync['failed_manifests'] = {k: m for (k, m) in sync['manifests'].items() if m[-1]['action'] == 'error running apply'}

    metadata['unclassified'] = [
        line for line in lines
        if 'ClusterVersionOperator' not in line['message'] and
        line not in metadata['available updates sync'] and
        line not in metadata['cluster version sync'] and
        line not in metadata['metrics'] and
        line not in metadata['sync worker'] and
        line not in metadata['upgradeable'] and
        line not in metadata['leader_elections']
    ]
    return metadata
| 128 | + |
| 129 | + |
def write_sync_svg(path, metadata):
    """Write an SVG timeline of per-manifest sync progress to path.

    metadata is the dict produced by parse_lines.  Each manifest gets one
    horizontal bar from its first attempt to its first completion; bars
    for manifests that never finish run to the end of the log in red.
    """
    # key -> (start_seconds, stop_seconds_or_None, type, name, manifest-lines)
    time_ranges = {}
    for sync in metadata['syncs']:
        for key, manifest in sync['manifests'].items():
            start = (manifest[0]['time'] - metadata['start_time']).total_seconds()
            if 'Done syncing' in manifest[-1]['action']:
                stop = (manifest[-1]['time'] - metadata['start_time']).total_seconds()
            else:
                stop = None
            if key in time_ranges:
                previous = time_ranges[key]
                if previous[1]:
                    stop = previous[1]  # keep the earliest recorded completion
                time_ranges[key] = (previous[0], stop, previous[2], previous[3], manifest)
            else:
                time_ranges[key] = (start, stop, manifest[0]['type'], manifest[0]['name'], manifest)

    # Bug fix: sort on start time only.  sorted() over the raw tuples
    # compared the later elements on start-time ties, which raises
    # TypeError (None vs. float stop, or dict vs. dict manifest lines).
    ranges_by_start = sorted(time_ranges.values(), key=lambda time_range: time_range[0])

    for _, stop, objType, name, manifest in ranges_by_start:
        if not stop:
            logging.warning('not finished: {} {}{}'.format(objType, name, manifest[-1]['suffix']))

    rectangles = []
    y = 0
    step = 4  # bar height in SVG units
    last_stop = 0
    max_stop = (metadata['stop_time'] - metadata['start_time']).total_seconds()
    for start, stop, objType, name, manifest in ranges_by_start:
        if stop is None:
            stop = max_stop  # unfinished: draw the bar to the end of the log...
            fill = 'red'     # ...and flag it in red
        else:
            fill = 'blue'
        rectangles.append('<rect x="{}" y="{}" width="{}" height="{}" fill="{}"><title>{} {} {}/{} ({})</title></rect>'.format(start, y, stop - start, step, fill, objType, name, manifest[0]['index'], manifest[0]['total'], stop - start))
        y += step
        if stop > last_stop:
            last_stop = stop

    with open(path, 'w') as f:
        f.write('<svg viewBox="0 0 {} {}" xmlns="http://www.w3.org/2000/svg">\n'.format(last_stop, y))
        duration = datetime.timedelta(seconds=last_stop)
        time_scale = 5  # minutes between vertical gridlines
        for i in range(1 + int(last_stop // (time_scale*60))):
            f.write(' <line x1="{x}" x2="{x}" y1="0%" y2="100%" stroke="gray" style="stroke-opacity: 0.5;"><title>{minutes} minutes</title></line>\n'.format(
                x=i * time_scale * 60,
                minutes=i * time_scale))
        for rectangle in rectangles:
            f.write(' {}\n'.format(rectangle))
        f.write(' <text x="50%" y="20" text-anchor="middle">CVO manifests, duration {}</text>\n'.format(duration))
        f.write('</svg>\n')
| 179 | + |
| 180 | + |
def write_log_lines(stream, lines):
    """Render lines (read_klog-style dicts) as an HTML <pre> block on stream.

    Writes a placeholder paragraph instead when lines is empty or None.
    """
    # FIXME: write to separate files?
    if not lines:
        stream.write(' <p>No log lines.</p>\n')
        return
    rendered = ['{} {} {}'.format(entry['time'], entry['file'], entry['message']) for entry in lines]
    stream.write('<pre>\n{}\n</pre>\n'.format('\n'.join(rendered)))
| 191 | + |
| 192 | + |
def write_index(path, metadata, sync_svg):
    """Write the HTML report for the parsed CVO log to path.

    metadata is the dict produced by parse_lines.  sync_svg is the path
    (relative to the HTML file) of the SVG timeline produced by
    write_sync_svg; it is embedded at the bottom of the page.

    Fixes: user-facing typo "beings logging" -> "begins logging", and the
    misspelled local variable "gorountine_lines".
    """
    with open(path, 'w') as f:
        f.write('''<!doctype html>
<html lang="en">
<head>
  <title>Cluster-version operator log</title>
  <style type="text/css">
    .warning {
      font-weight: bold;
      color: red;
    }
    pre {
      font-size: 80%;
    }
  </style>
</head>
<body>
  <h1>Cluster-version operator log</h1>
''')
        f.write(' <dl>\n')
        for key in [
            'version',
            'start_time',
            'stop_time',
            'threads',
        ]:
            f.write(' <dt>{}</dt>\n <dd>{}</dd>\n'.format(key, metadata.get(key, '-')))
        f.write(' </dl>\n')

        f.write(' <h2>Leader election</h2>\n')
        if metadata.get('time_to_leader_lease'):
            warning = ''
            if metadata['time_to_leader_lease'].total_seconds() > 30:
                # slow lease acquisition is worth flagging
                warning = ' class="warning"'
            f.write(' <p{}>Leader lease acquired in {}.</p>'.format(warning, metadata['time_to_leader_lease']))
        else:
            f.write(' <p class="warning">Leader lease never acquired.</p>')
        write_log_lines(stream=f, lines=metadata.get('leader_elections'))

        f.write(' <h2>Peripheral goroutines</h2>\n')
        for key in [
            'available updates sync',
            'cluster version sync',
            'metrics',
            'upgradeable',
        ]:
            f.write(' <h3 id="{0}">{0}</h3>\n'.format(key))
            goroutine_lines = metadata.get(key)
            if goroutine_lines:
                time_to_start = goroutine_lines[0]['time'] - metadata['start_time']
                time_since_final_log = metadata['stop_time'] - goroutine_lines[-1]['time']
                warning = ''
                if (
                    time_to_start.total_seconds() > 2*60 or
                    (time_since_final_log.total_seconds() > 5*60 and key != 'metrics')  # metrics server does not log periodically
                ):
                    warning = ' class="warning"'
                f.write(' <p{}>Goroutine begins logging {} after the initial container log entry and is last seen {} before the final container log entry.</p>\n'.format(
                    warning, time_to_start, time_since_final_log,
                ))

        if metadata.get('unclassified'):
            f.write(' <h2>Unclassified log lines</h2>\n')
            write_log_lines(stream=f, lines=metadata['unclassified'])

        f.write(' <h2 id="sync-cycles">Sync cycles</h2>\n')
        version = None
        mode = None
        for sync in metadata['syncs']:
            # new heading whenever the target version or (state, force) mode changes
            if sync['version'] != version:
                f.write(' <h3>{}</h3>\n'.format(sync['version']))
                version = sync['version']
            if (sync['state'], sync['force']) != mode:
                f.write(' <h4>{} (force={})</h4>\n'.format(sync['state'], sync['force']))
                mode = (sync['state'], sync['force'])
            end = 'running'
            if 'results' in sync:
                end = sync['results']['time']
            result = 'running'
            if 'results' in sync:
                if sync['results']['result'] == '[]':
                    result = 'success'  # an empty error list means the cycle succeeded
                else:
                    result = 'failure'
            f.write(' <h5>Attempt {} {}: {} - {}</h5>\n'.format(sync['attempt'], result, sync['start_time'], end))
            f.write(' <dl>\n')
            for (key, value) in [
                ('lines', len(sync['lines'])),
                ('result', sync.get('results', {}).get('result', 'running')),
                ('manifests attempted', len(sync['manifests'])),
            ]:
                f.write(' <dt>{}</dt>\n <dd>{}</dd>\n'.format(key, value))
            f.write(' </dl>\n')
            if sync['failed_manifests']:
                f.write(' <p>Failed manifests:</p>\n')
                f.write(' <ul>\n')
                for lines in sorted(sync['failed_manifests'].values(), key=lambda lines: lines[-1]['index']):
                    f.write(' <li>{time} {message}</li>\n'.format(**lines[-1]))
                f.write(' </ul>\n')

        if metadata['sync worker']:
            time_since_final_log = metadata['stop_time'] - metadata['sync worker'][-1]['time']
            if time_since_final_log.total_seconds() > 5*60:
                f.write(' <p class="warning">Goroutine is last seen {} before the final container log entry.</p>\n'.format(time_since_final_log))

        f.write(' <div style="text-align:center">\n <object data="{}" type="image/svg+xml" width="100%" />\n </div>\n'.format(sync_svg))

        f.write('</body>\n</html>\n')
| 301 | + |
| 302 | + |
if __name__ == '__main__':
    # Read a CVO klog stream from stdin, then emit the SVG timeline and
    # the HTML report that embeds it.
    parsed = read_klog(stream=sys.stdin)
    log_metadata = parse_lines(lines=parsed)
    svg_path = 'sync.svg'
    write_sync_svg(path=svg_path, metadata=log_metadata)
    write_index(path='index.html', metadata=log_metadata, sync_svg=svg_path)
0 commit comments