Skip to content

Commit 8410557

Browse files
committed
hack/log-explainer: Expand SVG waterfall to explain more of the log
Making it easier to dig into the individual sync cycles, find goroutines that have hung, etc.
1 parent 087b506 commit 8410557

File tree

2 files changed

+307
-72
lines changed

2 files changed

+307
-72
lines changed

hack/log-explainer.py

Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
#!/usr/bin/python3
2+
#
3+
# $ curl -s https://storage.googleapis.com/origin-ci-test/logs/release-openshift-origin-installer-e2e-aws-upgrade/1301652659597479936/artifacts/e2e-aws-upgrade/pods/openshift-cluster-version_cluster-version-operator-6b6bc46c59-dk8bm_cluster-version-operator.log | log-explainer.py
4+
# $ xdg-open index.html
5+
6+
import datetime
7+
import logging
8+
import re
9+
import sys
10+
11+
12+
# klog format parser
13+
# https://github.com/kubernetes/klog/blob/v2.3.0/klog.go#L571-L585
14+
log_regexp = re.compile('''^
15+
.* # must-gather leading timestamp
16+
(?P<level>[IWEF]) # I(nfo), W(arning), E(rror), F(atal)
17+
(?P<month>[0-1][0-9])(?P<day>[0-3][0-9])[ ](?P<time>[0-9][0-9]:[0-9][0-9]:[0-9][0-9][.][0-9]+) # timestamp
18+
[ ]+
19+
(?P<thread>[0-9]+) # thread ID as returned by GetTID()
20+
[ ]
21+
(?P<file>[^:]*):(?P<line>[0-9]+)]
22+
[ ]
23+
(?P<message>.*)$
24+
''', re.VERBOSE)
25+
26+
27+
levels = {
28+
'I': 'Info',
29+
'W': 'Warning',
30+
'E': 'Error',
31+
'F': 'Fatal',
32+
}
33+
34+
35+
start_of_sync_cycle_regexp = re.compile('^Running sync (?P<version>[^ ]+) \(force=(?P<force>[^)]+)\) on generation (?P<generation>[0-9]+) in state (?P<state>[^ ]+) at attempt (?P<attempt>[0-9]+)$')
36+
end_of_sync_cycle_prefix = 'Result of work: '
37+
manifest_regexp = re.compile('^(?P<action>Running sync|Done syncing|error running apply) for (?P<type>[^ ]+) "(?P<name>[^"]+)" \((?P<index>[0-9]+) of (?P<total>[0-9]+)\)(?P<suffix>.*)$')
38+
39+
40+
def read_klog(stream):
41+
lines = []
42+
line = None
43+
year = datetime.date.today().year # assume the log is from this calendar year
44+
for line_string in sys.stdin.readlines():
45+
match = log_regexp.match(line_string)
46+
if not match:
47+
if line and line_string.strip():
48+
line['message'] += '\n' + line_string.strip()
49+
continue
50+
groups = match.groupdict()
51+
groups['year'] = year
52+
timestamp = datetime.datetime.strptime('{year} {month} {day} {time}'.format(**groups), '%Y %m %d %H:%M:%S.%f')
53+
line = {
54+
'level': levels[groups['level']],
55+
'time': timestamp,
56+
'thread': int(groups['thread']),
57+
'file': groups['file'],
58+
'line': int(groups['line']),
59+
'message': groups['message'],
60+
}
61+
lines.append(line)
62+
return lines
63+
64+
65+
def parse_lines(lines):
66+
metadata = {
67+
'version': lines[0]['message'],
68+
'start_time': lines[0]['time'],
69+
'stop_time': lines[-1]['time'],
70+
'threads': len({line['thread'] for line in lines}),
71+
72+
# goroutines
73+
'available updates sync': [line for line in lines if line['file'] == 'availableupdates.go' or 'syncing available updates' in line['message']],
74+
'cluster version sync': [line for line in lines if 'syncing cluster version' in line['message'] or 'Desired version from' in line['message']],
75+
'metrics': [line for line in lines if line['file'] == 'metrics.go'],
76+
'sync worker': [line for line in lines if line['file'] in ['sync_worker.go', 'task.go', 'task_graph.go', 'warnings.go']],
77+
'upgradeable': [line for line in lines if line['file'] == 'upgradeable.go' or 'syncing upgradeable' in line['message']],
78+
}
79+
80+
metadata['leader_elections'] = [line for line in lines if line['file'] == 'leaderelection.go']
81+
acquired_leader_lease = [line for line in metadata['leader_elections'] if 'successfully acquired lease' in line['message']]
82+
if acquired_leader_lease:
83+
metadata['time_to_leader_lease'] = acquired_leader_lease[0]['time'] - metadata['start_time']
84+
85+
sync = None
86+
metadata['syncs'] = []
87+
for line in metadata['sync worker']:
88+
match = start_of_sync_cycle_regexp.match(line['message'])
89+
if match:
90+
sync = match.groupdict()
91+
sync['start_time'] = line['time']
92+
sync['lines'] = []
93+
sync['manifests'] = {}
94+
metadata['syncs'].append(sync)
95+
if sync:
96+
sync['lines'].append(line)
97+
else:
98+
logging.warning('sync worker line outside of sync: {}'.format(line))
99+
if line['message'].startswith(end_of_sync_cycle_prefix):
100+
sync['results'] = {
101+
'result': line['message'][len(end_of_sync_cycle_prefix):],
102+
}
103+
sync['results'].update(line)
104+
match = manifest_regexp.match(line['message'])
105+
if match:
106+
groups = match.groupdict()
107+
for key in ['index', 'total']:
108+
groups[key] = int(groups[key])
109+
groups.update(line)
110+
manifest_key = (groups['type'], groups['name'])
111+
if manifest_key not in sync['manifests']:
112+
sync['manifests'][manifest_key] = []
113+
sync['manifests'][manifest_key].append(groups)
114+
for sync in metadata['syncs']:
115+
sync['failed_manifests'] = {k: m for (k, m) in sync['manifests'].items() if m[-1]['action'] == 'error running apply'}
116+
117+
metadata['unclassified'] = [
118+
line for line in lines
119+
if 'ClusterVersionOperator' not in line['message'] and
120+
line not in metadata['available updates sync'] and
121+
line not in metadata['cluster version sync'] and
122+
line not in metadata['metrics'] and
123+
line not in metadata['sync worker'] and
124+
line not in metadata['upgradeable'] and
125+
line not in metadata['leader_elections']
126+
]
127+
return metadata
128+
129+
130+
def write_sync_svg(path, metadata):
131+
time_ranges = {}
132+
for sync in metadata['syncs']:
133+
for key, manifest in sync['manifests'].items():
134+
start = (manifest[0]['time'] - metadata['start_time']).total_seconds()
135+
if 'Done syncing' in manifest[-1]['action']:
136+
stop = (manifest[-1]['time'] - metadata['start_time']).total_seconds()
137+
else:
138+
stop = None
139+
if key in time_ranges:
140+
previous = time_ranges[key]
141+
if previous[1]:
142+
stop = previous[1]
143+
time_ranges[key] = (previous[0], stop, previous[2], previous[3], manifest)
144+
else:
145+
time_ranges[key] = (start, stop, manifest[0]['type'], manifest[0]['name'], manifest)
146+
147+
for _, stop, objType, name, manifest in sorted(time_ranges.values()):
148+
if not stop:
149+
logging.warning('not finished: {} {}{}'.format(objType, name, manifest[-1]['suffix']))
150+
151+
rectangles = []
152+
y = 0
153+
step = 4
154+
last_stop = 0
155+
max_stop = (metadata['stop_time'] - metadata['start_time']).total_seconds()
156+
for start, stop, objType, name, manifest in sorted(time_ranges.values()):
157+
if stop is None:
158+
stop = max_stop
159+
fill = 'red'
160+
else:
161+
fill = 'blue'
162+
rectangles.append('<rect x="{}" y="{}" width="{}" height="{}" fill="{}"><title>{} {} {}/{} ({})</title></rect>'.format(start, y, stop - start, step, fill, objType, name, manifest[0]['index'], manifest[0]['total'], stop - start))
163+
y += step
164+
if stop > last_stop:
165+
last_stop = stop
166+
167+
with open(path, 'w') as f:
168+
f.write('<svg viewBox="0 0 {} {}" xmlns="http://www.w3.org/2000/svg">\n'.format(last_stop, y))
169+
duration = datetime.timedelta(seconds=last_stop)
170+
time_scale = 5
171+
for i in range(1 + int(last_stop // (time_scale*60))):
172+
f.write(' <line x1="{x}" x2="{x}" y1="0%" y2="100%" stroke="gray" style="stroke-opacity: 0.5;"><title>{minutes} minutes</title></line>\n'.format(
173+
x=i * time_scale * 60,
174+
minutes=i * time_scale))
175+
for rectangle in rectangles:
176+
f.write(' {}\n'.format(rectangle))
177+
f.write(' <text x="50%" y="20" text-anchor="middle">CVO manifests, duration {}</text>\n'.format(duration))
178+
f.write('</svg>\n')
179+
180+
181+
def write_log_lines(stream, lines):
182+
# FIXME: write to separate files?
183+
if lines:
184+
stream.write('<pre>\n{}\n</pre>\n'.format(
185+
'\n'.join(
186+
'{} {} {}'.format(line['time'], line['file'], line['message']) for line in lines
187+
)
188+
))
189+
else:
190+
stream.write(' <p>No log lines.</p>\n')
191+
192+
193+
def write_index(path, metadata, sync_svg):
194+
with open(path, 'w') as f:
195+
f.write('''<!doctype html>
196+
<html lang="en">
197+
<head>
198+
<title>Cluster-version operator log</title>
199+
<style type="text/css">
200+
.warning {
201+
font-weight: bold;
202+
color: red;
203+
}
204+
pre {
205+
font-size: 80%;
206+
}
207+
</style>
208+
</head>
209+
<body>
210+
<h1>Cluster-version operator log</h1>
211+
''')
212+
f.write(' <dl>\n')
213+
for key in [
214+
'version',
215+
'start_time',
216+
'stop_time',
217+
'threads',
218+
]:
219+
f.write(' <dt>{}</dt>\n <dd>{}</dd>\n'.format(key, metadata.get(key, '-')))
220+
f.write(' </dl>\n')
221+
222+
f.write(' <h2>Leader election</h2>\n')
223+
if metadata.get('time_to_leader_lease'):
224+
warning = ''
225+
if metadata['time_to_leader_lease'].total_seconds() > 30:
226+
warning = ' class="warning"'
227+
f.write(' <p{}>Leader lease acquired in {}.</p>'.format(warning, metadata['time_to_leader_lease']))
228+
else:
229+
f.write(' <p class="warning">Leader lease never acquired.</p>')
230+
write_log_lines(stream=f, lines=metadata.get('leader_elections'))
231+
232+
f.write(' <h2>Peripheral goroutines</h2>\n')
233+
for key in [
234+
'available updates sync',
235+
'cluster version sync',
236+
'metrics',
237+
'upgradeable',
238+
]:
239+
f.write(' <h3 id="{0}">{0}</h3>\n'.format(key))
240+
gorountine_lines = metadata.get(key)
241+
if gorountine_lines:
242+
time_to_start = gorountine_lines[0]['time'] - metadata['start_time']
243+
time_since_final_log = metadata['stop_time'] - gorountine_lines[-1]['time']
244+
warning = ''
245+
if (
246+
time_to_start.total_seconds() > 2*60 or
247+
(time_since_final_log.total_seconds() > 5*60 and key != 'metrics') # metrics server does not log periodically
248+
):
249+
warning = ' class="warning"'
250+
f.write(' <p{}>Goroutine beings logging {} after the initial container log entry and is last seen {} before the final container log entry.</p>\n'.format(
251+
warning, time_to_start, time_since_final_log,
252+
))
253+
254+
if metadata.get('unclassified'):
255+
f.write(' <h2>Unclassified log lines</h2>\n')
256+
write_log_lines(stream=f, lines=metadata['unclassified'])
257+
258+
f.write(' <h2 id="sync-cycles">Sync cycles</h2>\n')
259+
version = None
260+
mode = None
261+
for sync in metadata['syncs']:
262+
if sync['version'] != version:
263+
f.write(' <h3>{}</h3>\n'.format(sync['version']))
264+
version = sync['version']
265+
if (sync['state'], sync['force']) != mode:
266+
f.write(' <h4>{} (force={})</h4>\n'.format(sync['state'], sync['force']))
267+
mode = (sync['state'], sync['force'])
268+
end = 'running'
269+
if 'results' in sync:
270+
end = sync['results']['time']
271+
result = 'running'
272+
if 'results' in sync:
273+
if sync['results']['result'] == '[]':
274+
result = 'success'
275+
else:
276+
result = 'failure'
277+
f.write(' <h5>Attempt {} {}: {} - {}</h5>\n'.format(sync['attempt'], result, sync['start_time'], end))
278+
f.write(' <dl>\n')
279+
for (key, value) in [
280+
('lines', len(sync['lines'])),
281+
('result', sync.get('results', {}).get('result', 'running')),
282+
('manifests attempted', len(sync['manifests'])),
283+
]:
284+
f.write(' <dt>{}</dt>\n <dd>{}</dd>\n'.format(key, value))
285+
f.write(' </dl>\n')
286+
if sync['failed_manifests']:
287+
f.write(' <p>Failed manifests:</p>\n')
288+
f.write(' <ul>\n')
289+
for lines in sorted(sync['failed_manifests'].values(), key=lambda lines: lines[-1]['index']):
290+
f.write(' <li>{time} {message}</li>\n'.format(**lines[-1]))
291+
f.write(' </ul>\n')
292+
293+
if metadata['sync worker']:
294+
time_since_final_log = metadata['stop_time'] - metadata['sync worker'][-1]['time']
295+
if time_since_final_log.total_seconds() > 5*60:
296+
f.write(' <p class="warning">Goroutine is last seen {} before the final container log entry.</p>\n'.format(time_since_final_log))
297+
298+
f.write(' <div style="text-align:center">\n <object data="{}" type="image/svg+xml" width="100%" />\n </div>\n'.format(sync_svg))
299+
300+
f.write('</body>\n</html>\n')
301+
302+
303+
if __name__ == '__main__':
304+
lines = read_klog(stream=sys.stdin)
305+
metadata = parse_lines(lines=lines)
306+
write_sync_svg(path='sync.svg', metadata=metadata)
307+
write_index(path='index.html', metadata=metadata, sync_svg='sync.svg')

waterfall/cvo-waterfall.py

Lines changed: 0 additions & 72 deletions
This file was deleted.

0 commit comments

Comments
 (0)