-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathrelease-validation.nix
More file actions
441 lines (365 loc) · 16 KB
/
release-validation.nix
File metadata and controls
441 lines (365 loc) · 16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
# This is meant to be the final _automated_ validation test before
# pushing the release out for manual testing/QA.
#
# It tests the double-self-update scenario, updating PlayOS from a BASE system
# version to a PRE version and finally to the NEXT version.
#
# By default the system image in PRE and NEXT is the same and is the current
# PlayOS image. This setup tests whether the current system can self-update.
#
# The tested steps:
# 1. 'latest' is set to PRE in update server
# 2. BASE system downloads+installs PRE
# 3. VM reboots into PRE
# 4. PRE system is marked Good. First update (BASE->PRE) successful.
# 5. 'latest' is set to NEXT in update server
# 6. PRE downloads+installs NEXT
# 7. VM reboots into NEXT
# 8. NEXT system is marked Good. Second update (PRE->NEXT) successful.
#
# In theory, the PRE and NEXT bundles can be replaced with a different system
# image as long as they have the same build configuration (updateUrl and kioskUrl)
# and a passwordless root. This allows testing path-dependent update scenarios.
#
# The test is "untainted" because it does not alter the configuration of the
# base or next systems in any way (e.g. no test-instrumentation.nix extras).
# Instead it sets up a simulated environment (DHCP, DNS, update server, etc.)
# and runs the base system in it, interacting via "physical" inputs (mouse,
# keyboard using QEMU's QMP) and observing the results via screenshots+OCR.
#
# The test can be run non-interactively, but for debugging you will definitely
# need visible output since there are no logs. This can be done using:
#
# nix-build -A driverInteractive testing/release-validation.nix
# ./result/bin/nixos-test-driver
# >> run_tests()
#
# The system images have a passwordless root account, so you can gain root
# access from the QEMU GUI:
# - switch to QEMU monitor console (using ctrl-alt-2 or the menu)
# - execute "sendkey ctrl-alt-f8" (switch to status screen on TTY8)
# - execute "sendkey ctrl-alt-f1" (switch to TTY1)
# - login with "root"
let
# Common URL prefix under which the released disk images are fetched.
#
# Note: we use HTTP instead of HTTPS, because pkgs.fetchurl fails
# when __structuredAttrs is enabled (due to mysterious OpenSSL/TLS errors)
# and also fails when __structuredAttrs is disabled (due to
# https://github.com/NixOS/nixpkgs/issues/177660).
# HTTP usage is fine since the output hash is fixed and verifies the download.
baseS3URL = "http://dividat-playos-test-disks.s3.amazonaws.com/by-tag";
# Release disk images keyed by PlayOS version. Every value is an attribute set
# with `url` and `hash`, i.e. the arguments handed to pkgs.fetchurl.
# Generated via ./build release-disk and .github/workflows/release-tag.yml
# See https://github.com/dividat/playos/releases
diskImageURLs =
  let
    # All images share one naming scheme below baseS3URL, so only the
    # version (the attribute name) and the hash differ per entry.
    releaseDisk = version: hash: {
      url = "${baseS3URL}/playos-release-disk-${version}.img.zst";
      inherit hash;
    };
  in
  builtins.mapAttrs releaseDisk {
    "2023.9.1-DISK" = "sha256-Az5eYYZFUweSzMSEBKIB6Q3mGtG0SLJ51LxWeeJqpfw=";
    # fails double-update, needs backport+rebuild
    "2024.7.0-DISK" = "sha256-vJDB99ICt0W1PmONikNY5wwIF7oQU388DzYRgPqkooY=";
    "2025.3.1" = "sha256-ySLOMGsDfeGU4r8xUOwW9M/VMKC8GpzhaKXVQwu5fxM=";
    # fails double-update, needs backport+rebuild
    "2025.3.2" = "sha256-txgvrLtO2qq8JZlU/ijONnVLAMLK/6QyRutwej5UEWY=";
    "2025.3.3" = "sha256-u71dsbtnzXrERQ20H1CmCj9K9S1t2aOG0elzcrLIsYY=";
    "2026.1.0" = "sha256-M+fZJtoHONlPBIaWV0vjJCdvtwDH+a6TyoiV243/wfo=";
  };
# Builds the unsigned RAUC bundle of the current source tree (../default.nix),
# reporting `version` as its version and using plain-HTTP update/kiosk URLs
# on the given domains.
mkNextSystemBundle = { pkgs, version, updateUrlDomain, kioskUrlDomain }:
  let
    playosBuild = pkgs.callPackage ../default.nix {
      versionOverride = version;
      updateUrl = "http://${updateUrlDomain}/";
      kioskUrl = "http://${kioskUrlDomain}/";
    };
  in
  playosBuild.releaseValidation.components.unsignedRaucBundle;
in
# Test parameters. Every argument has a default, so the file can be built
# without passing anything.
{
# Package set; provides the test runner, fetchurl and zstd used below
pkgs ? import ../pkgs { },
# Application definition; only safeProductName is read from it here
application ? import ../application.nix,
# Product name handed to the stub update server in the test script
safeProductName ? application.safeProductName,
# Note: the system images must all be built with the same update and kiosk URLs which:
# 1) have proper domain names (i.e. not localhost or plain IPs)
# 2) do not use HTTPS
updateUrlDomain ? "update-server.local",
kioskUrlDomain ? "kiosk-server.local",
# PlayOS system we are updating from
# (must be a key of diskImageURLs unless baseSystemDiskImage is overridden)
baseSystemVersion ? "2026.1.0",
# A zstd-compressed PlayOS disk image
# NOTE(review): the overrideAttrs presumably exists to skip reference
# scanning of the large image -- confirm.
baseSystemDiskImage ? (pkgs.fetchurl diskImageURLs.${baseSystemVersion})
.overrideAttrs {
__structuredAttrs = true;
unsafeDiscardReferences.out = true;
},
# PlayOS versions we are updating into.
#
# There will be two updates: BASE -> PRE and PRE -> NEXT where PRE and NEXT
# are by default the same (i.e. the current PlayOS system image).
#
# Note: these versions cannot be substrings of each other, since
# we rely on (visually) detecting the values on screen.
preSystemVersion ? "6666.66.66",
nextSystemVersion ? "9999.99.99",
# PlayOS bundles to be updated to
# (by default both are built from the current tree, differing only in version)
preSystemBundlePath ? mkNextSystemBundle
{ version = preSystemVersion; inherit pkgs updateUrlDomain kioskUrlDomain; },
nextSystemBundlePath ? mkNextSystemBundle
{ version = nextSystemVersion; inherit pkgs updateUrlDomain kioskUrlDomain; }
}:
let
# Location of the extracted (writable) base disk image. The test script
# extracts the compressed image to this path and the playos node passes the
# same path to end-to-end/virtualisation-config.nix.
overlayPath = "/tmp/release-validation-disk.img";
in
with pkgs.lib;
pkgs.testers.runNixOSTest {
name = "Older releases of PlayOS self-update to newer";
nodes = {
# runs a DNS server and a mock HTTP update/bundle server
# (dnsmasq additionally hands out DHCP leases, naming itself as gateway and DNS)
sidekick = { config, nodes, lib, pkgs, ... }:
  let
    # Address of this node on the test VLAN.
    ownAddress = config.networking.primaryIPAddress;
    # Directory served by the static web server.
    webRoot = config.services.static-web-server.root;
  in
  {
    config = {
      networking = {
        dhcpcd.enable = false;
        primaryIPAddress = "192.168.1.${toString config.virtualisation.test.nodeNumber}";
        # will not work, because playos receives IP settings via DHCP
        extraHosts = lib.mkOverride 0 "";
        firewall.enable = false;
      };

      virtualisation = {
        # disable QEMU `-net user` interface to have less moving parts
        qemu.networkingOptions = lib.mkOverride 0 [ ];
        vlans = [ 1 ];
      };

      # make sure the web root exists and is writable
      systemd.tmpfiles.rules = [
        "d ${webRoot} 0777 root root -"
      ];

      services = {
        static-web-server = {
          enable = true;
          listen = "[::]:80";
          root = "/tmp/www";
        };

        dnsmasq = {
          enable = true;
          settings = {
            dhcp-option = [
              "3,${ownAddress}" # self as gateway
              "6,${ownAddress}" # self as DNS
            ];
            dhcp-range = "192.168.1.30,192.168.1.99,1h";
            # both the update and the kiosk domain resolve to this node
            address = [
              "/${updateUrlDomain}/${ownAddress}"
              "/${kioskUrlDomain}/${ownAddress}"
            ];
          };
        };
      };
    };
  };
# Note: playos is started from pre-built disk _without_ any test
# instrumentation, there's no test-driver "backdoor", no shared files, etc.
# Therefore the only way to interact is via QMP.
playos = { config, lib, pkgs, ... }:
  {
    imports = [
      (import ./end-to-end/virtualisation-config.nix { inherit overlayPath; })
    ];

    config.virtualisation = {
      vlans = [ 1 ];
      memorySize = lib.mkForce 4096;
      sharedDirectories = lib.mkOverride 0 { };

      qemu = {
        # disable QEMU VNET
        networkingOptions = lib.mkOverride 0 [ ];
        options = [
          # needed for mouse_move to work
          "-device" "usb-mouse,bus=usb-bus.0"
        ];
      };
    };
  };
};
# Applied only to the interactive driver (see `driverInteractive` in the
# header): adds a QEMU monitor console to the playos VM window.
interactive.nodes.playos.virtualisation.qemu.options = [
# extra QEMU monitor GUI access for debugging when running interactively
"-monitor" "vc"
];
# Python packages made available to the test script (runtime libraries plus
# their type stubs), looked up by name in the given package set.
extraPythonPackages = ps:
  map (name: ps.${name}) [
    "colorama"
    "types-colorama"
    "requests"
    "types-requests"
    "tesserocr"
    "pillow"
    "types-pillow"
  ];
testScript = {nodes}: ''
${builtins.readFile ./helpers/nixos-test-script-helpers.py}
${builtins.readFile ./end-to-end/tests/base/proxy-and-update-helpers.py}
import tesserocr # type: ignore
import PIL.Image
import PIL.ImageEnhance
import PIL.ImageOps
import os
### Constants
# The quoted placeholders below are filled in by Nix string interpolation
# when the test script is generated.
# product name passed to the stub UpdateServer
product_name = "${safeProductName}"
# version strings expected on screen after the first / second update
pre_version = "${preSystemVersion}"
next_version = "${nextSystemVersion}"
# directory served by the static web server on sidekick
http_root = "${nodes.sidekick.services.static-web-server.root}"
# base URL for HTTP requests issued on sidekick itself
http_local_url = "http://127.0.0.1"
### Test helpers
def extract_base_system_disk(compressed_disk, target_path):
    """Decompress the zstd-compressed disk image to target_path.

    Note #1: the disk is extracted in the test rather than in a derivation
    to avoid bloating the nix store with a 10GB+ file.
    Note #2: no COW overlay is needed, since the temp disk image can be
    written to directly.

    The extracted file is made readable and writable for everyone and is
    scheduled for removal when the interpreter exits.
    """
    eprint("Extracting compressed disk image, this will take a while...")

    # Drop a possibly left-over image first; the exit status is not checked.
    remove_stale_cmd = ["rm", "-f", target_path]
    subprocess.run(remove_stale_cmd)

    # A failing decompression raises CalledProcessError (check=True).
    unzstd_cmd = ['${pkgs.zstd}/bin/unzstd', compressed_disk, '-o', target_path]
    subprocess.run(unzstd_cmd, check=True)

    os.chmod(target_path, 0o666)
    atexit.register(os.remove, target_path)
def screenshot_and_ocr(vm):
    """Take a screenshot of vm and return the text recognised in it.

    Faster OCR than NixOS `get_screen_text`, which takes almost 20 seconds
    per call. Fails to identify white text on dark backgrounds.
    """
    with tempfile.TemporaryDirectory() as workdir:
        shot_path = os.path.join(workdir, "screenshot.png")
        vm.screenshot(shot_path)

        # Pre-process for OCR: grayscale, then boost brightness and contrast.
        grey = PIL.ImageOps.grayscale(PIL.Image.open(shot_path))
        brightened = PIL.ImageEnhance.Brightness(grey).enhance(1.5)
        contrasted = PIL.ImageEnhance.Contrast(brightened).enhance(4.0)

        return tesserocr.image_to_text(contrasted)
def navigate_to_system_status():
    """Navigate to system status page using keyboard only.

    Hack: depends on current UI layout. Could be made more sophisticated
    by using tesseract to detect the bounding box and then mouse_move'ing
    there for a click.
    """
    # four tabs to reach the link, then return to activate it
    key_presses = ["tab"] * 4 + ["ret"]
    for key in key_presses:
        playos.send_key(key, delay=0.2)
    # give the page time to render
    time.sleep(2)
def check_for_text_in_status_page(text, ignore_errors=False):
    """Reload the controller GUI, open the status page and look for text.

    The page is reloaded with ctrl-r, the system status page is opened via
    the keyboard and the screen is OCR-ed. The recognised text is printed.

    Args:
        text: substring expected in the OCR-ed screen text.
        ignore_errors: when False (the default), an update error state on
            screen short-circuits the check, see Returns.

    Returns:
        The OCR-ed screen text if ignore_errors is False and one of the
        known error states is visible; callers treat a non-None result as
        a failed update. Otherwise None, after the assertion has passed.

    Raises:
        Whatever `t.assertIn` raises when text is not on screen.
    """
    playos.send_key("ctrl-r")
    time.sleep(2)
    navigate_to_system_status()
    screen_text = screenshot_and_ocr(playos)
    print(f"Current screen text: {screen_text}")

    # return early if there is an error
    if not ignore_errors:
        possible_errors = ("ErrorDownloading", "ErrorInstalling", "UpdateError")
        if any(err in screen_text for err in possible_errors):
            return screen_text

    # NOTE(review): `t` is neither a parameter nor a local, so it is looked up
    # as a module-level name, i.e. whatever the most recent top-level
    # `with ... as t` bound. A `t` bound inside a calling function is not
    # seen here. Consider passing the test case in explicitly.
    t.assertIn(text, screen_text)
    return None
def reboot_via_tty():
    """Reboot the playos VM by typing commands into a root shell on tty1.

    Note: done via root shell on tty1, since a QEMU system_reset corrupts
    the /boot/status.ini due to unclean unmount + FAT.
    """
    # direct switch to tty1 prevented by limit-vtes.nix, so go via tty8
    for vt_switch in ("ctrl-alt-f8", "ctrl-alt-f1"):
        playos.send_key(vt_switch, delay=2)

    # log in as root (no password is typed) and wait for the shell
    playos.send_chars("root\n")
    time.sleep(2)
    playos.send_chars("systemctl reboot\n")
### === Start VMs
# Extract the base image to the overlay path before starting playos.
extract_base_system_disk("${baseSystemDiskImage}", "${overlayPath}")
# allow_reboot=True: the test reboots the guest from inside (see reboot_via_tty)
playos.start(allow_reboot=True)
sidekick.start()
### === Stub Update server setup
with TestPrecondition("Stub update server is started"):
update_server = UpdateServer(sidekick, product_name, http_root)
update_server.wait_for_unit()
# The kiosk domain resolves to sidekick, so this page is what the kiosk
# shows; later checks look for "Hello world" on screen.
sidekick.succeed(f"echo 'Hello world!' > {http_root}/index.html")
# sanity check: the web server answers locally
sidekick.succeed(f"curl -f {http_local_url}")
with TestPrecondition("Stub update server is functional") as t:
# Both bundles are published up front; only the advertised latest version
# changes during the test.
update_server.add_bundle(pre_version, filepath="${preSystemBundlePath}")
update_server.add_bundle(next_version, filepath="${nextSystemBundlePath}")
update_server.set_latest_version(pre_version)
# the served latest version must be exactly the PRE version
out_v = sidekick.succeed(f"curl -f {http_local_url}/latest")
t.assertEqual(out_v, pre_version)
def move_mouse_to_corner():
    """Move the mouse pointer out of the way before OCR.

    The pointer is moved to the bottom right corner so it doesn't
    accidentally cover any text while OCR'ing.
    """
    far_corner_cmd = "mouse_move 2000 2000"
    playos.send_monitor_command(far_corner_cmd)
### === Validate that PlayOS VM and baseSystem is OK
with TestPrecondition("dnsmasq hands out an IP to playos"):
# Expected DHCP handshake, as logged by dnsmasq on sidekick.
# NOTE(review): assuming these are regexes, the dots are unescaped and
# "192.168.1.3" matches any lease whose address starts with it -- confirm
# this is the intent.
dhcp_seq = [
"DHCPOFFER.*192.168.1.3",
"DHCPREQUEST.*192.168.1.3",
"DHCPACK.*192.168.1.3.*playos",
]
for msg in dhcp_seq:
wait_for_logs(sidekick, msg, unit="dnsmasq.service", timeout=60)
# read the leased address from the dnsmasq lease file (third column)
playos_ip = sidekick.succeed("cat /var/lib/dnsmasq/dnsmasq.leases | grep playos | awk '{print $3}'").strip()
sidekick.succeed(f"ping -c1 {playos_ip}", timeout=3)
with TestPrecondition("kiosk is open with kiosk URL") as t:
# "Hello world" is the index page written to the stub web server
wait_until_passes(
lambda: t.assertIn("Hello world", screenshot_and_ocr(playos)),
retries=60, # can take quite long on CI
sleep=2
)
move_mouse_to_corner()
with TestPrecondition("controller GUI is visible") as t:
# switch to controller
playos.send_key("ctrl-shift-f12")
# Retried until "Information" shows up; returns the OCR text of the
# screenshot in which it was found.
def t_check():
screen_text = screenshot_and_ocr(playos)
t.assertIn("Information", screen_text)
time.sleep(2) # ensure page fully loaded
return screen_text
screen_text = wait_until_passes(t_check, retries=10)
# the GUI must report the base version we started from
t.assertIn("Version", screen_text)
t.assertIn("${baseSystemVersion}", screen_text)
with TestPrecondition("Navigate to System Status page") as t:
navigate_to_system_status()
screen_text = screenshot_and_ocr(playos)
t.assertIn("Update State", screen_text,
"Update State not visible in screen, navigation failed?")
### Helpers re-used in both BASE->PRE and PRE->NEXT
def check_update_is_downloaded_and_installed(stage):
    """Check that the controller downloads and then installs an update.

    Two test cases are run, both labelled with stage: first the status
    page has to show "Downloading", then it has to show "RebootRequired".
    The second case fails if an update error state is seen instead.
    """
    with TestCase(f"{stage}: controller starts downloading the bundle") as t:
        def t_check():
            # reload the GUI and re-open the status page before each look
            playos.send_key("ctrl-r")
            time.sleep(2)
            navigate_to_system_status()
            t.assertIn("Downloading", screenshot_and_ocr(playos))

        wait_until_passes(t_check, retries=30, sleep=1)

    with TestCase(f"{stage}: controller has downloaded and installed the bundle") as t:
        # controller takes at least 2 minutes for the download
        # (1.2GB @ 10 MB/s), so allow up to 5 minutes for the download+install
        error_screen_text = wait_until_passes(
            lambda: check_for_text_in_status_page("RebootRequired"),
            retries=30,
            sleep=10,
        )

        # a non-None result is the screen text captured on an update error
        if error_screen_text is not None:
            t.fail(f"Update process failed with an error, last screen text: {error_screen_text}")
# Checks, after a reboot, that the kiosk page is shown again, that the
# controller GUI reports new_version and that the status page reaches "Good".
# `stage` is only used to label the test cases.
# NOTE(review): indentation is not visible in this view, so whether
# move_mouse_to_corner() runs inside the first TestCase cannot be told here.
def check_system_boots_into_new_version(new_version, stage):
with TestCase(f"{stage}: kiosk is open with kiosk URL after reboot") as t:
wait_until_passes(
lambda: t.assertIn("Hello world", screenshot_and_ocr(playos)),
retries=60,
sleep=2
)
move_mouse_to_corner()
with TestCase(f"{stage}: controller GUI with new version is visible") as t:
# switch to controller
playos.send_key("ctrl-shift-f12")
wait_until_passes(
lambda: t.assertIn(new_version, screenshot_and_ocr(playos)),
retries=10
)
with TestCase(f"{stage}: The new booted version reaches a Good state") as t:
# NOTE(review): check_for_text_in_status_page asserts through the
# module-level `t`, not through the `t` bound by this TestCase.
wait_until_passes(
# UpdateError possible initially, because DHCP has not completed
lambda: check_for_text_in_status_page("Good", ignore_errors=True),
retries=10, sleep=10)
# Main scenario: BASE -> PRE, reboot, PRE -> NEXT, reboot, final state check.
print("======== First update (BASE->PRE) tests ========")
check_update_is_downloaded_and_installed("BASE->PRE")
reboot_via_tty()
# Note: we must immediately change the latest version to NEXT, because if
# controller determines it is UpToDate after the reboot, it will not do another
# check for an hour
update_server.set_latest_version(next_version)
check_system_boots_into_new_version(pre_version, "BASE->PRE")
print("======== First update (BASE->PRE) successful =============")
print("======== Start second (PRE->NEXT) update =================")
check_update_is_downloaded_and_installed("PRE->NEXT")
reboot_via_tty()
check_system_boots_into_new_version(next_version, "PRE->NEXT")
# With NEXT running and NEXT advertised as latest, no further update is due.
with TestCase("Update state is UpToDate") as t:
wait_until_passes(
lambda: check_for_text_in_status_page("UpToDate", ignore_errors=True),
retries=3, sleep=10)
'';
}