platform/qemu: detect if QEMU process exits unexpectedly

jlebon · dustymabe · commit 30480db05612 · 2024-09-04T13:19:02.000-04:00
Currently, we only try to detect if the QEMU process exited by actually
`wait()`ing for it in the `kola qemuexec` path. We should do it in
the kola testing path as well so that it's easy to tell if e.g. it was
killed while the test was running.

Note this doesn't actually stop the test early if QEMU exited. That
would require some tricky wiring into the harness. But at least the
error it logs helps diagnose the issue when we see the test time out on
SSH. And since the process is now waited on, QEMU won't just linger as a
defunct (zombie) process.
diff --git a/mantle/platform/machine/qemu/cluster.go b/mantle/platform/machine/qemu/cluster.go
@@ -40,7 +40,8 @@ type Cluster struct {
 	*platform.BaseCluster
 	flight *flight
 
-	mu sync.Mutex
+	mu          sync.Mutex
+	tearingDown bool
 }
 
 func (qc *Cluster) NewMachine(userdata *conf.UserData) (platform.Machine, error) {
@@ -231,10 +232,22 @@ func (qc *Cluster) NewMachineWithQemuOptions(userdata *conf.UserData, options pl
 
 	qc.AddMach(qm)
 
+	// In this flow, nothing else actually Wait()s for the QEMU process. Let's do it
+	// here and log an error if it exited unexpectedly. Ideally, in the future this
+	// interface would let the test harness provide e.g. a channel we can signal on
+	// so it knows to stop the test once QEMU dies.
+	go func() {
+		err := inst.Wait()
+		if err != nil && !qc.tearingDown {
+			plog.Errorf("QEMU process finished abnormally: %v", err)
+		}
+	}()
+
 	return qm, nil
 }
 
 func (qc *Cluster) Destroy() {
+	qc.tearingDown = true
 	qc.BaseCluster.Destroy()
 	qc.flight.DelCluster(qc)
 }