Skip to content

Commit d37a381

Browse files
authored
ereport: Task faulted/panicked (#2341)
Depends on #2313, #2350, and #2358. Fixes #2309. It's currently somewhat difficult to become aware of Hubris task panics and other task faults in a production environment. While MGS can ask the SP to list task dumps as part of the API for reading dumps, this requires that the control plane (or faux-mgs user) proactively ask the SP whether it has any record of panicked tasks, rather than having panics recorded as they occur. Therefore, we should have a proactive notification from the SP indicating that task faults have occurred. This commit adds code to `packrat` for producing an ereport when a task has faulted. This could eventually be used by the control plane to trigger dump collection and produce a service bundle. In addition, it will provide a more permanent record that a task faulted at a particular time, even if the SP that contains the faulted task is later reset or replaced with an entirely different SP. This works using an approach similar to the one described by @cbiffle in [this comment][1]. There's a detailed description of how this works [in the module-level RustDoc for `ereport.rs` in Packrat][2]. 
The ereports that come out of this thing look like this: ```console eliza@hekate ~/Code/oxide/hubris $ faux-mgs --interface eno1np0 --discovery-addr '[fe80::0c1d:deff:fef0:d922]:11111' ereports Jan 15 10:27:41.370 INFO creating SP handle on interface eno1np0, component: faux-mgs Jan 15 10:27:41.372 INFO initial discovery complete, addr: [fe80::c1d:deff:fef0:d922%2]:11111, interface: eno1np0, socket: control-plane-agent, component: faux-mgs restart ID: 4e54b7f1-e13a-d9bb-709a-c7e863d64a64 restart IDs did not match (requested 00000000-0000-0000-0000-000000000000) count: 4 ereports: 0x1: { "ereport_message_version": Number(0), "hubris_task_gen": Number(0), "hubris_task_name": String("packrat"), "hubris_uptime_ms": Number(0), "lost": Null, } 0x2: { "ereport_message_version": Number(0), "hubris_task_gen": Number(0), "hubris_task_name": String("ereportulator"), "hubris_uptime_ms": Number(378010), "k": String("hubris.fault.panic"), "msg": String("panicked at task/ereportulator/src/main.rs:158:9:\nim dead lol"), "v": Number(0), } 0x3: { "by": Object { "gen": Number(0), "task": String("jefe"), }, "ereport_message_version": Number(0), "hubris_task_gen": Number(0), "hubris_task_name": String("user_leds"), "hubris_uptime_ms": Number(382914), "k": String("hubris.fault.injected"), "v": Number(0), } 0x4: { "by": Object { "gen": Number(0), "task": String("jefe"), }, "ereport_message_version": Number(0), "hubris_task_gen": Number(1), "hubris_task_name": String("ereportulator"), "hubris_uptime_ms": Number(388215), "k": String("hubris.fault.injected"), "v": Number(0), } ``` [1]: #2309 (comment) [2]: https://github.com/oxidecomputer/hubris/blob/1e2b121fcd165dcdbcbdfdc69c36d35520dfa954/task/packrat/src/ereport.rs#L15-L95
1 parent aa7951c commit d37a381

File tree

16 files changed

+690
-33
lines changed

16 files changed

+690
-33
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

app/cosmo/base.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ net = "jefe-state-change"
3333
host_sp_comms = "jefe-state-change"
3434
spd = "jefe-state-change"
3535

36+
[tasks.jefe.config.on-task-fault]
37+
packrat = "task-faulted"
38+
3639
[tasks.jefe.config.allowed-callers]
3740
set_state = ["cosmo_seq"]
3841
set_reset_reason = ["sys"]
@@ -128,11 +131,11 @@ notifications = ["i2c1-irq", "i2c2-irq", "i2c3-irq", "i2c4-irq"]
128131
[tasks.packrat]
129132
name = "task-packrat"
130133
priority = 1
131-
stacksize = 1096
134+
stacksize = 1352
132135
start = true
133-
# task-slots is explicitly empty: packrat should not send IPCs!
134-
task-slots = []
136+
task-slots = ["jefe"]
135137
features = ["cosmo", "ereport"]
138+
notifications = ["task-faulted"]
136139

137140
[tasks.rng_driver]
138141
features = ["h753", "ereport"]

app/gimlet/base.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ net = "jefe-state-change"
2828
host_sp_comms = "jefe-state-change"
2929
spd = "jefe-state-change"
3030

31+
[tasks.jefe.config.on-task-fault]
32+
packrat = "task-faulted"
33+
3134
[tasks.jefe.config.allowed-callers]
3235
set_state = ["gimlet_seq"]
3336
set_reset_reason = ["sys"]
@@ -122,11 +125,11 @@ notifications = ["i2c1-irq", "jefe-state-change"]
122125
[tasks.packrat]
123126
name = "task-packrat"
124127
priority = 1
125-
stacksize = 1096
128+
stacksize = 1352
126129
start = true
127-
# task-slots is explicitly empty: packrat should not send IPCs!
128-
task-slots = []
130+
task-slots = ["jefe"]
129131
features = ["gimlet", "ereport"]
132+
notifications = ["task-faulted"]
130133

131134
[tasks.thermal]
132135
name = "task-thermal"

app/gimletlet/app.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ request_reset = ["hiffy", "control_plane_agent", "udprpc"]
2323
[tasks.jefe.config.on-state-change]
2424
host_sp_comms = "jefe-state-change"
2525

26+
[tasks.jefe.config.on-task-fault]
27+
packrat = "task-faulted"
28+
2629
[tasks.sys]
2730
# Enable EXTI in the sys task so that we can notify sprot when the RoT
2831
# raises an IRQ.
@@ -48,10 +51,10 @@ owner = {name = "sprot", notification = "rot_irq"}
4851
name = "task-packrat"
4952
priority = 1
5053
start = true
51-
# task-slots is explicitly empty: packrat should not send IPCs!
52-
task-slots = []
53-
stacksize = 1096
54+
task-slots = ["jefe"]
55+
stacksize = 1336
5456
features = ["ereport"]
57+
notifications = ["task-faulted"]
5558

5659
[tasks.control_plane_agent]
5760
name = "task-control-plane-agent"

app/grapefruit/app-dev.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,13 @@ interrupts = {"uart8.irq" = "usart-irq"}
1212

1313
# Ereport stuff
1414
[tasks.packrat]
15-
stacksize = 1096
15+
stacksize = 1344
16+
task-slots = ["jefe"]
1617
features = ["ereport"]
18+
notifications = ["task-faulted"]
19+
20+
[tasks.jefe.config.on-task-fault]
21+
packrat = "task-faulted"
1722

1823
[tasks.snitch]
1924
name = "task-snitch"

app/oxcon2023g0/app.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ board = "oxcon2023g0"
66

77
[kernel]
88
name = "oxcon2023g0"
9-
requires = {flash = 11744, ram = 1296}
9+
requires = {flash = 11776, ram = 1296}
1010
stacksize = 640
1111

1212
[tasks.jefe]

app/psc/base.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ extern-regions = ["sram1", "sram2", "sram3", "sram4"]
2727
[tasks.jefe.config.on-state-change]
2828
net = "jefe-state-change"
2929

30+
[tasks.jefe.config.on-task-fault]
31+
packrat = "task-faulted"
32+
3033
[tasks.jefe.config.allowed-callers]
3134
set_reset_reason = ["sys"]
3235
request_reset = ["hiffy", "control_plane_agent"]
@@ -118,11 +121,11 @@ notifications = ["i2c2-irq", "i2c3-irq"]
118121
[tasks.packrat]
119122
name = "task-packrat"
120123
priority = 1
121-
stacksize = 1096
124+
stacksize = 1336
122125
start = true
123-
# task-slots is explicitly empty: packrat should not send IPCs!
124-
task-slots = []
126+
task-slots = ["jefe"]
125127
features = ["ereport"]
128+
notifications = ["task-faulted"]
126129

127130
[tasks.sequencer]
128131
name = "drv-psc-seq-server"

app/sidecar/base.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ extern-regions = ["sram1", "sram2", "sram3", "sram4"]
2828
set_reset_reason = ["sys"]
2929
request_reset = ["hiffy", "control_plane_agent"]
3030

31+
[tasks.jefe.config.on-task-fault]
32+
packrat = "task-faulted"
33+
3134
[tasks.sys]
3235
name = "drv-stm32xx-sys"
3336
features = ["h753", "exti", "no-panic"]
@@ -257,11 +260,11 @@ notifications = ["socket", "timer"]
257260
[tasks.packrat]
258261
name = "task-packrat"
259262
priority = 1
260-
stacksize = 1096
263+
stacksize = 1336
261264
start = true
262-
# task-slots is explicitly empty: packrat should not send IPCs!
263-
task-slots = []
265+
task-slots = ["jefe"]
264266
features = ["ereport"]
267+
notifications = ["task-faulted"]
265268

266269
[tasks.sequencer]
267270
name = "drv-sidecar-seq-server"

idl/ereportulator.idol

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,12 @@ Interface(
2020
),
2121
idempotent: true,
2222
),
23+
"panicme": (
24+
doc: "Make the ereportulator panic",
25+
args: {
26+
},
27+
reply: Simple("()"),
28+
idempotent: true,
29+
),
2330
}
2431
)

sys/kern/src/kipc.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -482,12 +482,6 @@ fn find_faulted_task(
482482
message: USlice<u8>,
483483
response: USlice<u8>,
484484
) -> Result<NextTask, UserError> {
485-
if caller != 0 {
486-
return Err(UserError::Unrecoverable(FaultInfo::SyscallUsage(
487-
UsageError::NotSupervisor,
488-
)));
489-
}
490-
491485
let index = deserialize_message::<u32>(&tasks[caller], message)? as usize;
492486

493487
// Note: we explicitly permit index == tasks.len(), which causes us to wrap

0 commit comments

Comments
 (0)