Skip to content
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion app/gimlet/base.toml
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ name = "drv-gimlet-seq-server"
features = ["h753"]
priority = 4
max-sizes = {flash = 131072, ram = 16384 }
stacksize = 2600
stacksize = 2912
start = true
task-slots = ["sys", "i2c_driver", {spi_driver = "spi2_driver"}, "hf", "jefe", "packrat"]
notifications = ["timer", "vcore"]
Expand Down
4 changes: 3 additions & 1 deletion drv/cosmo-seq-server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ drv-packrat-vpd-loader = { path = "../packrat-vpd-loader" }
drv-spartan7-loader-api = { path = "../spartan7-loader-api" }
drv-spi-api = { path = "../spi-api" }
drv-stm32xx-sys-api = { path = "../stm32xx-sys-api" }
fixedstr = { path = "../../lib/fixedstr", features = ["microcbor"] }
gnarle = { path = "../../lib/gnarle" }
ringbuf = { path = "../../lib/ringbuf" }
microcbor = { path = "../../lib/microcbor" }
userlib = { path = "../../sys/userlib", features = ["panic-messages"] }
task-jefe-api = { path = "../../task/jefe-api" }
task-packrat-api = { path = "../../task/packrat-api", features = ["serde"] }
task-packrat-api = { path = "../../task/packrat-api", features = ["microcbor"] }
static-cell = { path = "../../lib/static-cell" }

cfg-if = { workspace = true }
Expand Down
185 changes: 163 additions & 22 deletions drv/cosmo-seq-server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,21 @@
use drv_cpu_seq_api::{
PowerState, SeqError as CpuSeqError, StateChangeReason, Transition,
};
use drv_hf_api::HostFlash;
use drv_ice40_spi_program as ice40;
use drv_packrat_vpd_loader::{read_vpd_and_load_packrat, Packrat};
use drv_spartan7_loader_api::Spartan7Loader;
use drv_spi_api::{SpiDevice, SpiServer};
use drv_stm32xx_sys_api::{self as sys_api, Sys};
use fixedstr::FixedStr;
use idol_runtime::{NotificationHandler, RequestError};
use ringbuf::{counted_ringbuf, ringbuf_entry, Count};
use task_jefe_api::Jefe;
use userlib::{
hl, set_timer_relative, sys_get_timer, sys_recv_notification, task_slot,
RecvMessage,
};

use drv_hf_api::HostFlash;
use ringbuf::{counted_ringbuf, ringbuf_entry, Count};

include!(concat!(env!("OUT_DIR"), "/i2c_config.rs"));

mod vcore;
Expand Down Expand Up @@ -101,6 +101,14 @@ enum Trace {
now: u64,
},
UnexpectedInterrupt,

EreportSent(#[count(children)] EreportClass, usize),
EreportLost(
#[count(children)] EreportClass,
usize,
task_packrat_api::EreportWriteError,
),
EreportTooBig(#[count(children)] EreportClass),
}
counted_ringbuf!(Trace, 128, Trace::None);

Expand Down Expand Up @@ -378,12 +386,18 @@ struct ServerImpl {
seq: fmc_sequencer::Sequencer,
espi: fmc_periph::espi::Espi,
vcore: VCore,
packrat: Packrat,
/// Static buffer for encoding ereports. This is a static so that we don't
/// have it on the stack when encoding ereports.
ereport_buf: &'static mut [u8; EREPORT_BUF_LEN],
}

const EREPORT_BUF_LEN: usize = 256;
const EREPORT_BUF_LEN: usize = microcbor::max_cbor_len_for![
Ereport<vcore::PmbusEreport>,
Ereport<UnrecognizedCPU>,
// For FPGA MAPO/SMERR ereports
Ereport<&'static SeqFpgaRefdes>,
];

impl ServerImpl {
fn new(
Expand Down Expand Up @@ -421,7 +435,8 @@ impl ServerImpl {
hf: HostFlash::from(HF.get_task_id()),
seq,
espi,
vcore: VCore::new(I2C.get_task_id(), packrat),
vcore: VCore::new(I2C.get_task_id()),
packrat,
ereport_buf,
}
}
Expand Down Expand Up @@ -537,26 +552,42 @@ impl ServerImpl {
});

// From sp5-mobo-guide-56870_1.1.pdf table 72
match (coretype0, coretype1, coretype2) {
let coretype_ok = match (coretype0, coretype1, coretype2) {
// These correspond to Type-2 and Type-3
(true, false, true) | (true, false, false) => (),
(true, false, true) | (true, false, false) => true,
// Reject all other combos and return to A0
_ => {
self.seq.power_ctrl.modify(|m| m.set_a0_en(false));
return Err(CpuSeqError::UnrecognizedCPU);
}
_ => false,
};

// From sp5-mobo-guide-56870_1.1.pdf table 73
match (sp5r1, sp5r2, sp5r3, sp5r4) {
let sp5r_ok = match (sp5r1, sp5r2, sp5r3, sp5r4) {
// There is only combo we accept here
(true, false, false, false) => (),
(true, false, false, false) => true,
// Reject all other combos and return to A0
_ => {
self.seq.power_ctrl.modify(|m| m.set_a0_en(false));
return Err(CpuSeqError::UnrecognizedCPU);
}
_ => false,
};

if !(coretype_ok && sp5r_ok) {
// Looks weird!
self.seq.power_ctrl.modify(|m| m.set_a0_en(false));
let ereport = Ereport {
class: EreportClass::UnrecognizedCPU,
version: 0,
report: UnrecognizedCPU {
refdes: &HOST_CPU_REFDES,
coretype0,
coretype1,
coretype2,
sp5r1,
sp5r2,
sp5r3,
sp5r4,
},
};
deliver_ereport(&ereport, &self.packrat, self.ereport_buf);
return Err(CpuSeqError::UnrecognizedCPU);
}

// Turn on the voltage regulator undervolt alerts.
self.enable_sequencer_interrupts();

Expand Down Expand Up @@ -758,8 +789,12 @@ impl ServerImpl {
vddcr_cpu0: ifr.pwr_cont1_to_fpga1_alert,
vddcr_cpu1: ifr.pwr_cont2_to_fpga1_alert,
};
self.vcore
.handle_pmbus_alert(which_rails, now, self.ereport_buf);
self.vcore.handle_pmbus_alert(
which_rails,
now,
&self.packrat,
self.ereport_buf,
);

// We need not instruct the sequencer to reset. PMBus alerts from
// the RAA229620As are divided into two categories, "warnings" and
Expand Down Expand Up @@ -787,22 +822,54 @@ impl ServerImpl {
self.seq.ifr.modify(|h| h.set_thermtrip(false));
ringbuf_entry!(Trace::Thermtrip);
action = InternalAction::ThermTrip;
// Great place for an ereport?
let ereport = Ereport {
class: EreportClass::Thermtrip,
version: 0,
report: &HOST_CPU_REFDES,
// TODO(eliza): eventually, it would be nice to include sequencer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What sequencer registers are you imagining for this?

// state registers here, however, we would need to modify the
// `fpga_regmap` codegen to let us get the raw bits out (since
// encoding the `...View` structs as CBOR uses a lot more bytes for
// field names and 8-bit `bool`s...) I'll do this eventually...
};
deliver_ereport(&ereport, &self.packrat, self.ereport_buf);
}

if ifr.a0mapo {
self.log_pg_registers();
self.seq.ifr.modify(|h| h.set_a0mapo(false));
ringbuf_entry!(Trace::A0MapoInterrupt);
action = InternalAction::Mapo;
// Great place for an ereport?

let ereport = Ereport {
class: EreportClass::A0Mapo,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should consider adding the SP's notion of when the state transition to A0 begun and the tick we detected the event in here as additional information to get a sense of relative uptime. The tick detection may not be relevant if that's part of the standard payload. This applies to most of the sequencer loop related actions. I wouldn't do this on an unsupported CPU.

version: 0,
report: &SEQ_FPGA_REFDES,
// TODO(eliza): eventually, it would be nice to include sequencer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess some of the sequencer registers may make sense, but I think the things we'll want to be following up with are the actual power failures that occurred.

// state registers here, however, we would need to modify the
// `fpga_regmap` codegen to let us get the raw bits out (since
// encoding the `...View` structs as CBOR uses a lot more bytes for
// field names and 8-bit `bool`s...) I'll do this eventually...
};
deliver_ereport(&ereport, &self.packrat, self.ereport_buf);
}

if ifr.smerr_assert {
self.seq.ifr.modify(|h| h.set_smerr_assert(false));
ringbuf_entry!(Trace::SmerrInterrupt);
action = InternalAction::Smerr;
// Great place for an ereport?

let ereport = Ereport {
class: EreportClass::Smerr,
version: 0,
report: &SEQ_FPGA_REFDES,
// TODO(eliza): eventually, it would be nice to include sequencer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What sequencer state is relevant here?

// state registers here, however, we would need to modify the
// `fpga_regmap` codegen to let us get the raw bits out (since
// encoding the `...View` structs as CBOR uses a lot more bytes for
// field names and 8-bit `bool`s...) I'll do this eventually...
};
deliver_ereport(&ereport, &self.packrat, self.ereport_buf);
}
// Fan Fault is unconnected
// NIC MAPO is unconnected
Expand Down Expand Up @@ -959,6 +1026,80 @@ impl NotificationHandler for ServerImpl {

////////////////////////////////////////////////////////////////////////////////

#[derive(Eq, PartialEq, Copy, Clone, microcbor::Encode, counters::Count)]
pub(crate) enum EreportClass {
//
// Interrupts
//
#[cbor(rename = "hw.cpu.thermtrip")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been going back and forth with myself on whether to phrase this as specific to amd or not. e.g. hw.cpu.amd.thermtrip, as we're referring to its thermal trip assertion pin. I think it's probably better to keep this as is.

Thermtrip,
#[cbor(rename = "hw.seq.smerr")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, I don't think we should phrase this in terms of the sequencer as this is a fact about the CPU. So probably closer to hw.cpu.smerr. It's not clear to me again if we want to namespace this or if we'd try to make these similar. This almost feels like hw.cpu.amd.smerr (though I had to look up what smerr is).

Smerr,
#[cbor(rename = "hw.seq.a0_mapo")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case I don't think we should phrase this as specific to the sequencer, this feels more related to power. So I would think of this as hw.pwr.a0.mapo. This kind of organization assumes we want to make a class of A0/A2-specific power events. I also think we should consider instead hw.pwr.mapo with a power domain present as an argument. A small part of me prefers this in case we detect A2 issues or we have a more complex situation with the Metro FPGA.

A0Mapo,
#[cbor(rename = "hw.pwr.pmbus.alert")]
PmbusAlert,

//
// Initialization failures
//
#[cbor(rename = "hw.cpu.a0_fail.unknown")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get where you're coming from here, but I think I would basically keep this as hw.cpu.unsup or you know the old callback to enotsup. I think unknown here is a bit confusing and it's not clear to me that an explicit a0_fail subclass makes sense here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, i like unsup --- i thought about unknown_cpu but that felt like a few too many characters. I agree that unknown feels like it could mean "unknown error", which is not what this is supposed to mean. if we dropped a0_fail, unknown_cpu seems not too bad...

UnrecognizedCPU,
}

pub(crate) type Ereport<T> = task_packrat_api::Ereport<EreportClass, T>;

#[derive(microcbor::EncodeFields)]
pub(crate) struct UnrecognizedCPU {
#[cbor(flatten)]
refdes: &'static HostCpuRefdes,
coretype0: bool,
coretype1: bool,
coretype2: bool,
sp5r1: bool,
sp5r2: bool,
sp5r3: bool,
sp5r4: bool,
Comment on lines +1053 to +1062
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is now scoped to a socket. I think we would to think about how the payloads change across families and how to name fields that way. SP3 for example only had a single coretype and two revision registers. I would speculate that SP7 will be closer to SP5, but it won't be called sp5.

}

#[derive(microcbor::EncodeFields)]
struct HostCpuRefdes {
refdes: FixedStr<2>,
dev_id: FixedStr<16>,
}

#[derive(microcbor::EncodeFields)]
struct SeqFpgaRefdes {
refdes: FixedStr<3>,
}

static SEQ_FPGA_REFDES: SeqFpgaRefdes = SeqFpgaRefdes {
refdes: FixedStr::from_str("U27"),
};

static HOST_CPU_REFDES: HostCpuRefdes = HostCpuRefdes {
refdes: FixedStr::from_str("P0"),
// TODO(eliza): can we get this from the `gateway-sp-messages` crate?
dev_id: FixedStr::from_str("sp5-host-cpu"),
};

pub(crate) fn deliver_ereport<E: microcbor::EncodeFields<()>>(
ereport: &Ereport<E>,
packrat: &Packrat,
buf: &mut [u8],
) {
match packrat.encode_ereport(ereport, buf) {
Ok(len) => ringbuf_entry!(Trace::EreportSent(ereport.class, len)),
Err(task_packrat_api::EreportEncodeError::Packrat { len, err }) => {
ringbuf_entry!(Trace::EreportLost(ereport.class, len, err))
}
Err(task_packrat_api::EreportEncodeError::Encoder(_)) => {
ringbuf_entry!(Trace::EreportTooBig(ereport.class))
}
}
}
////////////////////////////////////////////////////////////////////////////////

mod idl {
use drv_cpu_seq_api::StateChangeReason;
include!(concat!(env!("OUT_DIR"), "/server_stub.rs"));
Expand Down
Loading