From 98c54167a1c29b51284a95ee9df3c17659004cc8 Mon Sep 17 00:00:00 2001 From: Kris Jacque Date: Sat, 17 Jan 2026 01:38:14 +0000 Subject: [PATCH] DAOS-18453 control: Allow daos cmd to inject fault value - Add an optional parameter to set fault injection value at the specified location. - Add a reset option to change both location and value to 0. Features: recovery Signed-off-by: Kris Jacque --- src/control/cmd/daos/fi.go | 80 +++++++++++++++++++++++++++++----- src/control/lib/daos/api/fi.go | 44 +++++++++++++++++++ 2 files changed, 112 insertions(+), 12 deletions(-) create mode 100644 src/control/lib/daos/api/fi.go diff --git a/src/control/cmd/daos/fi.go b/src/control/cmd/daos/fi.go index b47f463d0ce..adacf7575f7 100644 --- a/src/control/cmd/daos/fi.go +++ b/src/control/cmd/daos/fi.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -19,6 +20,8 @@ import ( "strings" "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/daos/api" ) type faultsCmdRoot struct { @@ -58,23 +61,53 @@ func (ff faultFrequency) HasSome() (uint64, bool) { type faultLocation uint64 func (fl *faultLocation) UnmarshalFlag(fv string) error { - // Ugh. Seems like there should be a more clever way to do this... - switch strings.TrimSpace(fv) { - case "DAOS_CHK_CONT_ORPHAN": - *fl = faultLocation(C.DAOS_CHK_CONT_ORPHAN) - case "DAOS_CHK_CONT_BAD_LABEL": - *fl = faultLocation(C.DAOS_CHK_CONT_BAD_LABEL) - default: - return errors.Errorf("unhandled fault location %q", fv) + if fv == "none" { + *fl = 0 + return nil + } + + loc, err := api.FaultLocationFromString(fv) + if err != nil { + return err } + *fl = faultLocation(loc) return nil } +// IsSet indicates whether a fault location has been set. +func (fl faultLocation) IsSet() bool { + return fl != 0 +} + +type faultValue uint64 + +const faultValueUnset = faultValue(^uint64(0)) + +func (fv *faultValue) UnmarshalFlag(fvStr string) error { + if fvStr == "none" { + *fv = faultValueUnset + return nil + } + + // Allow hexadecimal and binary values, as well as decimal. + v, err := strconv.ParseUint(fvStr, 0, 64) + if err != nil { + return errors.Errorf("invalid fault value %q", fvStr) + } + *fv = faultValue(v) + return nil +} + +// IsSet indicates whether a fault value has been set. +func (fv faultValue) IsSet() bool { + return fv != faultValueUnset +} + type faultRank uint32 func (fr *faultRank) UnmarshalFlag(fv string) error { - if fv == strconv.FormatUint(uint64(C.CRT_NO_RANK), 10) || fv == "-1" { + if fv == "all" || fv == strconv.FormatUint(uint64(C.CRT_NO_RANK), 10) || fv == "-1" { *fr = faultRank(C.CRT_NO_RANK) return nil } @@ -90,9 +123,10 @@ func (fr *faultRank) UnmarshalFlag(fv string) error { type faultInjectionCmd struct { daosCmd - Rank faultRank `short:"r" long:"rank" description:"Rank to inject fault on" default:"4294967295"` + Rank faultRank `short:"r" long:"rank" description:"Rank to inject fault on" default:"all"` Frequency faultFrequency `short:"f" long:"frequency" description:"Fault injection frequency" choices:"always,once" default:"once"` - Location faultLocation `short:"l" long:"location" description:"Fault injection location" required:"1"` + Location faultLocation `short:"l" long:"location" description:"Fault injection location" default:"none"` + Value faultValue `short:"v" long:"value" description:"Fault injection value" default:"none"` } func (cmd *faultInjectionCmd) setParams() error { @@ -112,19 +146,41 @@ func (cmd *faultInjectionCmd) setParams() error { if cmd.Rank != C.CRT_NO_RANK { rankMsg = fmt.Sprintf("rank %d", cmd.Rank) } - cmd.Debugf("injecting fault %d on %s", faultMask, rankMsg) + cmd.Debugf("injecting fault location 0x%x on %s", faultMask, rankMsg) rc := C.daos_debug_set_params(nil, C.d_rank_t(cmd.Rank), C.DMG_KEY_FAIL_LOC, faultMask, 0, nil) if err := daosError(rc); err != nil { return errors.Wrap(err, "failed to set fault injection") } + + if cmd.Value.IsSet() { + cmd.Debugf("injecting fault value %d on %s", cmd.Value, rankMsg) + rc = C.daos_debug_set_params(nil, C.d_rank_t(cmd.Rank), C.DMG_KEY_FAIL_VALUE, C.uint64_t(cmd.Value), 0, nil) + if err := daosError(rc); err != nil { + return errors.Wrap(err, "failed to set fault injection value") + } + } return nil } type debugFaultCmd struct { faultInjectionCmd + + Reset bool `long:"reset" description:"Reset all fault injection parameters"` } func (cmd *debugFaultCmd) Execute(_ []string) error { + if cmd.Reset { + if cmd.Location.IsSet() || cmd.Value.IsSet() { + return errors.New("cannot set location or value when resetting fault injection parameters") + } + + cmd.Debugf("resetting all fault injection parameters") + cmd.Frequency = 0 + cmd.Location = 0 + cmd.Value = 0 + } else if !cmd.Location.IsSet() { + return errors.New("--location must be specified unless --reset is used") + } return cmd.setParams() } diff --git a/src/control/lib/daos/api/fi.go b/src/control/lib/daos/api/fi.go new file mode 100644 index 00000000000..40d4e2ef79a --- /dev/null +++ b/src/control/lib/daos/api/fi.go @@ -0,0 +1,44 @@ +// +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +/* +#include +#include +*/ +import "C" + +import "fmt" + +var ( + // failLocMap maps from strings to DAOS fault injection location constants. + // The definitions come from daos_common.h. + // TODO: Add the rest of existing fault locs. Maybe auto-generate this mapping? + failLocMap = map[string]C.uint64_t{ + "DAOS_CHK_CONT_ORPHAN": C.DAOS_CHK_CONT_ORPHAN, + "DAOS_CHK_CONT_BAD_LABEL": C.DAOS_CHK_CONT_BAD_LABEL, + "DAOS_CHK_LEADER_BLOCK": C.DAOS_CHK_LEADER_BLOCK, + "DAOS_CHK_LEADER_FAIL_REGPOOL": C.DAOS_CHK_LEADER_FAIL_REGPOOL, + "DAOS_CHK_PS_NOTIFY_LEADER": C.DAOS_CHK_PS_NOTIFY_LEADER, + "DAOS_CHK_PS_NOTIFY_ENGINE": C.DAOS_CHK_PS_NOTIFY_ENGINE, + "DAOS_CHK_SYNC_ORPHAN_PROCESS": C.DAOS_CHK_SYNC_ORPHAN_PROCESS, + "DAOS_CHK_FAIL_REPORT_POOL1": C.DAOS_CHK_FAIL_REPORT_POOL1, + "DAOS_CHK_FAIL_REPORT_POOL2": C.DAOS_CHK_FAIL_REPORT_POOL2, + "DAOS_CHK_ENGINE_DEATH": C.DAOS_CHK_ENGINE_DEATH, + "DAOS_CHK_VERIFY_CONT_SHARDS": C.DAOS_CHK_VERIFY_CONT_SHARDS, + "DAOS_CHK_ORPHAN_POOL_SHARD": C.DAOS_CHK_ORPHAN_POOL_SHARD, + } +) + +// FaultLocationFromString converts a string to a fault injection location value. +func FaultLocationFromString(str string) (uint64, error) { + loc, found := failLocMap[str] + if !found { + return 0, fmt.Errorf("invalid fault injection location %q", str) + } + return uint64(loc), nil +}