Skip to content

Commit b570e54

Browse files
DAOS-9576 chk: aggregated patch for cat_recovery (#13718)
* DAOS-15084 chk: aggregated patch for cat_recovery DAOS employs various fault tolerance mechanisms to cope with regular temporary or permanent hardware failures, such as the Raft engine used for pool/container metadata, EC/Replica used for user data, etc. These mechanisms ensure the system survives most regular failures, and automatic self-healing mechanisms are in place to bring back redundancy once a tolerable failure happens. However, several factors can challenge this design: - Users can control (and lower) the data protection on a per-container basis. - The system may face unexpected events that cause more failures than it is designed to tolerate. - The self-healing mechanism may fail in some specific conditions (e.g., ENOSPC on the surviving nodes). - Hardware bugs (e.g., broken flush, firmware issue, data corruption, ...) can cause massive corruption. - Software bugs (e.g., corner cases, overflow, ...). - Human errors. The DAOS catastrophic recovery feature is introduced to address the failure cases above. While it is unreasonable to assume that all cases can be covered, this feature covers the most likely ones. The first goal is to detect corruptions and distributed consistency issues and then offer a remediation path whenever possible. Remediation options can range from a transparent, automatic fix to a manual repair or deletion of a pool or container. If catastrophic recovery fails, then the system will ultimately have to be reformatted. Another aspect to take into account is that the check and repair should complete in a reasonable amount of time, and the framework should provide estimates on how long it is expected to take for each pool and allow the administrator to prioritize some pools over others. This patch allows offline check & repair (when possible) of a DAOS system.
Signed-off-by: Fan Yong <[email protected]> Signed-off-by: Dalton Bohning <[email protected]> Signed-off-by: Kris Jacque <[email protected]> Co-authored-by: Dalton Bohning <[email protected]>
1 parent 9de9b71 commit b570e54

File tree

344 files changed

+65808
-1165
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

344 files changed

+65808
-1165
lines changed

Jenkinsfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ void fixup_rpmlintrc() {
108108
'/usr/bin/hello_drpc',
109109
'/usr/bin/daos_firmware',
110110
'/usr/bin/daos_admin',
111-
'/usr/bin/daos_server']
111+
'/usr/bin/daos_server',
112+
'/usr/bin/ddb']
112113

113114
String content = readFile(file: 'utils/rpms/daos.rpmlintrc') + '\n\n' +
114115
'# https://daosio.atlassian.net/browse/DAOS-11534\n'

ci/codespell.ignores

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,5 @@ expres
3434
signalling
3535
laf
3636
cacl
37+
chk
38+
falloc

debian/changelog

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
daos (2.5.101-4) unstable; urgency=medium
2+
[ Fan Yong ]
3+
* NOOP change to keep in parity with RPM version
4+
5+
-- Fan Yong <[email protected]> Fri, 05 Apr 2024 09:30:00 +0900
6+
17
daos (2.5.101-3) unstable; urgency=medium
28
[ Ashley M. Pittman ]
39
* Updated pydaos install process

debian/daos-server-tests.install

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ usr/bin/smd_ut
88
usr/bin/bio_ut
99
usr/bin/vea_ut
1010
usr/bin/vos_tests
11+
usr/bin/ddb_tests
1112
usr/bin/vea_stress
1213
usr/bin/vos_perf
1314
usr/bin/obj_ctl

debian/daos-server.install

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ usr/bin/daos_server_helper
88
usr/bin/daos_server
99
usr/bin/daos_engine
1010
usr/bin/daos_metrics
11+
usr/bin/ddb
12+
usr/lib64/daos_srv/libchk.so
1113
usr/lib64/daos_srv/libcont.so
14+
usr/lib64/daos_srv/libddb.so
1215
usr/lib64/daos_srv/libdtx.so
1316
usr/lib64/daos_srv/libmgmt.so
1417
usr/lib64/daos_srv/libobj.so

src/SConscript

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def scons():
106106

107107
# Build each DAOS component
108108
SConscript('rsvc/SConscript')
109+
SConscript('chk/SConscript')
109110
SConscript('mgmt/SConscript')
110111
SConscript('pool/SConscript')
111112
SConscript('container/SConscript')
@@ -128,6 +129,9 @@ def scons():
128129
# Build utilities
129130
SConscript('utils/SConscript')
130131

132+
# Build ddb
133+
SConscript('ddb/SConscript')
134+
131135
# Build the control plane components
132136
SConscript('control/SConscript')
133137

src/bio/bio_context.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
#include "bio_internal.h"
1010
#include "bio_wal.h"
1111

12-
#define BIO_BLOB_HDR_MAGIC (0xb0b51ed5)
13-
1412
struct blob_cp_arg {
1513
spdk_blob_id bca_id;
1614
struct spdk_blob *bca_blob;

src/bio/bio_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "smd.pb-c.h"
2121

22+
#define BIO_BLOB_HDR_MAGIC (0xb0b51ed5)
2223
#define BIO_DMA_PAGE_SHIFT 12 /* 4K */
2324
#define BIO_DMA_PAGE_SZ (1UL << BIO_DMA_PAGE_SHIFT)
2425
#define BIO_XS_CNT_MAX BIO_MAX_VOS_TGT_CNT /* Max VOS xstreams per blobstore */

src/cart/crt_corpc.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ crt_corpc_info_init(struct crt_rpc_priv *rpc_priv,
5757
rpc_priv->crp_flags |= CRT_RPC_FLAG_COLL;
5858
if (co_info->co_grp_priv->gp_primary)
5959
rpc_priv->crp_flags |= CRT_RPC_FLAG_PRIMARY_GRP;
60-
if (flags & CRT_RPC_FLAG_FILTER_INVERT)
61-
rpc_priv->crp_flags |= CRT_RPC_FLAG_FILTER_INVERT;
60+
rpc_priv->crp_flags |= flags & (CRT_RPC_FLAG_FILTER_INVERT |
61+
CRT_RPC_FLAG_CO_FAILOUT);
6262

6363
co_hdr->coh_grpid = grp_priv->gp_pub.cg_grpid;
6464
co_hdr->coh_filter_ranks = co_info->co_filter_ranks;
@@ -906,6 +906,11 @@ crt_corpc_req_hdlr(struct crt_rpc_priv *rpc_priv)
906906
}
907907

908908
forward_done:
909+
if (rc != 0 && rpc_priv->crp_flags & CRT_RPC_FLAG_CO_FAILOUT) {
910+
crt_corpc_complete(rpc_priv);
911+
goto out;
912+
}
913+
909914
/* NOOP bcast (no child and root excluded) */
910915
if (co_info->co_child_num == 0 && co_info->co_root_excluded)
911916
crt_corpc_complete(rpc_priv);

src/chk/SConscript

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# pylint: disable=consider-using-f-string
2+
# pylint: disable-next=wrong-spelling-in-comment
3+
"""Build check library"""
4+
5+
6+
def scons():
7+
"""Execute build"""
8+
Import('env', 'prereqs')
9+
10+
env.AppendUnique(LIBPATH=[Dir('.')])
11+
12+
denv = env.Clone()
13+
14+
# common
15+
prereqs.require(denv, 'argobots', 'protobufc')
16+
chk_pb = denv.SharedObject(['chk.pb-c.c'])
17+
Export('chk_pb')
18+
19+
if not prereqs.server_requested():
20+
return
21+
22+
# chk
23+
chk = denv.d_library('chk',
24+
[chk_pb, 'chk_srv.c', 'chk_common.c', 'chk_vos.c',
25+
'chk_rpc.c', 'chk_upcall.c', 'chk_iv.c', 'chk_leader.c',
26+
'chk_engine.c'], install_off="../..")
27+
denv.Install('$PREFIX/lib64/daos_srv', chk)
28+
29+
30+
if __name__ == "SCons.Script":
31+
scons()

0 commit comments

Comments
 (0)