Skip to content

Commit 1f118db

Browse files
kanard38knard38
authored and committed
Merge remote-tracking branch 'origin/master' into ckochhof/dev/master/daos-16501-part1
2 parents e2b0d34 + 4e574b5 commit 1f118db

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+4034
-984
lines changed

src/chk/chk_engine.c

Lines changed: 50 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ struct chk_cont_rec {
6060
d_iov_t ccr_label_cs;
6161
uint32_t ccr_label_checked:1,
6262
ccr_skip:1;
63+
uint32_t ccr_tgt_nr;
6364
};
6465

6566
struct chk_cont_bundle {
@@ -114,6 +115,7 @@ chk_cont_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov,
114115
if (ccr == NULL)
115116
D_GOTO(out, rc = -DER_NOMEM);
116117

118+
ccr->ccr_tgt_nr = 1;
117119
uuid_copy(ccr->ccr_uuid, ccb->ccb_uuid);
118120
ccr->ccr_aggregator = ccb->ccb_aggregator;
119121
d_list_add_tail(&ccr->ccr_link, &ccb->ccb_aggregator->ccla_list);
@@ -159,6 +161,10 @@ static int
159161
chk_cont_update(struct btr_instance *tins, struct btr_record *rec,
160162
d_iov_t *key, d_iov_t *val, d_iov_t *val_out)
161163
{
164+
struct chk_cont_rec *ccr = umem_off2ptr(&tins->ti_umm, rec->rec_off);
165+
166+
ccr->ccr_tgt_nr++;
167+
162168
return 0;
163169
}
164170

@@ -1591,12 +1597,19 @@ static int
15911597
chk_engine_cont_cleanup(struct chk_pool_rec *cpr, struct ds_pool_svc *ds_svc,
15921598
struct chk_cont_list_aggregator *aggregator)
15931599
{
1594-
struct chk_instance *ins = cpr->cpr_ins;
1595-
struct cont_svc *svc;
1596-
struct chk_cont_rec *ccr;
1597-
struct chk_cont_label_cb_args cclca = { 0 };
1598-
int rc = 0;
1599-
bool failout;
1600+
struct chk_instance *ins = cpr->cpr_ins;
1601+
struct cont_svc *svc;
1602+
struct chk_cont_rec *ccr;
1603+
char msg[CHK_MSG_BUFLEN];
1604+
struct chk_cont_label_cb_args cclca = {0};
1605+
struct chk_report_unit cru = {0};
1606+
uint64_t seq;
1607+
uint32_t exp_tgt_nr = 0;
1608+
int rc = 0;
1609+
bool failout;
1610+
1611+
if (DAOS_FAIL_CHECK(DAOS_CHK_VERIFY_CONT_SHARDS))
1612+
exp_tgt_nr = daos_fail_value_get();
16001613

16011614
if (ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_FAILOUT)
16021615
failout = true;
@@ -1636,11 +1649,36 @@ chk_engine_cont_cleanup(struct chk_pool_rec *cpr, struct ds_pool_svc *ds_svc,
16361649
goto out;
16371650

16381651
d_list_for_each_entry(ccr, &aggregator->ccla_list, ccr_link) {
1639-
if (!ccr->ccr_skip && !ccr->ccr_label_checked) {
1652+
if (ccr->ccr_skip)
1653+
continue;
1654+
1655+
if (!ccr->ccr_label_checked) {
16401656
rc = chk_engine_cont_set_label(cpr, ccr, svc);
16411657
if (rc != 0)
16421658
goto out;
16431659
}
1660+
1661+
if (likely(ccr->ccr_tgt_nr >= exp_tgt_nr))
1662+
continue;
1663+
1664+
snprintf(
1665+
msg, CHK_MSG_BUFLEN - 1,
1666+
"The container " DF_UUID " in the pool " DF_UUID " lost some shards: %u/%u\n",
1667+
DP_UUID(ccr->ccr_uuid), DP_UUID(cpr->cpr_uuid), ccr->ccr_tgt_nr, exp_tgt_nr);
1668+
1669+
cru.cru_msg = msg;
1670+
cru.cru_gen = cpr->cpr_bk.cb_gen;
1671+
cru.cru_cla = CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN;
1672+
cru.cru_act = CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE;
1673+
cru.cru_rank = dss_self_rank();
1674+
cru.cru_pool = (uuid_t *)&cpr->cpr_uuid;
1675+
cru.cru_pool_label = cpr->cpr_label;
1676+
cru.cru_cont = (uuid_t *)&ccr->ccr_uuid;
1677+
if (ccr->ccr_label_prop != NULL && ccr->ccr_label_prop->dpp_entries != NULL)
1678+
cru.cru_cont_label = ccr->ccr_label_prop->dpp_entries[0].dpe_str;
1679+
1680+
seq = 0;
1681+
chk_engine_report(&cru, &seq, NULL);
16441682
}
16451683

16461684
out:
@@ -3206,10 +3244,11 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
32063244
pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_PENDING;
32073245

32083246
log:
3209-
D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO,
3210-
DF_ENGINE" on rank %u report with class %u, action %u, seq "
3211-
DF_X64", handle_rc %d, report_rc %d\n", DP_ENGINE(ins),
3212-
cru->cru_rank, cru->cru_cla, cru->cru_act, *seq, cru->cru_result, rc);
3247+
DL_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, rc,
3248+
DF_ENGINE " on rank %u report with class %u, action %u, seq " DF_X64 ", %s, "
3249+
"handle_rc %d, report_rc %d",
3250+
DP_ENGINE(ins), cru->cru_rank, cru->cru_cla, cru->cru_act, *seq, cru->cru_msg,
3251+
cru->cru_result, rc);
32133252

32143253
if (rc != 0 || cpr == NULL)
32153254
goto out;

src/chk/chk_leader.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3631,10 +3631,10 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
36313631
*seq = chk_report_seq_gen(ins);
36323632
}
36333633

3634-
D_INFO(DF_LEADER" handle %s report from rank %u with seq "
3635-
DF_X64" class %u, action %u, result %d\n", DP_LEADER(ins),
3636-
decision != NULL ? "local" : "remote", cru->cru_rank, *seq, cru->cru_cla,
3637-
cru->cru_act, cru->cru_result);
3634+
D_INFO(DF_LEADER " handle %s report from rank %u with seq " DF_X64 " class %u, action %u, "
3635+
"%s, result %d\n",
3636+
DP_LEADER(ins), decision != NULL ? "local" : "remote", cru->cru_rank, *seq,
3637+
cru->cru_cla, cru->cru_act, cru->cru_msg, cru->cru_result);
36383638

36393639
if (cru->cru_act == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT) {
36403640
if (cru->cru_pool == NULL)

src/client/java/daos-java/src/main/native/io_daos_dfs_DaosFsClient.c

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -458,13 +458,6 @@ Java_io_daos_dfs_DaosFsClient_createNewFile(JNIEnv *env,
458458
dfs_obj_t *parent = NULL;
459459
mode_t tmp_mode;
460460

461-
if (unlikely(!type_id)) {
462-
char *msg = NULL;
463-
464-
asprintf(&msg, "unsupported object class, %s", object_type);
465-
throw_exc(env, msg, CUSTOM_ERR6);
466-
goto out;
467-
}
468461
int rc = dfs_lookup(dfs, parent_path, O_RDWR, &parent,
469462
&tmp_mode, NULL);
470463

src/client/java/hadoop-daos/src/main/java/io/daos/fs/hadoop/DaosFileSystem.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,10 +381,10 @@ public FSDataOutputStream create(Path f,
381381
throw new IOException("failed to delete existing file " + daosFile);
382382
}
383383
}
384-
384+
385385
daosFile.createNewFile(
386386
Constants.DAOS_MODLE,
387-
DaosObjectClass.OC_SX,
387+
DaosObjectClass.OC_UNKNOWN,
388388
this.chunkSize,
389389
true);
390390

src/container/rpc.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -850,7 +850,6 @@ cont_snap_oit_oid_get_in_set_data(crt_rpc_t *rpc, crt_opcode_t opc, int cont_pro
850850
((uuid_t) (tdi_uuid) CRT_VAR)
851851

852852
#define DAOS_OSEQ_TGT_DESTROY /* output fields */ \
853-
/* number of errors */ \
854853
((int32_t) (tdo_rc) CRT_VAR)
855854

856855
CRT_RPC_DECLARE(cont_tgt_destroy, DAOS_ISEQ_TGT_DESTROY, DAOS_OSEQ_TGT_DESTROY)

src/container/srv_container.c

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,16 @@ cont_svc_init(struct cont_svc *svc, const uuid_t pool_uuid, uint64_t id,
102102
D_GOTO(err, rc = dss_abterr2der(rc));
103103
}
104104

105+
rc = ABT_mutex_create(&svc->cs_cont_ephs_mutex);
106+
if (rc != ABT_SUCCESS) {
107+
D_ERROR("failed to create cs_cont_ephs_mutex: %d\n", rc);
108+
D_GOTO(err_rwlock, rc = dss_abterr2der(rc));
109+
}
110+
105111
/* cs_root */
106112
rc = rdb_path_init(&svc->cs_root);
107113
if (rc != 0)
108-
goto err_lock;
114+
goto err_mutex;
109115
rc = rdb_path_push(&svc->cs_root, &rdb_path_root_key);
110116
if (rc != 0)
111117
goto err_root;
@@ -144,7 +150,9 @@ cont_svc_init(struct cont_svc *svc, const uuid_t pool_uuid, uint64_t id,
144150
rdb_path_fini(&svc->cs_uuids);
145151
err_root:
146152
rdb_path_fini(&svc->cs_root);
147-
err_lock:
153+
err_mutex:
154+
ABT_mutex_free(&svc->cs_cont_ephs_mutex);
155+
err_rwlock:
148156
ABT_rwlock_free(&svc->cs_lock);
149157
err:
150158
return rc;
@@ -158,6 +166,7 @@ cont_svc_fini(struct cont_svc *svc)
158166
rdb_path_fini(&svc->cs_uuids);
159167
rdb_path_fini(&svc->cs_root);
160168
ABT_rwlock_free(&svc->cs_lock);
169+
ABT_mutex_free(&svc->cs_cont_ephs_mutex);
161170
}
162171

163172
int
@@ -713,7 +722,6 @@ cont_create_prop_prepare(struct ds_pool_hdl *pool_hdl,
713722
D_ERROR("container global %u version could be not set\n", entry->dpe_type);
714723
return -DER_INVAL;
715724
case DAOS_PROP_CO_OBJ_VERSION:
716-
/* this is a walkaround for 2.6 only */
717725
entry_def->dpe_val = entry->dpe_val;
718726
break;
719727
default:
@@ -768,10 +776,14 @@ cont_create_prop_prepare(struct ds_pool_hdl *pool_hdl,
768776
if (entry_def)
769777
entry_def->dpe_val = pool_hdl->sph_global_ver;
770778

771-
/* inherit object version from pool*/
779+
/*
780+
* New container creation by clients will specify the object version.
781+
* If not specified (dpe_val == 0), it indicates a client from before
782+
* DAOS 2.6.4, so use VERSION 1 for backward compatibility.
783+
*/
772784
entry_def = daos_prop_entry_get(prop_def, DAOS_PROP_CO_OBJ_VERSION);
773785
if (entry_def && entry_def->dpe_val == 0)
774-
entry_def->dpe_val = pool_hdl->sph_obj_ver;
786+
entry_def->dpe_val = DAOS_POOL_OBJ_VERSION_1;
775787

776788
/* for new container set HEALTHY status with current pm ver */
777789
entry_def = daos_prop_entry_get(prop_def, DAOS_PROP_CO_STATUS);
@@ -1046,6 +1058,11 @@ cont_create(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont_svc *sv
10461058
D_GOTO(out, rc = -DER_NO_PERM);
10471059
}
10481060

1061+
/* Reset recov_cont prop to notify in-flight pool_recov_cont to retry. */
1062+
rc = ds_pool_prop_recov_cont_reset(tx, svc->cs_rsvc);
1063+
if (rc != 0)
1064+
goto out;
1065+
10491066
cont_create_in_get_data(rpc, CONT_CREATE, cont_proto_ver, &cprop);
10501067

10511068
/* Determine if the label property was supplied, and if so,
@@ -1291,10 +1308,18 @@ cont_destroy_bcast(crt_context_t ctx, struct cont_svc *svc,
12911308

12921309
out = crt_reply_get(rpc);
12931310
rc = out->tdo_rc;
1294-
if (rc != 0) {
1295-
D_ERROR(DF_CONT": failed to destroy %d targets\n",
1296-
DP_CONT(svc->cs_pool_uuid, cont_uuid), rc);
1297-
rc = -DER_IO;
1311+
if (rc == -DER_BUSY) {
1312+
D_INFO(DF_CONT ": some target busy\n", DP_CONT(svc->cs_pool_uuid, cont_uuid));
1313+
/*
1314+
* Must return an error that ds_pool_svc_ops_save considers
1315+
* retryable. Otherwise, when it is retried, this container
1316+
* destroy operation would always get its result from svc_ops
1317+
* without being executed.
1318+
*/
1319+
rc = -DER_TIMEDOUT;
1320+
} else if (rc != 0) {
1321+
DL_ERROR(rc, DF_CONT ": failed to destroy targets",
1322+
DP_CONT(svc->cs_pool_uuid, cont_uuid));
12981323
}
12991324

13001325
out_rpc:
@@ -1550,6 +1575,11 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont,
15501575
D_DEBUG(DB_MD, DF_CONT ": processing rpc: %p force=%u\n",
15511576
DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid), rpc, force);
15521577

1578+
/* Reset recov_cont prop to notify in-flight pool_recov_cont to retry. */
1579+
rc = ds_pool_prop_recov_cont_reset(tx, cont->c_svc->cs_rsvc);
1580+
if (rc != 0)
1581+
goto out;
1582+
15531583
/* Fetch the container props to check access for delete */
15541584
rc = cont_prop_read(tx, cont,
15551585
DAOS_CO_QUERY_PROP_ACL |
@@ -2032,6 +2062,7 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc)
20322062
return;
20332063
}
20342064

2065+
ABT_mutex_lock(svc->cs_cont_ephs_mutex);
20352066
d_list_for_each_entry_safe(eph_ldr, tmp, &svc->cs_cont_ephs_leader_list, cte_list) {
20362067
if (eph_ldr->cte_deleted) {
20372068
d_list_del(&eph_ldr->cte_list);
@@ -2135,6 +2166,7 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc)
21352166
if (pool->sp_rebuilding)
21362167
break;
21372168
}
2169+
ABT_mutex_unlock(svc->cs_cont_ephs_mutex);
21382170

21392171
map_ranks_fini(&fail_ranks);
21402172
}

src/container/srv_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ struct cont_svc {
9595
/* Manage the EC aggregation epoch and stable epoch */
9696
struct sched_request *cs_cont_ephs_leader_req;
9797
d_list_t cs_cont_ephs_leader_list; /* link cont_track_eph_leader */
98+
ABT_mutex cs_cont_ephs_mutex; /* protect cs_cont_ephs_leader_list */
9899
};
99100

100101
/* Container descriptor */

0 commit comments

Comments
 (0)