Skip to content

Commit bc0c86f

Browse files
committed
DAOS-18388 client: handle timeout for CRT_OPC_PROTO_QUERY RPC
Currently, the initial timeout for the CRT_OPC_PROTO_QUERY RPC is only 3 seconds, which helps the client get going more quickly when some ranks are down. However, it also increases the risk of the query failing with a timeout when there are only a few targets in the system and they are busy or not yet ready when queried. This patch adds one more CRT_OPC_PROTO_QUERY RPC retry against the first rank that reported an RPC timeout. That retry uses the default RPC timeout configuration instead of the small initial value. Signed-off-by: Fan Yong <fan.yong@hpe.com>
1 parent 540df3a commit bc0c86f

File tree

1 file changed

+44
-17
lines changed

1 file changed

+44
-17
lines changed

src/client/api/rpc.c

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* (C) Copyright 2016-2024 Intel Corporation.
3-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
*
55
* SPDX-License-Identifier: BSD-2-Clause-Patent
66
*/
@@ -107,6 +107,8 @@ struct rpc_proto {
107107
uint32_t *ver_array;
108108
uint32_t array_size;
109109
uint32_t timeout;
110+
uint32_t first_timeout_rank;
111+
crt_context_t ctx;
110112
};
111113

112114
static void
@@ -118,6 +120,10 @@ query_cb(struct crt_proto_query_cb_info *cb_info)
118120
if (daos_rpc_retryable_rc(cb_info->pq_rc)) {
119121
int nr_ranks;
120122
d_rank_t rank;
123+
uint32_t timeout = 0;
124+
125+
if (cb_info->pq_rc == -DER_TIMEDOUT && rproto->first_timeout_rank == CRT_NO_RANK)
126+
rproto->first_timeout_rank = dc_mgmt_net_get_srv_rank(rproto->rank_idx);
121127

122128
/** select next rank to issue the retry proto query rpc to */
123129
nr_ranks = dc_mgmt_net_get_num_srv_ranks();
@@ -127,21 +133,40 @@ query_cb(struct crt_proto_query_cb_info *cb_info)
127133

128134
/** We tried all engines and found none alive */
129135
if (rproto->num_retries_left <= 0) {
130-
D_ERROR("crt_proto_query_with_ctx() failed -- All %d targets tried\n",
131-
nr_ranks);
132-
rproto->rc = cb_info->pq_rc;
133-
rproto->completed = true;
134-
return;
136+
if (rproto->timeout > 0) {
137+
rc = crt_context_get_timeout(rproto->ctx, &timeout);
138+
D_ASSERT(rc == 0);
139+
D_ASSERT(timeout != 0);
140+
}
141+
142+
if (rproto->timeout == 0 || (timeout > 0 && timeout <= rproto->timeout) ||
143+
rproto->first_timeout_rank == CRT_NO_RANK) {
144+
D_ERROR("crt_proto_query_with_ctx() failed, all %d targets tried\n",
145+
nr_ranks);
146+
rproto->rc = cb_info->pq_rc;
147+
rproto->completed = true;
148+
return;
149+
}
150+
151+
/* More retry to the first timeout rank with default timeout. */
152+
rank = rproto->first_timeout_rank;
153+
rproto->timeout = 0;
154+
rproto->num_retries_left = 1; /* Only once */
155+
156+
D_NOTE("No target respond during first cycle quick proto query. Retry once "
157+
"to former timeout rank %u with longer timeout value %u\n",
158+
rank, timeout);
159+
} else {
160+
rank = dc_mgmt_net_get_srv_rank(rproto->rank_idx);
161+
rproto->timeout += 3;
135162
}
136163

137-
rank = dc_mgmt_net_get_srv_rank(rproto->rank_idx);
138164
D_ASSERT(rank != CRT_NO_RANK);
139165
rproto->ep.ep_rank = rank;
140166

141-
rproto->timeout += 3;
142167
rc = crt_proto_query_with_ctx(&rproto->ep, rproto->base_opc, rproto->ver_array,
143168
rproto->array_size, rproto->timeout, query_cb, rproto,
144-
daos_get_crt_ctx());
169+
rproto->ctx);
145170
if (rc) {
146171
D_ERROR("crt_proto_query_with_ctx() failed: "DF_RC"\n", DP_RC(rc));
147172
rproto->rc = rc;
@@ -185,14 +210,16 @@ daos_rpc_proto_query(crt_opcode_t base_opc, uint32_t *ver_array, int count, int
185210
rproto->num_retries_left = nr_ranks;
186211
rank = dc_mgmt_net_get_srv_rank(rproto->rank_idx);
187212
D_ASSERT(rank != CRT_NO_RANK);
188-
rproto->ep.ep_rank = rank;
189-
190-
rproto->ep.ep_tag = 0;
191-
rproto->ver_array = ver_array;
192-
rproto->array_size = count;
193-
rproto->ep.ep_grp = sys->sy_group;
194-
rproto->base_opc = base_opc;
195-
rproto->timeout = 3;
213+
214+
rproto->ep.ep_rank = rank;
215+
rproto->ep.ep_tag = 0;
216+
rproto->ver_array = ver_array;
217+
rproto->array_size = count;
218+
rproto->ep.ep_grp = sys->sy_group;
219+
rproto->base_opc = base_opc;
220+
rproto->timeout = 3;
221+
rproto->first_timeout_rank = CRT_NO_RANK;
222+
rproto->ctx = ctx;
196223

197224
rc = crt_proto_query_with_ctx(&rproto->ep, base_opc, ver_array, count, rproto->timeout,
198225
query_cb, rproto, ctx);

0 commit comments

Comments
 (0)