11/**
22 * (C) Copyright 2016-2024 Intel Corporation.
3- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+ * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44 *
55 * SPDX-License-Identifier: BSD-2-Clause-Patent
66 */
@@ -107,6 +107,8 @@ struct rpc_proto {
107107 uint32_t * ver_array ;
108108 uint32_t array_size ;
109109 uint32_t timeout ;
110+ uint32_t first_timeout_rank ;
111+ crt_context_t ctx ;
110112};
111113
112114static void
@@ -118,6 +120,10 @@ query_cb(struct crt_proto_query_cb_info *cb_info)
118120 if (daos_rpc_retryable_rc (cb_info -> pq_rc )) {
119121 int nr_ranks ;
120122 d_rank_t rank ;
123+ uint32_t timeout = 0 ;
124+
125+ if (cb_info -> pq_rc == - DER_TIMEDOUT && rproto -> first_timeout_rank == CRT_NO_RANK )
126+ rproto -> first_timeout_rank = dc_mgmt_net_get_srv_rank (rproto -> rank_idx );
121127
122128 /** select next rank to issue the retry proto query rpc to */
123129 nr_ranks = dc_mgmt_net_get_num_srv_ranks ();
@@ -127,21 +133,39 @@ query_cb(struct crt_proto_query_cb_info *cb_info)
127133
128134 /** We tried all engines and found none alive */
129135 if (rproto -> num_retries_left <= 0 ) {
130- D_ERROR ("crt_proto_query_with_ctx() failed -- All %d targets tried\n" ,
131- nr_ranks );
132- rproto -> rc = cb_info -> pq_rc ;
133- rproto -> completed = true;
134- return ;
136+ if (rproto -> timeout > 0 ) {
137+ rc = crt_context_get_timeout (rproto -> ctx , & timeout );
138+ D_ASSERT (rc == 0 );
139+ D_ASSERT (timeout != 0 );
140+ }
141+
142+ if (rproto -> timeout == 0 || (timeout > 0 && timeout <= rproto -> timeout ) ||
143+ rproto -> first_timeout_rank == CRT_NO_RANK ) {
144+ D_ERROR ("crt_proto_query_with_ctx() failed, all %d targets tried\n" ,
145+ nr_ranks );
146+ rproto -> rc = cb_info -> pq_rc ;
147+ rproto -> completed = true;
148+ return ;
149+ }
150+
151+ /* Retry once more against the first rank that timed out, using the default timeout. */
152+ rank = rproto -> first_timeout_rank ;
153+ rproto -> timeout = 0 ;
154+ rproto -> num_retries_left = 1 ; /* Only once */
155+
156+ D_NOTE ("No target respond during first cycle quick proto query. Retry once "
157+ "to former timeout rank %u with longer timeout %u\n" , rank , timeout );
158+ } else {
159+ rank = dc_mgmt_net_get_srv_rank (rproto -> rank_idx );
160+ rproto -> timeout += 3 ;
135161 }
136162
137- rank = dc_mgmt_net_get_srv_rank (rproto -> rank_idx );
138163 D_ASSERT (rank != CRT_NO_RANK );
139164 rproto -> ep .ep_rank = rank ;
140165
141- rproto -> timeout += 3 ;
142166 rc = crt_proto_query_with_ctx (& rproto -> ep , rproto -> base_opc , rproto -> ver_array ,
143167 rproto -> array_size , rproto -> timeout , query_cb , rproto ,
144- daos_get_crt_ctx () );
168+ rproto -> ctx );
145169 if (rc ) {
146170 D_ERROR ("crt_proto_query_with_ctx() failed: " DF_RC "\n" , DP_RC (rc ));
147171 rproto -> rc = rc ;
@@ -185,14 +209,16 @@ daos_rpc_proto_query(crt_opcode_t base_opc, uint32_t *ver_array, int count, int
185209 rproto -> num_retries_left = nr_ranks ;
186210 rank = dc_mgmt_net_get_srv_rank (rproto -> rank_idx );
187211 D_ASSERT (rank != CRT_NO_RANK );
188- rproto -> ep .ep_rank = rank ;
189-
190- rproto -> ep .ep_tag = 0 ;
191- rproto -> ver_array = ver_array ;
192- rproto -> array_size = count ;
193- rproto -> ep .ep_grp = sys -> sy_group ;
194- rproto -> base_opc = base_opc ;
195- rproto -> timeout = 3 ;
212+
213+ rproto -> ep .ep_rank = rank ;
214+ rproto -> ep .ep_tag = 0 ;
215+ rproto -> ver_array = ver_array ;
216+ rproto -> array_size = count ;
217+ rproto -> ep .ep_grp = sys -> sy_group ;
218+ rproto -> base_opc = base_opc ;
219+ rproto -> timeout = 3 ;
220+ rproto -> first_timeout_rank = CRT_NO_RANK ;
221+ rproto -> ctx = ctx ;
196222
197223 rc = crt_proto_query_with_ctx (& rproto -> ep , base_opc , ver_array , count , rproto -> timeout ,
198224 query_cb , rproto , ctx );
0 commit comments