1111
1212import org .apache .lucene .util .ArrayUtil ;
1313import org .apache .lucene .util .automaton .TooComplexToDeterminizeException ;
14+ import org .elasticsearch .ElasticsearchTimeoutException ;
1415import org .elasticsearch .ExceptionsHelper ;
1516import org .elasticsearch .action .ActionListener ;
17+ import org .elasticsearch .action .ActionListenerResponseHandler ;
1618import org .elasticsearch .action .ActionRunnable ;
1719import org .elasticsearch .action .ActionType ;
1820import org .elasticsearch .action .OriginalIndices ;
2224import org .elasticsearch .action .support .ChannelActionListener ;
2325import org .elasticsearch .action .support .HandledTransportAction ;
2426import org .elasticsearch .action .support .RefCountingRunnable ;
25- import org .elasticsearch .client . internal . RemoteClusterClient ;
27+ import org .elasticsearch .action . support . SubscribableListener ;
2628import org .elasticsearch .cluster .ProjectState ;
2729import org .elasticsearch .cluster .block .ClusterBlockLevel ;
2830import org .elasticsearch .cluster .metadata .IndexNameExpressionResolver ;
3739import org .elasticsearch .common .util .concurrent .ThrottledTaskRunner ;
3840import org .elasticsearch .core .Nullable ;
3941import org .elasticsearch .core .Releasable ;
42+ import org .elasticsearch .core .TimeValue ;
4043import org .elasticsearch .core .Tuple ;
4144import org .elasticsearch .index .shard .ShardId ;
4245import org .elasticsearch .indices .IndicesService ;
4851import org .elasticsearch .tasks .Task ;
4952import org .elasticsearch .threadpool .ThreadPool ;
5053import org .elasticsearch .transport .RemoteClusterAware ;
51- import org .elasticsearch .transport .RemoteClusterService ;
54+ import org .elasticsearch .transport .Transport ;
5255import org .elasticsearch .transport .TransportChannel ;
5356import org .elasticsearch .transport .TransportRequestHandler ;
57+ import org .elasticsearch .transport .TransportRequestOptions ;
5458import org .elasticsearch .transport .TransportService ;
5559
5660import java .util .ArrayList ;
@@ -91,6 +95,8 @@ public class TransportFieldCapabilitiesAction extends HandledTransportAction<Fie
9195
9296 private final IndicesService indicesService ;
9397 private final boolean ccsCheckCompatibility ;
98+ private final ThreadPool threadPool ;
99+ private final TimeValue forceConnectTimeoutSecs ;
94100
95101 @ Inject
96102 public TransportFieldCapabilitiesAction (
@@ -117,32 +123,40 @@ public TransportFieldCapabilitiesAction(
117123 new NodeTransportHandler ()
118124 );
119125 this .ccsCheckCompatibility = SearchService .CCS_VERSION_CHECK_SETTING .get (clusterService .getSettings ());
126+ this .threadPool = threadPool ;
127+ this .forceConnectTimeoutSecs = clusterService .getSettings ().getAsTime ("search.ccs.force_connect_timeout" , null );
120128 }
121129
/**
 * Entry point for a field-capabilities request: hands the task, request and listener to
 * {@code executeRequest}, together with an executor lambda for the per-cluster request.
 * The lambda sends the request over the supplied connection by calling
 * {@code transportService.sendRequest} with the {@code REMOTE_TYPE} action name,
 * {@code TransportRequestOptions.EMPTY} and the supplied response handler.
 *
 * NOTE(review): this span is a rendered diff. The line marked "-" is the previous lambda, which
 * called {@code remoteClient.execute}; the lines marked "+" are its replacement.
 */
122130 @ Override
123131 protected void doExecute (Task task , FieldCapabilitiesRequest request , final ActionListener <FieldCapabilitiesResponse > listener ) {
124132 executeRequest (
125133 task ,
126134 request ,
127- (remoteClient , remoteRequest , remoteListener ) -> remoteClient .execute (REMOTE_TYPE , remoteRequest , remoteListener ),
135+ (transportService , conn , fieldCapabilitiesRequest , responseHandler ) -> transportService .sendRequest (
136+ conn ,
137+ REMOTE_TYPE .name (),
138+ fieldCapabilitiesRequest ,
139+ TransportRequestOptions .EMPTY ,
140+ responseHandler
141+ ),
128142 listener
129143 );
130144 }
131145
/**
 * Runs a field-capabilities request with a caller-supplied executor for the per-cluster
 * requests. The work is not done on the calling thread: it is submitted to
 * {@code searchCoordinationExecutor}, where {@code doExecuteForked} is invoked through
 * {@code ActionRunnable.wrap}.
 *
 * @param task the task the request runs under; passed through to {@code doExecuteForked}
 * @param request the field-capabilities request to execute
 * @param linkedRequestExecutor executor for the per-cluster requests; passed through to
 *                              {@code doExecuteForked} (the "-" lines show its previous
 *                              name and type, {@code RemoteRequestExecutor})
 * @param listener receives the final response or failure
 */
132146 public void executeRequest (
133147 Task task ,
134148 FieldCapabilitiesRequest request ,
135- RemoteRequestExecutor remoteRequestExecutor ,
149+ LinkedRequestExecutor linkedRequestExecutor ,
136150 ActionListener <FieldCapabilitiesResponse > listener
137151 ) {
138152 // workaround for https://github.com/elastic/elasticsearch/issues/97916 - TODO remove this when we can
139- searchCoordinationExecutor .execute (ActionRunnable .wrap (listener , l -> doExecuteForked (task , request , remoteRequestExecutor , l )));
153+ searchCoordinationExecutor .execute (ActionRunnable .wrap (listener , l -> doExecuteForked (task , request , linkedRequestExecutor , l )));
140154 }
141155
142156 private void doExecuteForked (
143157 Task task ,
144158 FieldCapabilitiesRequest request ,
145- RemoteRequestExecutor remoteRequestExecutor ,
159+ LinkedRequestExecutor linkedRequestExecutor ,
146160 ActionListener <FieldCapabilitiesResponse > listener
147161 ) {
148162 if (ccsCheckCompatibility ) {
@@ -268,12 +282,6 @@ private void doExecuteForked(
268282 for (Map .Entry <String , OriginalIndices > remoteIndices : remoteClusterIndices .entrySet ()) {
269283 String clusterAlias = remoteIndices .getKey ();
270284 OriginalIndices originalIndices = remoteIndices .getValue ();
271- var remoteClusterClient = transportService .getRemoteClusterService ()
272- .getRemoteClusterClient (
273- clusterAlias ,
274- singleThreadedExecutor ,
275- RemoteClusterService .DisconnectedStrategy .RECONNECT_UNLESS_SKIP_UNAVAILABLE
276- );
277285 FieldCapabilitiesRequest remoteRequest = prepareRemoteRequest (clusterAlias , request , originalIndices , nowInMillis );
278286 ActionListener <FieldCapabilitiesResponse > remoteListener = ActionListener .wrap (response -> {
279287 for (FieldCapabilitiesIndexResponse resp : response .getIndexResponses ()) {
@@ -299,18 +307,34 @@ private void doExecuteForked(
299307 handleIndexFailure .accept (RemoteClusterAware .buildRemoteIndexName (clusterAlias , index ), ex );
300308 }
301309 });
302- remoteRequestExecutor .executeRemoteRequest (
303- remoteClusterClient ,
304- remoteRequest ,
310+
311+ SubscribableListener <Transport .Connection > connectionListener = new SubscribableListener <>();
312+ if (forceConnectTimeoutSecs != null ) {
313+ connectionListener .addTimeout (forceConnectTimeoutSecs , threadPool , singleThreadedExecutor );
314+ }
315+
316+ connectionListener .addListener (
305317 // The underlying transport service may call onFailure with a thread pool other than search_coordinator.
306318 // This fork is a workaround to ensure that the merging of field-caps always occurs on the search_coordinator.
307319 // TODO: remove this workaround after we fixed https://github.com/elastic/elasticsearch/issues/107439
308320 new ForkingOnFailureActionListener <>(
309321 singleThreadedExecutor ,
310322 true ,
311323 ActionListener .releaseAfter (remoteListener , refs .acquire ())
324+ ).delegateFailure (
325+ (responseListener , conn ) -> linkedRequestExecutor .executeRemoteRequest (
326+ transportService ,
327+ conn ,
328+ remoteRequest ,
329+ new ActionListenerResponseHandler <>(responseListener , FieldCapabilitiesResponse ::new , singleThreadedExecutor )
330+ )
312331 )
313332 );
333+
334+ boolean ensureConnected = forceConnectTimeoutSecs != null
335+ || transportService .getRemoteClusterService ().isSkipUnavailable (clusterAlias ) == false ;
336+ transportService .getRemoteClusterService ()
337+ .maybeEnsureConnectedAndGetConnection (clusterAlias , ensureConnected , connectionListener );
314338 }
315339 }
316340 }
@@ -338,11 +362,12 @@ public void onFailure(Exception e) {
338362 });
339363 }
340364
/**
 * Callback that sends one field-capabilities request to a linked cluster.
 * Callers of {@code executeRequest} pass an implementation of this interface; the "-" lines show
 * the previous shape (named {@code RemoteRequestExecutor}, taking a {@code RemoteClusterClient}
 * and an {@code ActionListener}) and the "+" lines show the connection-based replacement.
 */
341- public interface RemoteRequestExecutor {
365+ public interface LinkedRequestExecutor {
// Sends remoteRequest over the given connection and reports the outcome to responseHandler.
// transportService: the transport service to send with
// conn: the connection to the linked cluster
// remoteRequest: the field-capabilities request to send to that cluster
// responseHandler: handler that receives the FieldCapabilitiesResponse or the failure
342366 void executeRemoteRequest (
343- RemoteClusterClient remoteClient ,
367+ TransportService transportService ,
368+ Transport .Connection conn ,
344369 FieldCapabilitiesRequest remoteRequest ,
345- ActionListener <FieldCapabilitiesResponse > remoteListener
370+ ActionListenerResponseHandler <FieldCapabilitiesResponse > responseHandler
346371 );
347372 }
348373
@@ -376,8 +401,20 @@ private static void mergeIndexResponses(
376401 } else {
377402 // we have no responses at all, maybe because of errors
378403 if (indexFailures .isEmpty () == false ) {
379- // throw back the first exception
380- listener .onFailure (failures .get (0 ).getException ());
404+ /*
405+ * Under no circumstances are we to pass timeout errors originating from SubscribableListener as top-level errors.
406+ * Instead, they should always be passed through the response object, as part of "failures".
407+ */
408+ if (failures .stream ()
409+ .anyMatch (
410+ failure -> failure .getException () instanceof IllegalStateException ise
411+ && ise .getCause () instanceof ElasticsearchTimeoutException
412+ )) {
413+ listener .onResponse (new FieldCapabilitiesResponse (Collections .emptyList (), failures ));
414+ } else {
415+ // throw back the first exception
416+ listener .onFailure (failures .get (0 ).getException ());
417+ }
381418 } else {
382419 listener .onResponse (new FieldCapabilitiesResponse (Collections .emptyList (), Collections .emptyList ()));
383420 }
@@ -585,15 +622,24 @@ List<FieldCapabilitiesFailure> build(Set<String> successfulIndices) {
585622 for (Map .Entry <String , Exception > failure : failuresByIndex .entrySet ()) {
586623 String index = failure .getKey ();
587624 Exception e = failure .getValue ();
625+ /*
626+ * The listener we use to briefly try to connect to a linked cluster can fail with an ElasticsearchTimeoutException
627+ * if that cluster cannot be reached. To make sure this scenario is correctly recognised by
628+ * ExceptionsHelper.isRemoteUnavailableException(), we wrap the timeout in an IllegalStateException.
629+ */
630+ if (e instanceof ElasticsearchTimeoutException ete ) {
631+ e = new IllegalStateException ("Unable to open any connections" , ete );
632+ }
588633
589634 if (successfulIndices .contains (index ) == false ) {
590635 // we deduplicate exceptions on the underlying causes message and classname
591636 // we unwrap the cause to e.g. group RemoteTransportExceptions coming from different nodes if the cause is the same
592637 Throwable cause = ExceptionsHelper .unwrapCause (e );
593638 Tuple <String , String > groupingKey = new Tuple <>(cause .getMessage (), cause .getClass ().getName ());
639+ Exception ex = e ;
594640 indexFailures .compute (
595641 groupingKey ,
596- (k , v ) -> v == null ? new FieldCapabilitiesFailure (new String [] { index }, e ) : v .addIndex (index )
642+ (k , v ) -> v == null ? new FieldCapabilitiesFailure (new String [] { index }, ex ) : v .addIndex (index )
597643 );
598644 }
599645 }
0 commit comments