Skip to content

Commit 2997189

Browse files
committed
Fix DataNodeRequestSender
1 parent 2c6dd6c commit 2997189

File tree

2 files changed

+24
-9
lines changed

2 files changed

+24
-9
lines changed

muted-tests.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -392,9 +392,6 @@ tests:
392392
- class: org.elasticsearch.test.rest.ClientYamlTestSuiteIT
393393
method: test {yaml=update/100_synthetic_source/keyword}
394394
issue: https://github.com/elastic/elasticsearch/issues/121965
395-
- class: org.elasticsearch.xpack.esql.plugin.DataNodeRequestSenderTests
396-
method: testDoNotRetryOnRequestLevelFailure
397-
issue: https://github.com/elastic/elasticsearch/issues/121966
398395
- class: org.elasticsearch.xpack.searchablesnapshots.hdfs.SecureHdfsSearchableSnapshotsIT
399396
issue: https://github.com/elastic/elasticsearch/issues/121967
400397
- class: org.elasticsearch.xpack.esql.parser.StatementParserTests

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plugin/DataNodeRequestSender.java

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434
import org.elasticsearch.xpack.esql.action.EsqlSearchShardsAction;
3535

3636
import java.util.ArrayList;
37+
import java.util.Collections;
3738
import java.util.HashMap;
39+
import java.util.IdentityHashMap;
3840
import java.util.Iterator;
3941
import java.util.List;
4042
import java.util.Map;
@@ -58,6 +60,7 @@ abstract class DataNodeRequestSender {
5860
private final Map<DiscoveryNode, Semaphore> nodePermits = new HashMap<>();
5961
private final Map<ShardId, ShardFailure> shardFailures = ConcurrentCollections.newConcurrentMap();
6062
private final AtomicBoolean changed = new AtomicBoolean();
63+
private boolean reportedFailure = false; // guarded by sendingLock
6164

6265
DataNodeRequestSender(TransportService transportService, Executor esqlExecutor, CancellableTask rootTask) {
6366
this.transportService = transportService;
@@ -106,7 +109,9 @@ private void trySendingRequestsForPendingShards(TargetShards targetShards, Compu
106109
if (changed.compareAndSet(true, false) == false) {
107110
break;
108111
}
109-
for (ShardId shardId : pendingShardIds) {
112+
final Iterator<ShardId> shardIts = pendingShardIds.iterator();
113+
while (shardIts.hasNext()) {
114+
final ShardId shardId = shardIts.next();
110115
if (targetShards.getShard(shardId).remainingNodes.isEmpty()) {
111116
shardFailures.compute(
112117
shardId,
@@ -115,12 +120,12 @@ private void trySendingRequestsForPendingShards(TargetShards targetShards, Compu
115120
v == null ? new NoShardAvailableActionException(shardId, "no shard copies found") : v.failure
116121
)
117122
);
123+
shardIts.remove();
118124
}
119125
}
120-
if (shardFailures.values().stream().anyMatch(shardFailure -> shardFailure.fatal)) {
121-
for (var e : shardFailures.values()) {
122-
computeListener.acquireAvoid().onFailure(e.failure);
123-
}
126+
if (reportedFailure || shardFailures.values().stream().anyMatch(shardFailure -> shardFailure.fatal)) {
127+
reportedFailure = true;
128+
reportFailures(computeListener);
124129
} else {
125130
var nodeRequests = selectNodeRequests(targetShards);
126131
for (NodeRequest request : nodeRequests) {
@@ -136,6 +141,19 @@ private void trySendingRequestsForPendingShards(TargetShards targetShards, Compu
136141
}
137142
}
138143

144+
private void reportFailures(ComputeListener computeListener) {
145+
assert sendingLock.isHeldByCurrentThread();
146+
Iterator<ShardFailure> it = shardFailures.values().iterator();
147+
Set<Exception> seen = Collections.newSetFromMap(new IdentityHashMap<>());
148+
while (it.hasNext()) {
149+
ShardFailure failure = it.next();
150+
if (seen.add(failure.failure)) {
151+
computeListener.acquireAvoid().onFailure(failure.failure);
152+
}
153+
it.remove();
154+
}
155+
}
156+
139157
private void sendOneNodeRequest(TargetShards targetShards, ComputeListener computeListener, NodeRequest request) {
140158
final ActionListener<List<DriverProfile>> listener = computeListener.acquireCompute();
141159
sendRequest(request.node, request.shardIds, request.aliasFilters, new NodeListener() {
@@ -148,7 +166,7 @@ void onAfter(List<DriverProfile> profiles) {
148166
@Override
149167
public void onResponse(DataNodeComputeResponse response) {
150168
// remove failures of successful shards
151-
for (ShardId shardId : targetShards.shardIds()) {
169+
for (ShardId shardId : request.shardIds()) {
152170
if (response.shardLevelFailures().containsKey(shardId) == false) {
153171
shardFailures.remove(shardId);
154172
}

0 commit comments

Comments
 (0)