Skip to content

Commit ecda919

Browse files
authored
Support real-time gets on hollow shards (#122012)
* Support returning `getLastUnsafeSegmentGenerationForGets` for any `Engine`, not only `InternalEngine` * Retry real-time gets on `AlreadyClosedException` in case a shard's engine gets swapped. See ES-10571
1 parent 2b9d7f6 commit ecda919

File tree

5 files changed

+30
-7
lines changed

5 files changed

+30
-7
lines changed

server/src/main/java/org/elasticsearch/action/get/TransportGetAction.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import org.apache.logging.log4j.LogManager;
1313
import org.apache.logging.log4j.Logger;
14+
import org.apache.lucene.store.AlreadyClosedException;
1415
import org.elasticsearch.ElasticsearchException;
1516
import org.elasticsearch.ExceptionsHelper;
1617
import org.elasticsearch.action.ActionListener;
@@ -230,7 +231,10 @@ private void getFromTranslog(
230231
final var retryingListener = listener.delegateResponse((l, e) -> {
231232
final var cause = ExceptionsHelper.unwrapCause(e);
232233
logger.debug("get_from_translog failed", cause);
233-
if (cause instanceof ShardNotFoundException || cause instanceof IndexNotFoundException) {
234+
if (cause instanceof ShardNotFoundException
235+
|| cause instanceof IndexNotFoundException
236+
|| cause instanceof AlreadyClosedException) {
237+
// TODO AlreadyClosedException the engine reset should be fixed by ES-10826
234238
logger.debug("retrying get_from_translog");
235239
observer.waitForNextChange(new ClusterStateObserver.Listener() {
236240
@Override
@@ -245,7 +249,13 @@ public void onClusterServiceClose() {
245249

246250
@Override
247251
public void onTimeout(TimeValue timeout) {
248-
l.onFailure(new ElasticsearchException("Timed out retrying get_from_translog", cause));
252+
// TODO AlreadyClosedException the engine reset should be fixed by ES-10826
253+
if (cause instanceof AlreadyClosedException) {
254+
// Do an additional retry just in case AlreadyClosedException didn't generate a cluster update
255+
tryGetFromTranslog(request, indexShard, node, l);
256+
} else {
257+
l.onFailure(new ElasticsearchException("Timed out retrying get_from_translog", cause));
258+
}
249259
}
250260
});
251261
} else {

server/src/main/java/org/elasticsearch/action/get/TransportGetFromTranslogAction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ protected void doExecute(Task task, Request request, ActionListener<Response> li
8181
if (engine == null) {
8282
throw new AlreadyClosedException("engine closed");
8383
}
84-
segmentGeneration = ((InternalEngine) engine).getLastUnsafeSegmentGenerationForGets();
84+
segmentGeneration = engine.getLastUnsafeSegmentGenerationForGets();
8585
}
8686
return new Response(result, indexShard.getOperationPrimaryTerm(), segmentGeneration);
8787
});

server/src/main/java/org/elasticsearch/action/get/TransportShardMultiGetAction.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import org.apache.logging.log4j.LogManager;
1313
import org.apache.logging.log4j.Logger;
14+
import org.apache.lucene.store.AlreadyClosedException;
1415
import org.elasticsearch.ElasticsearchException;
1516
import org.elasticsearch.ExceptionsHelper;
1617
import org.elasticsearch.action.ActionListener;
@@ -211,7 +212,10 @@ private void shardMultiGetFromTranslog(
211212
final var retryingListener = listener.delegateResponse((l, e) -> {
212213
final var cause = ExceptionsHelper.unwrapCause(e);
213214
logger.debug("mget_from_translog[shard] failed", cause);
214-
if (cause instanceof ShardNotFoundException || cause instanceof IndexNotFoundException) {
215+
if (cause instanceof ShardNotFoundException
216+
|| cause instanceof IndexNotFoundException
217+
|| cause instanceof AlreadyClosedException) {
218+
// TODO AlreadyClosedException the engine reset should be fixed by ES-10826
215219
logger.debug("retrying mget_from_translog[shard]");
216220
observer.waitForNextChange(new ClusterStateObserver.Listener() {
217221
@Override
@@ -226,7 +230,13 @@ public void onClusterServiceClose() {
226230

227231
@Override
228232
public void onTimeout(TimeValue timeout) {
229-
l.onFailure(new ElasticsearchException("Timed out retrying mget_from_translog[shard]", cause));
233+
// TODO AlreadyClosedException the engine reset should be fixed by ES-10826
234+
if (cause instanceof AlreadyClosedException) {
235+
// Do an additional retry just in case AlreadyClosedException didn't generate a cluster update
236+
tryShardMultiGetFromTranslog(request, indexShard, node, l);
237+
} else {
238+
l.onFailure(new ElasticsearchException("Timed out retrying mget_from_translog[shard]", cause));
239+
}
230240
}
231241
});
232242
} else {

server/src/main/java/org/elasticsearch/action/get/TransportShardMultiGetFomTranslogAction.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.elasticsearch.common.io.stream.StreamOutput;
2424
import org.elasticsearch.index.IndexService;
2525
import org.elasticsearch.index.engine.Engine;
26-
import org.elasticsearch.index.engine.InternalEngine;
2726
import org.elasticsearch.index.shard.IndexShard;
2827
import org.elasticsearch.index.shard.ShardId;
2928
import org.elasticsearch.indices.IndicesService;
@@ -102,7 +101,7 @@ protected void doExecute(Task task, Request request, ActionListener<Response> li
102101
if (engine == null) {
103102
throw new AlreadyClosedException("engine closed");
104103
}
105-
segmentGeneration = ((InternalEngine) engine).getLastUnsafeSegmentGenerationForGets();
104+
segmentGeneration = engine.getLastUnsafeSegmentGenerationForGets();
106105
}
107106
return new Response(multiGetShardResponse, indexShard.getOperationPrimaryTerm(), segmentGeneration);
108107
});

server/src/main/java/org/elasticsearch/index/engine/Engine.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2346,4 +2346,8 @@ public record FlushResult(boolean flushPerformed, long generation) {
23462346
public void prepareForEngineReset() throws IOException {
23472347
throw new UnsupportedOperationException("does not support engine reset");
23482348
}
2349+
2350+
public long getLastUnsafeSegmentGenerationForGets() {
2351+
throw new UnsupportedOperationException("Doesn't support getting the latest segment generation");
2352+
}
23492353
}

0 commit comments

Comments
 (0)