Improve PIT context relocation #135231
New changelog entry:

@@ -0,0 +1,5 @@
+pr: 135231
+summary: Improve retrying PIT contexts for read-only indices
+area: Search
+type: enhancement
+issues: []
Changes to AbstractSearchAsyncAction.java:
@@ -13,6 +13,7 @@
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.TransportVersion;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.NoShardAvailableActionException;
import org.elasticsearch.action.OriginalIndices;
@@ -21,6 +22,7 @@
import org.elasticsearch.action.support.SubscribableListener;
import org.elasticsearch.action.support.TransportActions;
import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.util.Maps;
@@ -30,8 +32,10 @@
import org.elasticsearch.rest.action.search.SearchResponseMetrics;
import org.elasticsearch.search.SearchContextMissingException;
import org.elasticsearch.search.SearchPhaseResult;
+import org.elasticsearch.search.SearchService;
import org.elasticsearch.search.SearchShardTarget;
import org.elasticsearch.search.builder.PointInTimeBuilder;
+import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.internal.AliasFilter;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.internal.ShardSearchContextId;
@@ -40,6 +44,8 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@@ -53,6 +59,7 @@
import java.util.function.Supplier;
import java.util.stream.Collectors;

+import static org.elasticsearch.action.search.TransportClosePointInTimeAction.closeContexts;
import static org.elasticsearch.core.Strings.format;

/**
@@ -94,11 +101,13 @@ abstract class AbstractSearchAsyncAction<Result extends SearchPhaseResult> exten
    private final Map<String, PendingExecutions> pendingExecutionsPerNode;
    private final AtomicBoolean requestCancelled = new AtomicBoolean();
    private final int skippedCount;
+   private final TransportVersion mintransportVersion;
    protected final SearchResponseMetrics searchResponseMetrics;
    protected long phaseStartTimeInNanos;

    // protected for tests
    protected final SubscribableListener<Void> doneFuture = new SubscribableListener<>();
+   private final Supplier<DiscoveryNodes> discoveryNodes;

    AbstractSearchAsyncAction(
        String name,
@@ -153,6 +162,8 @@ abstract class AbstractSearchAsyncAction<Result extends SearchPhaseResult> exten
        this.nodeIdToConnection = nodeIdToConnection;
        this.concreteIndexBoosts = concreteIndexBoosts;
        this.clusterStateVersion = clusterState.version();
+       this.mintransportVersion = clusterState.getMinTransportVersion();
+       this.discoveryNodes = clusterState::nodes;
        this.aliasFilter = aliasFilter;
        this.results = resultConsumer;
        // register the release of the query consumer to free up the circuit breaker memory
@@ -422,6 +433,7 @@ protected final void onShardFailure(final int shardIndex, SearchShardTarget shar
            onShardGroupFailure(shardIndex, shard, e);
        }
        if (lastShard == false) {
+           logger.debug("Retrying shard [{}] with target [{}]", shard.getShardId(), nextShard);
            performPhaseOnShard(shardIndex, shardIt, nextShard);
        } else {
            // count down outstanding shards, we're done with this shard as there's no more copies to try
@@ -613,10 +625,87 @@ public void sendSearchResponse(SearchResponseSections internalSearchResponse, At
    }

    protected BytesReference buildSearchContextId(ShardSearchFailure[] failures) {
-       var source = request.source();
-       return source != null && source.pointInTimeBuilder() != null && source.pointInTimeBuilder().singleSession() == false
-           ? source.pointInTimeBuilder().getEncodedId()
-           : null;
+       SearchSourceBuilder source = request.source();
+       // only (re-)build a search context id if we are running a long-lived point-in-time request
+       if (source != null && source.pointInTimeBuilder() != null && source.pointInTimeBuilder().singleSession() == false) {
+           if (SearchService.PIT_RELOCATION_FEATURE_FLAG.isEnabled()) {
+               // we want to change node ids in the PIT id if any shards and its PIT context have moved
+               return maybeReEncodeNodeIds(
+                   source.pointInTimeBuilder(),
+                   results.getAtomicArray().asList(),
+                   namedWriteableRegistry,
+                   mintransportVersion,
+                   searchTransportService,
+                   discoveryNodes.get(),
+                   logger
+               );
+           } else {
+               return source.pointInTimeBuilder().getEncodedId();
+           }
+       } else {
+           return null;
+       }
    }
Inline review thread on the new maybeReEncodeNodeIds method:

Reviewer: Question: I haven't looked into how we handle partial results with re-encoding. Have you considered this?

Author: I assume you are referring to the functionality added with #111516? If I understand that PR correctly, we will have SearchContextIdForNode entries in the PIT with a "null" node entry. I think in that case we won't add that shard to the shard iterators of any subsequent search, so we won't get a Result for that shard. That is one reason why I copy PIT id entries for everything that has no Result from the old to the new encoded id without change.

Author: I looked again at how we handle partial results. The way I see it, when opening the PIT we can tolerate partial results, i.e. failures from certain shards when they are not available. The result will be an entry in the PIT id that has a ShardId but a SearchContextIdForNode with an empty node/contextId.

+   static <Result extends SearchPhaseResult> BytesReference maybeReEncodeNodeIds(
+       PointInTimeBuilder originalPit,
+       List<Result> results,
+       NamedWriteableRegistry namedWriteableRegistry,
+       TransportVersion mintransportVersion,
+       SearchTransportService searchTransportService,
+       DiscoveryNodes nodes,
+       Logger logger
+   ) {
+       SearchContextId original = originalPit.getSearchContextId(namedWriteableRegistry);
+       // only create the following two collections if we detect an id change
+       Map<ShardId, SearchContextIdForNode> updatedShardMap = null;
+       Collection<SearchContextIdForNode> contextsToClose = null;
+       logger.debug("checking search result shards to detect PIT node changes");
+       for (Result result : results) {
+           SearchShardTarget searchShardTarget = result.getSearchShardTarget();
+           ShardId shardId = searchShardTarget.getShardId();
+           SearchContextIdForNode originalShard = original.shards().get(shardId);
+           if (originalShard != null && originalShard.getSearchContextId() != null && originalShard.getSearchContextId().isRetryable()) {
+               // check if the node is different, if so we need to re-encode the PIT
+               String originalNode = originalShard.getNode();
+               if (originalNode != null && originalNode.equals(searchShardTarget.getNodeId()) == false) {
+                   // the target node for this shard entry in the PIT has changed, we need to update it
+                   if (updatedShardMap == null) {
+                       // initialize the map with entries from old map to keep ids for shards that have not responded in this results
+                       updatedShardMap = new HashMap<>(original.shards());
+                       contextsToClose = new ArrayList<>();
+                   }
+                   SearchContextIdForNode updatedId = new SearchContextIdForNode(
+                       searchShardTarget.getClusterAlias(),
+                       searchShardTarget.getNodeId(),
+                       result.getContextId()
+                   );
+                   logger.debug("changing node for PIT shard id from [{}] to [{}]", originalShard, updatedId);
+                   updatedShardMap.put(shardId, updatedId);
+                   contextsToClose.add(original.shards().get(shardId));
+               }
+           }
+       }
+       if (updatedShardMap != null) {
+           // we free all old contexts that have moved, just in case we have re-tried them elsewhere
+           // but they still exist in the old location
+           closeContexts(nodes, searchTransportService, contextsToClose, new ActionListener<Integer>() {
+               @Override
+               public void onResponse(Integer integer) {
+                   // ignore
+               }
+
+               @Override
+               public void onFailure(Exception e) {
+                   logger.trace("Failure while freeing old point in time contexts", e);
+               }
+           });
+           return SearchContextId.encode(updatedShardMap, original.aliasFilter(), mintransportVersion, ShardSearchFailure.EMPTY_ARRAY);
+       } else {
+           return originalPit.getEncodedId();
+       }
+   }

    /**
Review thread on the closeContexts call above:

Reviewer: This should be async so we wait for old contexts to close before responding. We've seen ML not wait for closing PIT and overload the cluster. But we can do this in a follow-up since this change is behind a feature flag.
Author: I considered waiting here, but that would potentially slow down the search that triggers the id update, if I understand correctly. I assumed the fire-and-forget approach should be okay. I'm also still not 100% clear under which circumstances we would need to close a PIT context that is now on a new node, because the reason we open part of the PIT on a new node is most likely that the context went missing in its original location. So nothing to clean up? Or what am I missing here? Do you remember the details of that scenario? Both for checking how it relates to the situation here and for my own learning, to avoid such scenarios in future work.
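As a rough illustration of the reviewer's point (not part of this PR), the response could be gated on the close call by moving the follow-up work into the listener handed to closeContexts instead of passing a no-op listener. The helper name closeThenRespond and the respond callback are invented for this sketch; the remaining types and the static closeContexts import are assumed to be visible exactly as they are in AbstractSearchAsyncAction.

```java
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.cluster.node.DiscoveryNodes;

import java.util.Collection;

// hypothetical sketch: free the relocated contexts first, then run `respond`,
// instead of the current fire-and-forget call
static void closeThenRespond(
    DiscoveryNodes nodes,
    SearchTransportService searchTransportService,
    Collection<SearchContextIdForNode> contextsToClose,
    Logger logger,
    Runnable respond
) {
    closeContexts(nodes, searchTransportService, contextsToClose, ActionListener.wrap(freed -> {
        // the old contexts were released (or confirmed gone), safe to answer now
        respond.run();
    }, e -> {
        logger.trace("Failure while freeing old point in time contexts", e);
        respond.run();
    }));
}
```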
Reviewer: A shard-level request can fail for various reasons, such as a circuit breaking exception on a data node. In such cases, we try another node, and if successful, we replace the old context with the new one here. Alternatively, we could avoid replacing the search context, which means we do not need to close it. We can maintain a list of failure types (SearchContextMissingException or ...) that indicate the search contexts are no longer available and should be replaced. What do you think?
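A rough, hypothetical sketch of such a check, for illustration only: the helper name is invented and the set of exception types shown is not exhaustive and not something this PR adds.

```java
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.search.SearchContextMissingException;

// hypothetical helper: only treat a shard's PIT entry as relocated when the failure that
// triggered the retry indicates the old reader context is really gone; other failures
// (e.g. a circuit breaker trip) leave the original context intact, so the id would not
// be rewritten and nothing would need closing
static boolean contextNoLongerAvailable(Exception shardFailure) {
    Throwable cause = ExceptionsHelper.unwrapCause(shardFailure);
    return cause instanceof SearchContextMissingException;
}
```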
Author: Thanks for the explanation, I like the idea of conditioning the id rewrite on failure types. However, when looking at how to do this here in AbstractSearchAsyncAction, I noticed that we clear earlier shard failures once another shard copy responds with a result. I'm not sure yet how to make this possible here, or whether there are better places in the code to do it. Need to do some digging.
Author: I added a different solution than the one we talked about, directly in AbstractSearchAsyncAction, without having to modify anything in SearchPhaseResult. Let me know if this looks okay to you, otherwise I'll change it.