-
Notifications
You must be signed in to change notification settings - Fork 25.5k
Improve PIT context relocation #135231
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Improve PIT context relocation #135231
Changes from 4 commits
73a174f
730efd4
ad4219c
fb526d4
6bcb45f
841c5b3
b76a959
237ea4c
9085194
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 135231 | ||
summary: Improve PIT context relocation | ||
area: Search | ||
type: enhancement | ||
issues: [] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
import org.apache.lucene.util.SetOnce; | ||
import org.elasticsearch.ElasticsearchException; | ||
import org.elasticsearch.ExceptionsHelper; | ||
import org.elasticsearch.TransportVersion; | ||
import org.elasticsearch.action.ActionListener; | ||
import org.elasticsearch.action.NoShardAvailableActionException; | ||
import org.elasticsearch.action.OriginalIndices; | ||
|
@@ -31,6 +32,7 @@ | |
import org.elasticsearch.search.SearchPhaseResult; | ||
import org.elasticsearch.search.SearchShardTarget; | ||
import org.elasticsearch.search.builder.PointInTimeBuilder; | ||
import org.elasticsearch.search.builder.SearchSourceBuilder; | ||
import org.elasticsearch.search.internal.AliasFilter; | ||
import org.elasticsearch.search.internal.SearchContext; | ||
import org.elasticsearch.search.internal.ShardSearchContextId; | ||
|
@@ -39,8 +41,10 @@ | |
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.ConcurrentLinkedQueue; | ||
import java.util.concurrent.Executor; | ||
|
@@ -93,6 +97,7 @@ abstract class AbstractSearchAsyncAction<Result extends SearchPhaseResult> exten | |
private final Map<String, PendingExecutions> pendingExecutionsPerNode; | ||
private final AtomicBoolean requestCancelled = new AtomicBoolean(); | ||
private final int skippedCount; | ||
private final TransportVersion mintransportVersion; | ||
|
||
// protected for tests | ||
protected final SubscribableListener<Void> doneFuture = new SubscribableListener<>(); | ||
|
@@ -149,6 +154,7 @@ abstract class AbstractSearchAsyncAction<Result extends SearchPhaseResult> exten | |
this.nodeIdToConnection = nodeIdToConnection; | ||
this.concreteIndexBoosts = concreteIndexBoosts; | ||
this.clusterStateVersion = clusterState.version(); | ||
this.mintransportVersion = clusterState.getMinTransportVersion(); | ||
this.aliasFilter = aliasFilter; | ||
this.results = resultConsumer; | ||
// register the release of the query consumer to free up the circuit breaker memory | ||
|
@@ -416,6 +422,7 @@ protected final void onShardFailure(final int shardIndex, SearchShardTarget shar | |
onShardGroupFailure(shardIndex, shard, e); | ||
} | ||
if (lastShard == false) { | ||
logger.debug("Retrying shard [{}] with target [{}]", shard.getShardId(), nextShard); | ||
performPhaseOnShard(shardIndex, shardIt, nextShard); | ||
} else { | ||
// count down outstanding shards, we're done with this shard as there's no more copies to try | ||
|
@@ -607,10 +614,70 @@ public void sendSearchResponse(SearchResponseSections internalSearchResponse, At | |
} | ||
|
||
protected BytesReference buildSearchContextId(ShardSearchFailure[] failures) { | ||
var source = request.source(); | ||
return source != null && source.pointInTimeBuilder() != null && source.pointInTimeBuilder().singleSession() == false | ||
? source.pointInTimeBuilder().getEncodedId() | ||
: null; | ||
SearchSourceBuilder source = request.source(); | ||
// only (re-)build a search context id if we have a point in time | ||
if (source != null && source.pointInTimeBuilder() != null && source.pointInTimeBuilder().singleSession() == false) { | ||
// we want to change node ids in the PIT id if any shards and its PIT context have moved | ||
return maybeReEncodeNodeIds( | ||
source.pointInTimeBuilder(), | ||
results.getAtomicArray().asList(), | ||
failures, | ||
namedWriteableRegistry, | ||
mintransportVersion | ||
); | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
/**
 * Re-encodes the given point in time id if any shard's PIT context has moved to a different
 * node since the id was created, as observed from the per-shard results of the current search.
 * <p>
 * For each result whose shard, cluster alias and context id match the corresponding entry in
 * the original PIT id, the node id is compared; a mismatch triggers a re-encode with the new
 * node id. Shards from the original PIT id that produced no result in this search (e.g. the
 * query rewrote to match none, or the shard failed) are carried over unchanged.
 * Entries with a {@code null} node (partial PIT results) are never modified.
 *
 * @param originalPit             the point in time sent with the request
 * @param results                 the per-shard results of the current search phase
 * @param failures                shard failures of the current search, encoded into the new id
 * @param namedWriteableRegistry  registry used to decode the original PIT id
 * @param mintransportVersion     minimum transport version of the cluster, used when encoding
 * @return a newly encoded PIT id if any node id changed, otherwise the original encoded id
 */
static <Result extends SearchPhaseResult> BytesReference maybeReEncodeNodeIds(
    PointInTimeBuilder originalPit,
    List<Result> results,
    ShardSearchFailure[] failures,
    NamedWriteableRegistry namedWriteableRegistry,
    TransportVersion mintransportVersion
) {
    SearchContextId original = originalPit.getSearchContextId(namedWriteableRegistry);
    boolean idChanged = false;
    Map<ShardId, SearchContextIdForNode> updatedShardMap = null; // only create this if we detect a change
    for (Result result : results) {
        SearchShardTarget searchShardTarget = result.getSearchShardTarget();
        ShardId shardId = searchShardTarget.getShardId();
        SearchContextIdForNode originalShard = original.shards().get(shardId);
        // only consider entries that unambiguously refer to the same PIT context:
        // same shard, same cluster alias and same search context id
        if (originalShard != null
            && Objects.equals(originalShard.getClusterAlias(), searchShardTarget.getClusterAlias())
            && Objects.equals(originalShard.getSearchContextId(), result.getContextId())) {
            // result shard and context id match the originalShard one, check if the node is different and replace if so
            // NOTE: a null originalNode (partial PIT entry) is deliberately left untouched
            String originalNode = originalShard.getNode();
            if (originalNode != null && originalNode.equals(searchShardTarget.getNodeId()) == false) {
                // the target node for this shard entry in the PIT has changed, we need to update it
                idChanged = true;
                if (updatedShardMap == null) {
                    updatedShardMap = new HashMap<>(original.shards().size());
                }
                updatedShardMap.put(
                    shardId,
                    new SearchContextIdForNode(
                        originalShard.getClusterAlias(),
                        searchShardTarget.getNodeId(),
                        originalShard.getSearchContextId()
                    )
                );
            }
        }
    }
    if (idChanged) {
        // we also need to add shards that are not in the results for some reason (e.g. query rewrote to match none) but that
        // were part of the original PIT
        for (ShardId shardId : original.shards().keySet()) {
            if (updatedShardMap.containsKey(shardId) == false) {
                updatedShardMap.put(shardId, original.shards().get(shardId));
            }
        }
        return SearchContextId.encode(updatedShardMap, original.aliasFilter(), mintransportVersion, failures);
    } else {
        // nothing moved: return the id as sent by the client, avoiding a re-encode
        return originalPit.getEncodedId();
    }
}
|
||
/** | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Question: I haven't looked into how we handle partial results with re-encoding. Have you considered this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assume you are referring to the functionality added with #111516? If I understand that PR correctly we will have SearchContextIdForNode entries in the PIT with a "null" node entry. I think in that case we won't add that shard to the shard iterators of any subsequent search, so we won't get a Result for that shard. That is one reason why I added copying PIT id entries for everything that has no Result from the old to the new encoded ID without change.
Is that what you mean?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I looked again at how we handle partial results. The way I see it, when opening the PIT we can tolerate partial results, i.e. failures from certain shards when they are not available. The result will be an entry in the PIT id that has a ShardId but a SearchContextIdForNode with an empty node/contextId
The way I see it, we should not change any of these entries, which is what is already happening in this method.
In addition, the re-encoding step here doesn't change any entries for shards that have failed in the last search. They shouldn't be included in the "results" list and for that reason won't update the related part in the updated PIT id. In cases where these failures are temporary, subsequent searches with the updated ID will try to hit the "old" shard context locations; if any of these can be re-tried we will update that part of the PIT in a later call.