Merge branch '09162025/ReshardSplitRequestOnSourceTwoPass' of github.com:ankikuma/elasticsearch into ankikuma-09162025/ReshardSplitRequestOnSourceTwoPass

Tim-Brooks · Tim-Brooks · commit 0c16c45fbd1d · 2025-10-07T12:05:21.000-06:00
diff --git a/server/src/main/java/org/elasticsearch/action/DocWriteRequest.java b/server/src/main/java/org/elasticsearch/action/DocWriteRequest.java
@@ -176,6 +176,8 @@ default void postRoutingProcess(IndexRouting indexRouting) {}
      */
     int route(IndexRouting indexRouting);
 
+    int rerouteAtSourceDuringResharding(IndexRouting indexRouting); // shard number this request routes to when split on the primary
+
     /**
      * Resolves the write index that should receive this request
      * based on the provided index abstraction.
diff --git a/server/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java b/server/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java
@@ -33,7 +33,9 @@
 import org.elasticsearch.cluster.ClusterStateObserver;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
 import org.elasticsearch.cluster.action.shard.ShardStateAction;
+import org.elasticsearch.cluster.metadata.ProjectMetadata;
 import org.elasticsearch.cluster.project.ProjectResolver;
+import org.elasticsearch.cluster.routing.IndexRouting;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.compress.CompressedXContent;
@@ -44,6 +46,7 @@
 import org.elasticsearch.core.Strings;
 import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.core.Tuple;
+import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexingPressure;
 import org.elasticsearch.index.engine.Engine;
 import org.elasticsearch.index.engine.VersionConflictEngineException;
@@ -55,6 +58,7 @@
 import org.elasticsearch.index.mapper.SourceToParse;
 import org.elasticsearch.index.seqno.SequenceNumbers;
 import org.elasticsearch.index.shard.IndexShard;
+import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.translog.Translog;
 import org.elasticsearch.indices.ExecutorSelector;
 import org.elasticsearch.indices.IndicesService;
@@ -70,6 +74,9 @@
 import org.elasticsearch.xcontent.XContentType;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.concurrent.Executor;
 import java.util.function.Consumer;
@@ -163,6 +170,71 @@ protected void shardOperationOnPrimary(
         primary.ensureMutable(listener.delegateFailure((l, ignored) -> super.shardOperationOnPrimary(request, primary, l)), true);
     }
 
+    /**
+     * Regroups the items of a bulk shard request by the shard that each item routes to under the current
+     * resharding state, as computed by {@link DocWriteRequest#rerouteAtSourceDuringResharding(IndexRouting)}.
+     *
+     * <p>NOTE(review): only the refresh policy and the simulated flag of the original are passed to the new
+     * requests; confirm that no other request-level state needs to be carried over.
+     *
+     * @param request the bulk shard request to split
+     * @return the original request keyed by its own shard id when it has no items or when every item still routes
+     *         to that shard; otherwise one new {@link BulkShardRequest} per destination shard
+     */
+    @Override
+    protected Map<ShardId, BulkShardRequest> splitRequestOnPrimary(BulkShardRequest request) {
+        // Resolve the routing for the request's index from the current cluster state.
+        final ClusterState clusterState = clusterService.state();
+        final ProjectMetadata project = projectResolver.getProjectMetadata(clusterState);
+        final ShardId sourceShardId = request.shardId();
+        final Index index = sourceShardId.getIndex();
+        final IndexRouting routing = IndexRouting.fromIndexMetadata(project.getIndexSafe(index));
+
+        final BulkItemRequest[] items = request.items();
+        if (items.length == 0) {  // Nothing to split
+            return Map.of(sourceShardId, request);
+        }
+
+        // Group the items by the shard they route to under the current resharding-split state.
+        final Map<ShardId, List<BulkItemRequest>> requestsByShard = new HashMap<>();
+        for (BulkItemRequest item : items) {
+            final int shardNum = item.request().rerouteAtSourceDuringResharding(routing);
+            requestsByShard.computeIfAbsent(new ShardId(index, shardNum), ignored -> new ArrayList<>()).add(item);
+        }
+
+        // All items belong to either the source shard or target shard.
+        if (requestsByShard.size() == 1) {
+            final ShardId onlyShard = requestsByShard.keySet().iterator().next();
+            if (onlyShard.equals(sourceShardId)) {
+                // Return original request if no items were split to target.
+                return Map.of(sourceShardId, request);
+            }
+            // Create new bulk request that is identical to the original request except the shardId.
+            // TODO: Verify that this is alright because each BulkItemRequest also contains shardId
+            final BulkShardRequest retargeted = new BulkShardRequest(
+                onlyShard,
+                request.getRefreshPolicy(),
+                items,
+                request.isSimulated()
+            );
+            return Map.of(onlyShard, retargeted);
+        }
+
+        // Items are spread over more than one shard: build one request per shard from its group of items.
+        final Map<ShardId, BulkShardRequest> bulkRequestsPerShard = new HashMap<>();
+        for (Map.Entry<ShardId, List<BulkItemRequest>> entry : requestsByShard.entrySet()) {
+            final ShardId shardId = entry.getKey();
+            final BulkShardRequest bulkShardRequest = new BulkShardRequest(
+                shardId,
+                request.getRefreshPolicy(),
+                entry.getValue().toArray(new BulkItemRequest[0]),
+                request.isSimulated()
+            );
+            bulkRequestsPerShard.put(shardId, bulkShardRequest);
+        }
+        return bulkRequestsPerShard;
+    }
+
     @Override
     protected void dispatchedShardOperationOnPrimary(
         BulkShardRequest request,
diff --git a/server/src/main/java/org/elasticsearch/action/delete/DeleteRequest.java b/server/src/main/java/org/elasticsearch/action/delete/DeleteRequest.java
@@ -236,6 +236,11 @@ public int route(IndexRouting indexRouting) {
         return indexRouting.deleteShard(id, routing);
     }
 
+    @Override
+    public int rerouteAtSourceDuringResharding(IndexRouting indexRouting) {
+        return indexRouting.rerouteDeleteRequestIfResharding(id, routing); // same (id, routing) inputs that route() passes
+    }
+
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         super.writeTo(out);
diff --git a/server/src/main/java/org/elasticsearch/action/index/IndexRequest.java b/server/src/main/java/org/elasticsearch/action/index/IndexRequest.java
@@ -928,6 +928,11 @@ public int route(IndexRouting indexRouting) {
         return indexRouting.indexShard(this);
     }
 
+    @Override
+    public int rerouteAtSourceDuringResharding(IndexRouting indexRouting) {
+        return indexRouting.rerouteIndexingRequestIfResharding(this); // like route(), passes the whole request to the router
+    }
+
     public IndexRequest setRequireAlias(boolean requireAlias) {
         this.requireAlias = requireAlias;
         return this;
diff --git a/server/src/main/java/org/elasticsearch/action/support/replication/TransportReplicationAction.java b/server/src/main/java/org/elasticsearch/action/support/replication/TransportReplicationAction.java
@@ -317,6 +317,13 @@ protected abstract void shardOperationOnReplica(
         ActionListener<ReplicaResult> listener
     );
 
+    /**
+     * During resharding a primary request may need to be split by shard; this default returns it unsplit, keyed by its own shard id.
+     */
+    protected Map<ShardId, Request> splitRequestOnPrimary(Request request) {
+        return Map.of(request.shardId(), request);
+    }
+
     /**
      * Cluster level block to check before request execution. Returning null means that no blocks need to be checked.
      */
@@ -508,54 +515,127 @@ public void handleException(TransportException exp) {
                             }
                         }
                     );
-                } else {
-                    setPhase(replicationTask, "primary");
-
-                    final ActionListener<Response> responseListener = ActionListener.wrap(response -> {
-                        adaptResponse(response, primaryShardReference.indexShard);
-
-                        if (syncGlobalCheckpointAfterOperation) {
-                            try {
-                                primaryShardReference.indexShard.maybeSyncGlobalCheckpoint("post-operation");
-                            } catch (final Exception e) {
-                                // only log non-closed exceptions
-                                if (ExceptionsHelper.unwrap(e, AlreadyClosedException.class, IndexShardClosedException.class) == null) {
-                                    // intentionally swallow, a missed global checkpoint sync should not fail this operation
-                                    logger.info(
-                                        () -> format(
-                                            "%s failed to execute post-operation global checkpoint sync",
-                                            primaryShardReference.indexShard.shardId()
-                                        ),
-                                        e
-                                    );
-                                }
+                } else if (reshardSplitShardCountSummary.isUnset()
+                    || reshardSplitShardCountSummary.equals(
+                        SplitShardCountSummary.forIndexing(indexMetadata, primaryRequest.getRequest().shardId().getId())
+                    ) == false) {
+                        // Split Request
+                        Map<ShardId, Request> splitRequests = splitRequestOnPrimary(primaryRequest.getRequest());
+                        int numSplitRequests = splitRequests.size();
+
+                        // splitRequestOnPrimary must handle the case when the request has no items
+                        assert numSplitRequests > 0 : "expected atleast 1 split request";
+                        assert numSplitRequests <= 2 : "number of split requests too many";
+
+                        // System.out.println("numSplitRequests = " + numSplitRequests);
+                        // System.out.println("source shardId = " + primaryRequest.getRequest().shardId().toString());
+                        if (numSplitRequests == 1) {
+                            // System.out.println("shardId = " + splitRequests.entrySet().iterator().next().getKey().toString());
+                            // If the request is for source, same behaviour as before
+                            if (splitRequests.containsKey(primaryRequest.getRequest().shardId())) {
+                                // System.out.println("Execute request on source");
+                                executePrimaryRequest(primaryShardReference, "primary");
+                                // executePrimaryRequest(primaryShardReference, "primary_reshardSplit");
+                            } else {
+                                // System.out.println("Execute request on target");
+                                // If the request is for target, forward request to target.
+                                // TODO: Note that the request still contains the original shardId. We need to test if this will be a
+                                // problem.
+                                setPhase(replicationTask, "primary_reshardSplit_delegation");
+                                // If the request is for target, send request to target node
+                                ShardId targetShardId = splitRequests.entrySet().iterator().next().getKey();
+                                final IndexShard targetShard = getIndexShard(targetShardId);
+                                final ShardRouting target = targetShard.routingEntry();
+                                final Writeable.Reader<Response> reader = TransportReplicationAction.this::newResponseInstance;
+                                DiscoveryNode targetNode = clusterState.nodes().get(target.currentNodeId());
+                                transportService.sendRequest(
+                                    targetNode,
+                                    transportPrimaryAction,
+                                    new ConcreteShardRequest<>(
+                                        primaryRequest.getRequest(),
+                                        target.allocationId().getRelocationId(),
+                                        primaryRequest.getPrimaryTerm()
+                                    ),
+                                    transportOptions,
+                                    new ActionListenerResponseHandler<>(
+                                        onCompletionListener,
+                                        reader,
+                                        TransportResponseHandler.TRANSPORT_WORKER
+                                    ) {
+
+                                        @Override
+                                        public void handleResponse(Response response) {
+                                            setPhase(replicationTask, "finished");
+                                            super.handleResponse(response);
+                                        }
+
+                                        @Override
+                                        public void handleException(TransportException exp) {
+                                            setPhase(replicationTask, "finished");
+                                            super.handleException(exp);
+                                        }
+                                    }
+                                );
                             }
-                        }
+                        } else {
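+                            // TODO: We have requests for both source and target shards. Until this is implemented, this branch
+                            // neither completes onCompletionListener nor closes primaryShardReference.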
+                            // Use a refcounted listener to run both requests async in parallel and collect the responses from both requests
 
-                        assert primaryShardReference.indexShard.isPrimaryMode();
-                        primaryShardReference.close(); // release shard operation lock before responding to caller
-                        setPhase(replicationTask, "finished");
-                        onCompletionListener.onResponse(response);
-                    }, e -> handleException(primaryShardReference, e));
-
-                    new ReplicationOperation<>(
-                        primaryRequest.getRequest(),
-                        primaryShardReference,
-                        responseListener.map(result -> result.replicationResponse),
-                        newReplicasProxy(),
-                        logger,
-                        threadPool,
-                        actionName,
-                        primaryRequest.getPrimaryTerm(),
-                        initialRetryBackoffBound,
-                        retryTimeout
-                    ).execute();
-                }
+                            // Merge responses from source and target before calling onCompletionListener
+                        }
+                    } else {
+                        executePrimaryRequest(primaryShardReference, "primary");
+                    }
             } catch (Exception e) {
                 handleException(primaryShardReference, e);
             }
         }
 
+        private void executePrimaryRequest(final PrimaryShardReference primaryShardReference, String phase) throws Exception {
+            setPhase(replicationTask, phase); // phase label is supplied by the caller
+            // On success: adapt the response, optionally sync the global checkpoint, release the primary reference, then respond.
+            final ActionListener<Response> responseListener = ActionListener.wrap(response -> {
+                adaptResponse(response, primaryShardReference.indexShard);
+
+                if (syncGlobalCheckpointAfterOperation) {
+                    try {
+                        primaryShardReference.indexShard.maybeSyncGlobalCheckpoint("post-operation");
+                    } catch (final Exception e) {
+                        // only log failures that are not caused by the shard already being closed
+                        if (ExceptionsHelper.unwrap(e, AlreadyClosedException.class, IndexShardClosedException.class) == null) {
+                            // intentionally swallow, a missed global checkpoint sync should not fail this operation
+                            logger.info(
+                                () -> format(
+                                    "%s failed to execute post-operation global checkpoint sync",
+                                    primaryShardReference.indexShard.shardId()
+                                ),
+                                e
+                            );
+                        }
+                    }
+                }
+
+                assert primaryShardReference.indexShard.isPrimaryMode();
+                primaryShardReference.close(); // release shard operation lock before responding to caller
+                setPhase(replicationTask, "finished");
+                onCompletionListener.onResponse(response);
+            }, e -> handleException(primaryShardReference, e));
+            // Replicate the request held by primaryRequest; the result's replicationResponse is what the listener above receives.
+            new ReplicationOperation<>(
+                primaryRequest.getRequest(),
+                primaryShardReference,
+                responseListener.map(result -> result.replicationResponse),
+                newReplicasProxy(),
+                logger,
+                threadPool,
+                actionName,
+                primaryRequest.getPrimaryTerm(),
+                initialRetryBackoffBound,
+                retryTimeout
+            ).execute();
+        }
+
         private void handleException(PrimaryShardReference primaryShardReference, Exception e) {
             Releasables.closeWhileHandlingException(primaryShardReference); // release shard operation lock before responding to caller
             onFailure(e);
diff --git a/server/src/main/java/org/elasticsearch/action/update/UpdateRequest.java b/server/src/main/java/org/elasticsearch/action/update/UpdateRequest.java
@@ -688,6 +688,11 @@ public int route(IndexRouting indexRouting) {
         return indexRouting.updateShard(id, routing);
     }
 
+    @Override
+    public int rerouteAtSourceDuringResharding(IndexRouting indexRouting) {
+        return indexRouting.rerouteUpdateRequestIfResharding(id, routing); // same (id, routing) inputs that route() passes
+    }
+
     public UpdateRequest setRequireAlias(boolean requireAlias) {
         this.requireAlias = requireAlias;
         return this;
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java