Skip to content

Commit 4d62a32

Browse files
Fix a downsample persistent task assignment bug (#106247) (#106280)
If the source downsample index no longer exists when the persistent task is being assigned, the persistent task framework continuously tries to find an assignment and fails with IndexNotFoundException (which gets logged as a warning on the elected master node). This fixes a bug in resolving the shard routing, so that if the index no longer exists, any node is returned and the persistent task can fail gracefully at a later stage. The original fix via #98769 didn't get this part right. Co-authored-by: Elastic Machine <[email protected]>
1 parent 2eb9729 commit 4d62a32

File tree

3 files changed

+140
-1
lines changed

3 files changed

+140
-1
lines changed

docs/changelog/106247.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 106247
2+
summary: Fix a downsample persistent task assignment bug
3+
area: Downsampling
4+
type: bug
5+
issues: []

x-pack/plugin/downsample/src/main/java/org/elasticsearch/xpack/downsample/DownsampleShardPersistentTaskExecutor.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.elasticsearch.client.internal.Client;
2424
import org.elasticsearch.cluster.ClusterState;
2525
import org.elasticsearch.cluster.node.DiscoveryNode;
26+
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
2627
import org.elasticsearch.cluster.routing.ShardRouting;
2728
import org.elasticsearch.common.inject.Inject;
2829
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -135,7 +136,7 @@ public PersistentTasksCustomMetadata.Assignment getAssignment(
135136
// If during re-assignment the source index was deleted, then we need to break out.
136137
// Returning NO_NODE_FOUND just keeps the persistent task until the source index appears again (which would never happen)
137138
// So let's return a node and then in the node operation we would just fail and stop this persistent task
138-
var indexShardRouting = clusterState.routingTable().shardRoutingTable(params.shardId().getIndexName(), params.shardId().id());
139+
var indexShardRouting = findShardRoutingTable(shardId, clusterState);
139140
if (indexShardRouting == null) {
140141
var node = selectLeastLoadedNode(clusterState, candidateNodes, DiscoveryNode::canContainData);
141142
return new PersistentTasksCustomMetadata.Assignment(node.getId(), "a node to fail and stop this persistent task");
@@ -176,6 +177,14 @@ private void delegate(final AllocatedPersistentTask task, final DownsampleShardT
176177
);
177178
}
178179

180+
/**
 * Looks up the routing table of a single shard without throwing when its index is absent.
 *
 * The index routing table is fetched by index name first; only if that lookup yields a
 * non-null result is the shard looked up by its numeric id.
 *
 * @param shardId      the shard whose routing table is wanted
 * @param clusterState the cluster state whose routing table is consulted
 * @return the result of the shard lookup, or {@code null} when the routing table has no entry
 *         for the shard's index name (the caller, getAssignment, then picks a node so that
 *         the persistent task can fail and stop)
 */
private static IndexShardRoutingTable findShardRoutingTable(ShardId shardId, ClusterState clusterState) {
181+
var indexRoutingTable = clusterState.routingTable().index(shardId.getIndexName());
182+
// Guard against a missing index: dereferencing a null index routing table is what must be avoided here.
if (indexRoutingTable != null) {
183+
return indexRoutingTable.shard(shardId.getId());
184+
}
185+
return null;
186+
}
187+
179188
static void realNodeOperation(
180189
Client client,
181190
IndicesService indicesService,
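x-pack/plugin/downsample/src/test/java/org/elasticsearch/xpack/downsample/DownsampleShardPersistentTaskExecutorTests.java

Lines changed: 125 additions & 0 deletions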
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.downsample;
9+
10+
import org.elasticsearch.action.downsample.DownsampleConfig;
11+
import org.elasticsearch.client.internal.Client;
12+
import org.elasticsearch.cluster.ClusterState;
13+
import org.elasticsearch.cluster.metadata.DataStreamTestHelper;
14+
import org.elasticsearch.cluster.node.DiscoveryNode;
15+
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
16+
import org.elasticsearch.cluster.node.DiscoveryNodeUtils;
17+
import org.elasticsearch.cluster.node.DiscoveryNodes;
18+
import org.elasticsearch.cluster.routing.IndexRoutingTable;
19+
import org.elasticsearch.cluster.routing.RoutingTable;
20+
import org.elasticsearch.common.Strings;
21+
import org.elasticsearch.common.UUIDs;
22+
import org.elasticsearch.core.Tuple;
23+
import org.elasticsearch.index.Index;
24+
import org.elasticsearch.index.shard.ShardId;
25+
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
26+
import org.elasticsearch.test.ESTestCase;
27+
import org.elasticsearch.xpack.core.downsample.DownsampleShardTask;
28+
import org.junit.Before;
29+
30+
import java.time.Instant;
31+
import java.time.temporal.ChronoUnit;
32+
import java.util.List;
33+
import java.util.Map;
34+
import java.util.Set;
35+
import java.util.concurrent.Executor;
36+
37+
import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
38+
import static org.elasticsearch.cluster.routing.TestShardRouting.shardRoutingBuilder;
39+
import static org.hamcrest.Matchers.equalTo;
40+
import static org.mockito.Mockito.mock;
41+
42+
/**
 * Unit tests for {@link DownsampleShardPersistentTaskExecutor#getAssignment}.
 *
 * Two scenarios are covered: the task's shard belongs to an index that is present in the
 * routing table, and the task's shard belongs to an index that the routing table does not contain.
 */
public class DownsampleShardPersistentTaskExecutorTests extends ESTestCase {
43+
44+
// Cluster state with the "metrics-app1" data stream; each test layers nodes and routing on top of it.
private ClusterState initialClusterState;
45+
// Executor under test, built with mocked collaborators in setup().
private DownsampleShardPersistentTaskExecutor executor;
46+
47+
// Creates the base cluster state (data stream "metrics-app1" with a single start/end pair running
// from two hours ago to forty minutes from now) and an executor backed by mock Client and Executor.
@Before
48+
public void setup() {
49+
Instant now = Instant.now().truncatedTo(ChronoUnit.MILLIS);
50+
Instant start = now.minus(2, ChronoUnit.HOURS);
51+
Instant end = now.plus(40, ChronoUnit.MINUTES);
52+
initialClusterState = DataStreamTestHelper.getClusterStateWithDataStream("metrics-app1", List.of(new Tuple<>(start, end)));
53+
executor = new DownsampleShardPersistentTaskExecutor(mock(Client.class), DownsampleShardTask.TASK_NAME, mock(Executor.class));
54+
}
55+
56+
// Index present: the routing table holds a STARTED routing for shard 0 of the data stream's write
// index on the single node, and the task params point at that same shard. The assignment must
// name that node as the executor.
public void testGetAssignment() {
57+
var backingIndex = initialClusterState.metadata().dataStreams().get("metrics-app1").getWriteIndex();
58+
var node = newNode();
59+
var shardId = new ShardId(backingIndex, 0);
60+
var clusterState = ClusterState.builder(initialClusterState)
61+
.nodes(new DiscoveryNodes.Builder().add(node).build())
62+
.routingTable(
63+
RoutingTable.builder()
64+
.add(
65+
IndexRoutingTable.builder(backingIndex)
66+
.addShard(shardRoutingBuilder(shardId, node.getId(), true, STARTED).withRecoverySource(null).build())
67+
)
68+
)
69+
.build();
70+
71+
var params = new DownsampleShardTaskParams(
72+
new DownsampleConfig(new DateHistogramInterval("1h")),
73+
shardId.getIndexName(),
74+
1,
75+
1,
76+
shardId,
77+
Strings.EMPTY_ARRAY,
78+
Strings.EMPTY_ARRAY,
79+
Strings.EMPTY_ARRAY
80+
);
81+
var result = executor.getAssignment(params, Set.of(node), clusterState);
82+
assertThat(result.getExecutorNode(), equalTo(node.getId()));
83+
}
84+
85+
// Index missing: the cluster state is built exactly as in testGetAssignment, but the task params
// reference shard 0 of "another_index", for which no routing was added. The executor must still
// return the node, with the explanation that marks the task as one to fail and stop.
public void testGetAssignmentMissingIndex() {
86+
var backingIndex = initialClusterState.metadata().dataStreams().get("metrics-app1").getWriteIndex();
87+
var node = newNode();
88+
var shardId = new ShardId(backingIndex, 0);
89+
var clusterState = ClusterState.builder(initialClusterState)
90+
.nodes(new DiscoveryNodes.Builder().add(node).build())
91+
.routingTable(
92+
RoutingTable.builder()
93+
.add(
94+
IndexRoutingTable.builder(backingIndex)
95+
.addShard(shardRoutingBuilder(shardId, node.getId(), true, STARTED).withRecoverySource(null).build())
96+
)
97+
)
98+
.build();
99+
100+
// Shard id of an index that was never added to the routing table above.
var missingShardId = new ShardId(new Index("another_index", "uid"), 0);
101+
var params = new DownsampleShardTaskParams(
102+
new DownsampleConfig(new DateHistogramInterval("1h")),
103+
missingShardId.getIndexName(),
104+
1,
105+
1,
106+
missingShardId,
107+
Strings.EMPTY_ARRAY,
108+
Strings.EMPTY_ARRAY,
109+
Strings.EMPTY_ARRAY
110+
);
111+
var result = executor.getAssignment(params, Set.of(node), clusterState);
112+
assertThat(result.getExecutorNode(), equalTo(node.getId()));
113+
assertThat(result.getExplanation(), equalTo("a node to fail and stop this persistent task"));
114+
}
115+
116+
// Builds a DiscoveryNode with a random "node_"-prefixed id, a fake transport address, an empty map
// (presumably the node attributes - confirm against DiscoveryNodeUtils.create) and the roles
// returned by DiscoveryNodeRole.roles().
private static DiscoveryNode newNode() {
117+
return DiscoveryNodeUtils.create(
118+
"node_" + UUIDs.randomBase64UUID(random()),
119+
buildNewFakeTransportAddress(),
120+
Map.of(),
121+
DiscoveryNodeRole.roles()
122+
);
123+
}
124+
125+
}

0 commit comments

Comments
 (0)