Skip to content

Commit ff30fad

Browse files
authored
Fix testCancelOnExpiry (#146015)
The test failed because it submitted the query with a small keep_alive: when data-node drivers were slow to start, the query could expire and be cancelled before any driver had started. This PR changes the test to check for cancellation in two steps: first submit with a large keep_alive and wait until at least one data-node driver has started, then reduce the keep_alive to a much smaller interval and expect the cancellation to happen. Closes #145502
1 parent 038d0b4 commit ff30fad

File tree

2 files changed

+27
-5
lines changed

2 files changed

+27
-5
lines changed

muted-tests.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -217,9 +217,6 @@ tests:
217217
- class: org.elasticsearch.compute.aggregation.SumLongGroupingAggregatorFunctionTests
218218
method: testOverflowInGroupingProducesNullAndWarning
219219
issue: https://github.com/elastic/elasticsearch/issues/145438
220-
- class: org.elasticsearch.xpack.esql.action.AsyncEsqlQueryActionIT
221-
method: testCancelOnExpiry
222-
issue: https://github.com/elastic/elasticsearch/issues/145502
223220
- class: org.elasticsearch.xpack.ml.integration.RegressionIT
224221
method: testTwoJobsWithSameRandomizeSeedUseSameTrainingSet
225222
issue: https://github.com/elastic/elasticsearch/issues/145519

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/AsyncEsqlQueryActionIT.java

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,18 +331,43 @@ public void testUpdateKeepAlive() throws Exception {
331331
}
332332

333333
public void testCancelOnExpiry() throws Exception {
334-
TimeValue keepAlive = timeValueMillis(between(1000, 2000));
335334
var request = asyncEsqlQueryRequest("from test | stats sum(pause_me)").pragmas(queryPragmas())
335+
// small interval so that we can return quickly on submission
336336
.waitForCompletionTimeout(TimeValue.timeValueMillis(between(1, 10)))
337337
.keepOnCompletion(randomBoolean())
338-
.keepAlive(keepAlive);
338+
.allowPartialResults(false)
339+
// large interval so that the query won't be cancelled before its drivers have started
340+
.keepAlive(TimeValue.timeValueMinutes(between(1, 5)));
339341
final String asyncId;
342+
scriptPermits.drainPermits();
340343
try {
341344
try (EsqlQueryResponse initialResponse = client().execute(EsqlQueryAction.INSTANCE, request).actionGet(60, TimeUnit.SECONDS)) {
342345
assertThat(initialResponse.isRunning(), is(true));
343346
assertTrue(initialResponse.asyncExecutionId().isPresent());
344347
asyncId = initialResponse.asyncExecutionId().get();
345348
}
349+
// make sure at least one data node driver has started
350+
assertBusy(() -> {
351+
List<TaskInfo> driverTasks = client().admin()
352+
.cluster()
353+
.prepareListTasks()
354+
.setActions(DriverTaskRunner.ACTION_NAME)
355+
.setDetailed(true)
356+
.get()
357+
.getTasks()
358+
.stream()
359+
.filter(d -> d.status().toString().contains("Lucene"))
360+
.toList();
361+
assertThat(driverTasks, not(empty()));
362+
for (TaskInfo driveTask : driverTasks) {
363+
assertFalse(driveTask.cancelled());
364+
}
365+
});
366+
var getRequest = new GetAsyncResultRequest(asyncId).setWaitForCompletionTimeout(TimeValue.timeValueMillis(between(1, 10)))
367+
.setKeepAlive(timeValueMillis(randomIntBetween(1, 100)));
368+
try (var resp = client().execute(EsqlAsyncGetResultAction.INSTANCE, getRequest).actionGet()) {
369+
assertTrue(resp.isRunning());
370+
}
346371
// all the started drivers were canceled
347372
assertBusy(() -> {
348373
List<TaskInfo> tasks = client().admin()

0 commit comments

Comments
 (0)