|
7 | 7 |
|
8 | 8 | package org.elasticsearch.xpack.enrich;
|
9 | 9 |
|
| 10 | +import org.elasticsearch.ElasticsearchException; |
| 11 | +import org.elasticsearch.ElasticsearchTimeoutException; |
10 | 12 | import org.elasticsearch.ResourceNotFoundException;
|
11 | 13 | import org.elasticsearch.action.ActionListener;
|
12 | 14 | import org.elasticsearch.action.ActionRequest;
|
13 | 15 | import org.elasticsearch.action.ActionResponse;
|
14 | 16 | import org.elasticsearch.action.ActionType;
|
15 | 17 | import org.elasticsearch.action.LatchedActionListener;
|
| 18 | +import org.elasticsearch.action.admin.cluster.node.tasks.get.GetTaskAction; |
| 19 | +import org.elasticsearch.action.support.PlainActionFuture; |
16 | 20 | import org.elasticsearch.client.Client;
|
17 | 21 | import org.elasticsearch.cluster.ClusterName;
|
18 | 22 | import org.elasticsearch.cluster.ClusterState;
|
|
22 | 26 | import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
|
23 | 27 | import org.elasticsearch.core.Map;
|
24 | 28 | import org.elasticsearch.indices.TestIndexNameExpressionResolver;
|
| 29 | +import org.elasticsearch.tasks.TaskId; |
25 | 30 | import org.elasticsearch.test.ESTestCase;
|
26 | 31 | import org.elasticsearch.test.client.NoOpClient;
|
27 | 32 | import org.elasticsearch.threadpool.TestThreadPool;
|
28 | 33 | import org.elasticsearch.threadpool.ThreadPool;
|
29 | 34 | import org.elasticsearch.xcontent.XContentType;
|
30 | 35 | import org.elasticsearch.xpack.core.enrich.EnrichPolicy;
|
31 | 36 | import org.elasticsearch.xpack.core.enrich.action.ExecuteEnrichPolicyAction;
|
| 37 | +import org.elasticsearch.xpack.enrich.action.InternalExecutePolicyAction; |
32 | 38 | import org.junit.AfterClass;
|
| 39 | +import org.junit.Assert; |
33 | 40 | import org.junit.BeforeClass;
|
34 | 41 |
|
| 42 | +import java.util.concurrent.BrokenBarrierException; |
35 | 43 | import java.util.concurrent.CountDownLatch;
|
| 44 | +import java.util.concurrent.CyclicBarrier; |
36 | 45 | import java.util.concurrent.TimeUnit;
|
| 46 | +import java.util.concurrent.TimeoutException; |
| 47 | +import java.util.concurrent.atomic.AtomicBoolean; |
37 | 48 |
|
38 | 49 | import static org.hamcrest.CoreMatchers.containsString;
|
39 | 50 | import static org.hamcrest.CoreMatchers.equalTo;
|
40 | 51 | import static org.hamcrest.CoreMatchers.is;
|
| 52 | +import static org.hamcrest.CoreMatchers.notNullValue; |
| 53 | +import static org.hamcrest.CoreMatchers.nullValue; |
41 | 54 | import static org.hamcrest.Matchers.empty;
|
42 | 55 | import static org.mockito.Mockito.mock;
|
43 | 56 | import static org.mockito.Mockito.when;
|
@@ -183,6 +196,191 @@ public void testMaximumPolicyExecutionLimit() throws InterruptedException {
|
183 | 196 | finalTaskComplete.await();
|
184 | 197 | }
|
185 | 198 |
|
| 199 | + public void testWaitForCompletionConditionRemainsLocked() throws Exception { |
| 200 | + String testPolicyName = "test_policy"; |
| 201 | + String testTaskId = randomAlphaOfLength(10) + ":" + randomIntBetween(100, 300); |
| 202 | + boolean completeWithResourceNotFound = randomBoolean(); |
| 203 | + |
| 204 | + // Client calls are forked to a different thread which will await on this latch before actually running anything |
| 205 | + CountDownLatch clientBlockingLatch = new CountDownLatch(1); |
| 206 | + // When the client is called with a GetTask call a second time, it should count down this latch, so we can check the lock status. |
| 207 | + CountDownLatch secondGetTaskWasCalled = new CountDownLatch(1); |
| 208 | + // A barrier to repeatedly control when the async client will respond with Get Task API results. |
| 209 | + CyclicBarrier getTaskActionBlockingBarrier = new CyclicBarrier(2); |
| 210 | + // State flag to ensure first Get Task API call will fail. |
| 211 | + AtomicBoolean shouldGetTaskApiReturnTimeout = new AtomicBoolean(true); |
| 212 | + |
| 213 | + // Create the async testing client |
| 214 | + Client client = new NoOpClient(testThreadPool) { |
| 215 | + @Override |
| 216 | + protected <Request extends ActionRequest, Response extends ActionResponse> void doExecute( |
| 217 | + ActionType<Response> action, |
| 218 | + Request request, |
| 219 | + ActionListener<Response> listener |
| 220 | + ) { |
| 221 | + // Validate the request on the submitting thread before forking its execution. |
| 222 | + if (request instanceof InternalExecutePolicyAction.Request) { |
| 223 | + assertFalse(((InternalExecutePolicyAction.Request) request).isWaitForCompletion()); |
| 224 | + } |
| 225 | + // Execute all client operations on another thread. |
| 226 | + testThreadPool.generic().execute(() -> { |
| 227 | + try { |
| 228 | + // All client operations should wait until we're ready in the test. |
| 229 | + clientBlockingLatch.await(); |
| 230 | + } catch (InterruptedException e) { |
| 231 | + Thread.currentThread().interrupt(); |
| 232 | + } |
| 233 | + |
| 234 | + if (GetTaskAction.INSTANCE.equals(action)) { |
| 235 | + if (shouldGetTaskApiReturnTimeout.get() == false) { |
| 236 | + // This is the second call to the Get Task API, so count down the latch to let the main test logic know. |
| 237 | + secondGetTaskWasCalled.countDown(); |
| 238 | + } |
| 239 | + // Enrich uses GetTaskAction to detect when the task completes during wait_for_completion. The first call will |
| 240 | + // throw a timeout, and all remaining calls will return normally. |
| 241 | + try { |
| 242 | + // Wait until the signal is given to respond to the get task action |
| 243 | + getTaskActionBlockingBarrier.await(); |
| 244 | + } catch (InterruptedException | BrokenBarrierException e) { |
| 245 | + throw new RuntimeException(e); |
| 246 | + } |
| 247 | + // First call is a timeout to test the recovery logic. Remaining calls will no-op which should complete |
| 248 | + // the execution. |
| 249 | + if (shouldGetTaskApiReturnTimeout.getAndSet(false)) { |
| 250 | + listener.onFailure(new ElasticsearchTimeoutException("Test call has timed out")); |
| 251 | + } else if (completeWithResourceNotFound) { |
| 252 | + listener.onFailure(new ElasticsearchException("Test wrapping", new ResourceNotFoundException("test"))); |
| 253 | + } else { |
| 254 | + listener.onResponse(null); |
| 255 | + } |
| 256 | + } else if (InternalExecutePolicyAction.INSTANCE.equals(action)) { |
| 257 | + // Return a fake task id for the run |
| 258 | + @SuppressWarnings("unchecked") |
| 259 | + Response response = (Response) new ExecuteEnrichPolicyAction.Response(new TaskId(testTaskId)); |
| 260 | + listener.onResponse(response); |
| 261 | + } else { |
| 262 | + listener.onResponse(null); |
| 263 | + } |
| 264 | + }); |
| 265 | + } |
| 266 | + }; |
| 267 | + |
| 268 | + // Set up |
| 269 | + final EnrichPolicyLocks enrichPolicyLocks = new EnrichPolicyLocks(); |
| 270 | + final EnrichPolicyExecutor testExecutor = new EnrichPolicyExecutor( |
| 271 | + Settings.EMPTY, |
| 272 | + null, |
| 273 | + client, |
| 274 | + testThreadPool, |
| 275 | + TestIndexNameExpressionResolver.newInstance(testThreadPool.getThreadContext()), |
| 276 | + enrichPolicyLocks, |
| 277 | + ESTestCase::randomNonNegativeLong |
| 278 | + ); |
| 279 | + |
| 280 | + // Launch a fake policy run that will block until firstTaskBlock is counted down. |
| 281 | + PlainActionFuture<ExecuteEnrichPolicyAction.Response> firstTaskResult = PlainActionFuture.newFuture(); |
| 282 | + testExecutor.coordinatePolicyExecution( |
| 283 | + new ExecuteEnrichPolicyAction.Request(testPolicyName).setWaitForCompletion(false), |
| 284 | + firstTaskResult |
| 285 | + ); |
| 286 | + |
| 287 | + // Check to make sure the policy is locked. Do this instead of an assertTrue so that we can clean up if something breaks. |
| 288 | + if (enrichPolicyLocks.lockedPolices().contains(testPolicyName) == false) { |
| 289 | + // If this fails, be a good citizen and conclude the fake runs to keep the logs clean from interrupted exceptions during cleanup |
| 290 | + clientBlockingLatch.countDown(); |
| 291 | + try { |
| 292 | + firstTaskResult.get(3, TimeUnit.SECONDS); |
| 293 | + } catch (Exception e) { |
| 294 | + logger.error("Encountered ignorable exception during test cleanup"); |
| 295 | + } |
| 296 | + try { |
| 297 | + // Wait on the timing out request |
| 298 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 299 | + // Wait on the response request |
| 300 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 301 | + } catch (InterruptedException | BrokenBarrierException | TimeoutException e) { |
| 302 | + logger.error("Encountered ignorable barrier wait exception during test cleanup"); |
| 303 | + } |
| 304 | + fail("Enrich policy was not locked during task submission when it should have been"); |
| 305 | + } |
| 306 | + |
| 307 | + // Free the client to execute |
| 308 | + clientBlockingLatch.countDown(); |
| 309 | + |
| 310 | + // Wait for task id to be returned |
| 311 | + try { |
| 312 | + ExecuteEnrichPolicyAction.Response response = firstTaskResult.actionGet(); |
| 313 | + assertThat(response.getStatus(), is(nullValue())); |
| 314 | + assertThat(response.getTaskId(), is(notNullValue())); |
| 315 | + } catch (AssertionError e) { |
| 316 | + // conclude the fake runs |
| 317 | + try { |
| 318 | + // Wait on the timing out request |
| 319 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 320 | + // Wait on the response request |
| 321 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 322 | + } catch (InterruptedException | BrokenBarrierException | TimeoutException be) { |
| 323 | + logger.error("Encountered ignorable barrier wait exception during test cleanup"); |
| 324 | + } |
| 325 | + throw e; |
| 326 | + } |
| 327 | + |
| 328 | + // Check to make sure the policy is locked still |
| 329 | + if (enrichPolicyLocks.lockedPolices().contains(testPolicyName) == false) { |
| 330 | + // keep the logs clean |
| 331 | + try { |
| 332 | + // Wait on the timing out request |
| 333 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 334 | + // Wait on the response request |
| 335 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 336 | + } catch (InterruptedException | BrokenBarrierException | TimeoutException e) { |
| 337 | + logger.error("Encountered ignorable barrier wait exception during test cleanup"); |
| 338 | + } |
| 339 | + fail("Enrich policy was not locked after task response when it should have been"); |
| 340 | + } |
| 341 | + |
| 342 | + // Now lets return a timeout response on the getTaskAPI |
| 343 | + try { |
| 344 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 345 | + } catch (BrokenBarrierException e) { |
| 346 | + throw new RuntimeException("Unexpected broken barrier exception", e); |
| 347 | + } |
| 348 | + |
| 349 | + // Wait for the executor to call back to the client with a new get task action |
| 350 | + try { |
| 351 | + // Don't need to clean up any barrier states here because the client was never called again |
| 352 | + assertTrue( |
| 353 | + "Expected task API to be called a second time by the executor after first call timed out", |
| 354 | + secondGetTaskWasCalled.await(3, TimeUnit.SECONDS) |
| 355 | + ); |
| 356 | + } catch (InterruptedException e) { |
| 357 | + // We were interrupted, which means we shouldn't wait on any barriers. |
| 358 | + Assert.fail("Thread interrupted while waiting for background executor to call task API"); |
| 359 | + } |
| 360 | + |
| 361 | + // Ensure that the policy remained locked |
| 362 | + if (enrichPolicyLocks.lockedPolices().contains(testPolicyName) == false) { |
| 363 | + // Another thread is waiting to send a task API response, signal it before failing test to keep the logs clean. |
| 364 | + try { |
| 365 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 366 | + } catch (InterruptedException | BrokenBarrierException | TimeoutException e) { |
| 367 | + logger.error("Encountered ignorable barrier wait exception during test cleanup"); |
| 368 | + } |
| 369 | + fail("Enrich policy was not locked after timeout when it should have been"); |
| 370 | + } |
| 371 | + |
| 372 | + // If the lock has remained, then the client should have resubmitted the task wait operation. Signal a new response that will |
| 373 | + // complete the task wait |
| 374 | + try { |
| 375 | + getTaskActionBlockingBarrier.await(3, TimeUnit.SECONDS); |
| 376 | + } catch (BrokenBarrierException e) { |
| 377 | + throw new RuntimeException("Unexpected broken barrier exception", e); |
| 378 | + } |
| 379 | + |
| 380 | + // At this point the task should complete and unlock the policy correctly |
| 381 | + assertBusy(() -> assertFalse(enrichPolicyLocks.lockedPolices().contains(testPolicyName)), 3, TimeUnit.SECONDS); |
| 382 | + } |
| 383 | + |
186 | 384 | public void testRunPolicyLocallyMissingPolicy() {
|
187 | 385 | EnrichPolicy enrichPolicy = EnrichPolicyTests.randomEnrichPolicy(XContentType.JSON);
|
188 | 386 | ClusterState clusterState = ClusterState.builder(new ClusterName("_name"))
|
|
0 commit comments