Skip to content

Commit b207660

Browse files
authored
Evict workflows when they complete (#847)
1 parent bde0afb commit b207660

File tree

23 files changed

+248
-282
lines changed

23 files changed

+248
-282
lines changed

core/src/core_tests/local_activities.rs

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -573,9 +573,6 @@ async fn la_resolve_during_legacy_query_does_not_combine(#[case] impossible_quer
573573
// then next task is incremental w/ legacy query (for impossible query case)
574574
t.add_full_wf_task();
575575

576-
let barr = Arc::new(Barrier::new(2));
577-
let barr_c = barr.clone();
578-
579576
let tasks = [
580577
hist_to_poll_resp(&t, wfid.to_owned(), ResponseType::ToTaskNum(1)),
581578
{
@@ -592,17 +589,7 @@ async fn la_resolve_during_legacy_query_does_not_combine(#[case] impossible_quer
592589
pr
593590
},
594591
{
595-
let mut pr = hist_to_poll_resp(
596-
&t,
597-
wfid.to_owned(),
598-
ResponseType::UntilResolved(
599-
async move {
600-
barr_c.wait().await;
601-
}
602-
.boxed(),
603-
2,
604-
),
605-
);
592+
let mut pr = hist_to_poll_resp(&t, wfid.to_owned(), ResponseType::ToTaskNum(2));
606593
// Strip beginning of history so the only events are WFT sched/started; we need to look
607594
// like we hit the cache
608595
{
@@ -629,6 +616,7 @@ async fn la_resolve_during_legacy_query_does_not_combine(#[case] impossible_quer
629616
}
630617
let mut mock = single_hist_mock_sg(wfid, t, tasks, mock, true);
631618
mock.worker_cfg(|wc| wc.max_cached_workflows = 1);
619+
let taskmap = mock.outstanding_task_map.clone().unwrap();
632620
let core = mock_worker(mock);
633621

634622
let wf_fut = async {
@@ -653,6 +641,9 @@ async fn la_resolve_during_legacy_query_does_not_combine(#[case] impossible_quer
653641
variant: Some(workflow_activation_job::Variant::FireTimer(_)),
654642
},]
655643
);
644+
// Release the run here so the weird-looking query arrives while we're still busy with
645+
// other work; otherwise we would see the workflow complete and choose to evict it.
646+
taskmap.release_run(&task.run_id);
656647
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
657648
task.run_id,
658649
schedule_local_activity_cmd(
@@ -691,7 +682,6 @@ async fn la_resolve_during_legacy_query_does_not_combine(#[case] impossible_quer
691682
))
692683
.await
693684
.unwrap();
694-
barr.wait().await;
695685

696686
if impossible_query_in_task {
697687
// finish last query
@@ -873,7 +863,7 @@ async fn start_to_close_timeout_allows_retries(#[values(true, false)] la_complet
873863
1,
874864
"1",
875865
None,
876-
Some(Failure::application_failure("la failed".to_string(), false)),
866+
Some(Failure::timeout(TimeoutType::StartToClose)),
877867
|_| {},
878868
);
879869
}

core/src/core_tests/queries.rs

Lines changed: 68 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -83,15 +83,7 @@ async fn legacy_query(#[case] include_history: bool) {
8383
.unwrap();
8484
};
8585
let clear_eviction = || async {
86-
let t = worker.poll_workflow_activation().await.unwrap();
87-
assert_matches!(
88-
t.jobs[0].variant,
89-
Some(workflow_activation_job::Variant::RemoveFromCache(_))
90-
);
91-
worker
92-
.complete_workflow_activation(WorkflowActivationCompletion::empty(t.run_id))
93-
.await
94-
.unwrap();
86+
worker.handle_eviction().await;
9587
};
9688

9789
first_wft().await;
@@ -324,15 +316,7 @@ async fn query_failure_because_nondeterminism(#[values(true, false)] legacy: boo
324316
core.complete_workflow_activation(WorkflowActivationCompletion::empty(task.run_id))
325317
.await
326318
.unwrap();
327-
let task = core.poll_workflow_activation().await.unwrap();
328-
assert_matches!(
329-
task.jobs[0].variant,
330-
Some(workflow_activation_job::Variant::RemoveFromCache(_))
331-
);
332-
core.complete_workflow_activation(WorkflowActivationCompletion::empty(task.run_id))
333-
.await
334-
.unwrap();
335-
319+
core.handle_eviction().await;
336320
core.shutdown().await;
337321
}
338322

@@ -372,23 +356,31 @@ async fn legacy_query_after_complete(#[values(false, true)] full_history: bool)
372356
mock.worker_cfg(|wc| wc.max_cached_workflows = 10);
373357
let core = mock_worker(mock);
374358

375-
let task = core.poll_workflow_activation().await.unwrap();
376-
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
377-
task.run_id,
378-
start_timer_cmd(1, Duration::from_secs(1)),
379-
))
380-
.await
381-
.unwrap();
382-
let task = core.poll_workflow_activation().await.unwrap();
383-
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmds(
384-
task.run_id,
385-
vec![CompleteWorkflowExecution { result: None }.into()],
386-
))
387-
.await
388-
.unwrap();
359+
let activations = || async {
360+
let task = core.poll_workflow_activation().await.unwrap();
361+
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
362+
task.run_id,
363+
start_timer_cmd(1, Duration::from_secs(1)),
364+
))
365+
.await
366+
.unwrap();
367+
let task = core.poll_workflow_activation().await.unwrap();
368+
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmds(
369+
task.run_id,
370+
vec![CompleteWorkflowExecution { result: None }.into()],
371+
))
372+
.await
373+
.unwrap();
374+
};
375+
activations().await;
376+
377+
if !full_history {
378+
core.handle_eviction().await;
379+
activations().await;
380+
}
389381

390382
// We should get queries two times
391-
for _ in 1..=2 {
383+
for i in 1..=2 {
392384
let task = core.poll_workflow_activation().await.unwrap();
393385
let query = assert_matches!(
394386
task.jobs.as_slice(),
@@ -402,6 +394,10 @@ async fn legacy_query_after_complete(#[values(false, true)] full_history: bool)
402394
))
403395
.await
404396
.unwrap();
397+
if i == 1 {
398+
core.handle_eviction().await;
399+
activations().await;
400+
}
405401
}
406402

407403
core.shutdown().await;
@@ -770,8 +766,6 @@ async fn legacy_query_combined_with_timer_fire_repro() {
770766
},
771767
{
772768
let mut pr = hist_to_poll_resp(&t, wfid.to_owned(), ResponseType::ToTaskNum(2));
773-
// Strip history, we need to look like we hit the cache for a legacy query
774-
pr.history = Some(History { events: vec![] });
775769
pr.query = Some(WorkflowQuery {
776770
query_type: "query-type".to_string(),
777771
query_args: Some(b"hi".into()),
@@ -788,41 +782,44 @@ async fn legacy_query_combined_with_timer_fire_repro() {
788782
mock.worker_cfg(|wc| wc.max_cached_workflows = 1);
789783
let core = mock_worker(mock);
790784

791-
let task = core.poll_workflow_activation().await.unwrap();
792-
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmds(
793-
task.run_id,
794-
vec![
795-
schedule_activity_cmd(
796-
1,
797-
"whatever",
798-
"1",
799-
ActivityCancellationType::TryCancel,
800-
Duration::from_secs(60),
801-
Duration::from_secs(60),
802-
),
803-
start_timer_cmd(1, Duration::from_secs(1)),
804-
],
805-
))
806-
.await
807-
.unwrap();
785+
let activations = || async {
786+
let task = core.poll_workflow_activation().await.unwrap();
787+
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmds(
788+
task.run_id,
789+
vec![
790+
schedule_activity_cmd(
791+
1,
792+
"whatever",
793+
"1",
794+
ActivityCancellationType::TryCancel,
795+
Duration::from_secs(60),
796+
Duration::from_secs(60),
797+
),
798+
start_timer_cmd(1, Duration::from_secs(1)),
799+
],
800+
))
801+
.await
802+
.unwrap();
808803

809-
let task = core.poll_workflow_activation().await.unwrap();
810-
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
811-
task.run_id,
812-
RequestCancelActivity { seq: 1 }.into(),
813-
))
814-
.await
815-
.unwrap();
804+
let task = core.poll_workflow_activation().await.unwrap();
805+
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
806+
task.run_id,
807+
RequestCancelActivity { seq: 1 }.into(),
808+
))
809+
.await
810+
.unwrap();
816811

817-
// First should get the activity resolve
818-
let task = core.poll_workflow_activation().await.unwrap();
819-
assert_matches!(
820-
task.jobs.as_slice(),
821-
[WorkflowActivationJob {
822-
variant: Some(workflow_activation_job::Variant::ResolveActivity(_)),
823-
}]
824-
);
825-
core.complete_execution(&task.run_id).await;
812+
// First should get the activity resolve
813+
let task = core.poll_workflow_activation().await.unwrap();
814+
assert_matches!(
815+
task.jobs.as_slice(),
816+
[WorkflowActivationJob {
817+
variant: Some(workflow_activation_job::Variant::ResolveActivity(_)),
818+
}]
819+
);
820+
core.complete_execution(&task.run_id).await;
821+
};
822+
activations().await;
826823

827824
// Then the queries
828825
let task = core.poll_workflow_activation().await.unwrap();
@@ -840,6 +837,9 @@ async fn legacy_query_combined_with_timer_fire_repro() {
840837
.await
841838
.unwrap();
842839

840+
core.handle_eviction().await;
841+
activations().await;
842+
843843
let task = core.poll_workflow_activation().await.unwrap();
844844
assert_matches!(
845845
task.jobs.as_slice(),

core/src/core_tests/workers.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use temporal_sdk_core_protos::{
2626
PollWorkflowTaskQueueResponse, RespondWorkflowTaskCompletedResponse, ShutdownWorkerResponse,
2727
},
2828
};
29-
use temporal_sdk_core_test_utils::start_timer_cmd;
29+
use temporal_sdk_core_test_utils::{start_timer_cmd, WorkerTestHelpers};
3030
use tokio::sync::{watch, Barrier};
3131

3232
#[tokio::test]
@@ -260,11 +260,7 @@ async fn worker_does_not_panic_on_retry_exhaustion_of_nonfatal_net_err() {
260260
.await
261261
.unwrap();
262262
// We should see an eviction
263-
let res = core.poll_workflow_activation().await.unwrap();
264-
assert_matches!(
265-
res.jobs[0].variant,
266-
Some(workflow_activation_job::Variant::RemoveFromCache(_))
267-
);
263+
core.handle_eviction().await;
268264
}
269265

270266
#[rstest::rstest]

core/src/core_tests/workflow_tasks.rs

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,17 +1330,7 @@ async fn fail_wft_then_recover() {
13301330
.await
13311331
.unwrap();
13321332
// We must handle an eviction now
1333-
let evict_act = core.poll_workflow_activation().await.unwrap();
1334-
assert_eq!(evict_act.run_id, act.run_id);
1335-
assert_matches!(
1336-
evict_act.jobs.as_slice(),
1337-
[WorkflowActivationJob {
1338-
variant: Some(workflow_activation_job::Variant::RemoveFromCache(_)),
1339-
}]
1340-
);
1341-
core.complete_workflow_activation(WorkflowActivationCompletion::empty(evict_act.run_id))
1342-
.await
1343-
.unwrap();
1333+
core.handle_eviction().await;
13441334

13451335
// Workflow starting over, this time issue the right command
13461336
let act = core.poll_workflow_activation().await.unwrap();
@@ -1531,6 +1521,7 @@ async fn failing_wft_doesnt_eat_permit_forever() {
15311521
// row because we purposefully time out rather than spamming.
15321522
for _ in 1..=2 {
15331523
let activation = worker.poll_workflow_activation().await.unwrap();
1524+
run_id.clone_from(&activation.run_id);
15341525
// Issue a nonsense completion that will trigger a WFT failure
15351526
worker
15361527
.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
@@ -1539,18 +1530,7 @@ async fn failing_wft_doesnt_eat_permit_forever() {
15391530
))
15401531
.await
15411532
.unwrap();
1542-
let activation = worker.poll_workflow_activation().await.unwrap();
1543-
assert_matches!(
1544-
activation.jobs.as_slice(),
1545-
[WorkflowActivationJob {
1546-
variant: Some(workflow_activation_job::Variant::RemoveFromCache(_)),
1547-
},]
1548-
);
1549-
run_id.clone_from(&activation.run_id);
1550-
worker
1551-
.complete_workflow_activation(WorkflowActivationCompletion::empty(activation.run_id))
1552-
.await
1553-
.unwrap();
1533+
worker.handle_eviction().await;
15541534
}
15551535
assert_eq!(worker.outstanding_workflow_tasks().await, 0);
15561536
// We should be "out of work" because the mock service thinks we didn't complete the last task,
@@ -2601,6 +2581,7 @@ async fn _do_post_terminal_commands_test(
26012581

26022582
let act = core.poll_workflow_activation().await.unwrap();
26032583

2584+
core.initiate_shutdown();
26042585
core.complete_workflow_activation(WorkflowActivationCompletion::from_cmds(
26052586
act.run_id,
26062587
commands_sent_by_lang,

core/src/telemetry/metrics.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ struct Instruments {
5555
sticky_cache_hit: Arc<dyn Counter>,
5656
sticky_cache_miss: Arc<dyn Counter>,
5757
sticky_cache_size: Arc<dyn Gauge>,
58-
sticky_cache_evictions: Arc<dyn Counter>,
58+
sticky_cache_forced_evictions: Arc<dyn Counter>,
5959
}
6060

6161
impl MetricsContext {
@@ -263,8 +263,10 @@ impl MetricsContext {
263263
}
264264

265265
/// Count a workflow being forcibly evicted from the cache
266-
pub(crate) fn cache_eviction(&self) {
267-
self.instruments.sticky_cache_evictions.add(1, &self.kvs);
266+
pub(crate) fn forced_cache_eviction(&self) {
267+
self.instruments
268+
.sticky_cache_forced_evictions
269+
.add(1, &self.kvs);
268270
}
269271
}
270272

@@ -423,7 +425,7 @@ impl Instruments {
423425
description: "Current number of cached workflows".into(),
424426
unit: "".into(),
425427
}),
426-
sticky_cache_evictions: meter.counter(MetricParameters {
428+
sticky_cache_forced_evictions: meter.counter(MetricParameters {
427429
name: "sticky_cache_total_forced_eviction".into(),
428430
description: "Count of evictions of cached workflows".into(),
429431
unit: "".into(),
@@ -867,7 +869,7 @@ mod tests {
867869
true,
868870
);
869871
let mc = MetricsContext::top_level("foo".to_string(), "q".to_string(), &telem_instance);
870-
mc.cache_eviction();
872+
mc.forced_cache_eviction();
871873
let events = call_buffer.retrieve();
872874
let a1 = assert_matches!(
873875
&events[0],

core/src/test_help/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ pub(crate) fn build_mock_pollers(mut cfg: MockPollCfg) -> MocksHolder {
656656
tokio::select! {
657657
_ = outstanding_wakeup.notified() => {}
658658
_ = tokio::time::sleep(Duration::from_secs(60)) => {}
659-
};
659+
}
660660
}
661661
}
662662
});

core/src/worker/mod.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,12 @@ impl Worker {
508508
if let Some(name) = self.workflows.get_sticky_queue_name() {
509509
// This is a best effort call and we can still shutdown the worker if it fails
510510
match self.client.shutdown_worker(name).await {
511-
Err(err) if err.code() != tonic::Code::Unavailable => {
511+
Err(err)
512+
if !matches!(
513+
err.code(),
514+
tonic::Code::Unimplemented | tonic::Code::Unavailable
515+
) =>
516+
{
512517
warn!("Failed to shutdown sticky queue {:?}", err);
513518
}
514519
_ => {}

0 commit comments

Comments
 (0)