Skip to content

Commit e56f3fe

Browse files
authored
Fix duplicate task generation (#5431)
## Motivation The task processor can call `process_actions` multiple times while a previous batch's tasks are still in-flight. When there is a backlog (`schedule.start` far in the past), the deadline timer fires immediately because the next event is also in the past. The `last_requested_callbacks` guard passes, and `next_actions` reads stale on-chain state — `schedule.start` has not advanced yet — so it generates a second batch with overlapping timestamps. When both batches are eventually submitted as blocks, the second batch's stale timestamps no longer match the now-advanced `schedule.start`, causing: ``` contract.rs:115: assert_eq!(timed_data.timestamp, next_timestamp, "unexpected timestamp in posted data") ``` ## Proposal Introduce an `in_flight_apps: BTreeSet<ApplicationId>` guard in the task processor. When a batch of tasks is spawned for an application, the app is marked as in-flight and subsequent `process_actions` calls skip it. A new `TaskMessage` enum replaces the raw `(ApplicationId, TaskOutcome)` channel payload, adding a `BatchComplete` variant that the spawned task sends after all outcomes. On `BatchComplete`, the guard is cleared and `process_actions` is re-triggered for that app so it reads freshly updated on-chain state. ## Test Plan - CI - I had a stuck worker with a big backlog. After rebuilding pm-app with this fix and restarting the worker with the new package version, I see no more panics, and the worker is making progress through the backlog: ![Screenshot 2026-02-12 at 13.44.18.png](https://app.graphite.com/user-attachments/assets/be611bf2-3b7a-4bbd-bda2-45b1e5049108.png)
1 parent daaab9f commit e56f3fe

File tree

1 file changed

+46
-19
lines changed

1 file changed

+46
-19
lines changed

linera-service/src/task_processor.rs

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use linera_base::{
2222
};
2323
use linera_core::{client::ChainClient, node::NotificationStream, worker::Reason};
2424
use serde_json::json;
25-
use tokio::{io::AsyncWriteExt, process::Command, select, sync::mpsc, task::JoinHandle};
25+
use tokio::{io::AsyncWriteExt, process::Command, select, sync::mpsc};
2626
use tokio_util::sync::CancellationToken;
2727
use tracing::{debug, error, info};
2828

@@ -43,6 +43,17 @@ pub fn parse_operator(s: &str) -> Result<(String, PathBuf), String> {
4343

4444
type Deadline = Reverse<(Timestamp, Option<ApplicationId>)>;
4545

46+
/// Messages sent over the outcome channel from spawned background task
/// batches to the main event loop. Replaces the previous raw
/// `(ApplicationId, TaskOutcome)` channel payload so that batch
/// completion can be signaled explicitly.
47+
enum TaskMessage {
48+
    /// A task outcome ready to be submitted on-chain by the main loop.
    Outcome {
49+
        // Application that produced this outcome.
        application_id: ApplicationId,
50+
        // The outcome payload to submit.
        outcome: TaskOutcome,
51+
    },
52+
    /// All tasks in a batch have completed and their outcomes (if any) have been sent.
    /// On receipt, the main loop clears the application's in-flight guard and
    /// re-runs `process_actions` for it so fresh on-chain state is read.
    BatchComplete { application_id: ApplicationId },
54+
}
55+
4657
/// A task processor that watches applications and executes off-chain operators.
4758
pub struct TaskProcessor<Env: linera_core::Environment> {
4859
chain_id: ChainId,
@@ -51,12 +62,12 @@ pub struct TaskProcessor<Env: linera_core::Environment> {
5162
chain_client: ChainClient<Env>,
5263
cancellation_token: CancellationToken,
5364
notifications: NotificationStream,
54-
outcome_sender: mpsc::UnboundedSender<(ApplicationId, TaskOutcome)>,
55-
outcome_receiver: mpsc::UnboundedReceiver<(ApplicationId, TaskOutcome)>,
65+
outcome_sender: mpsc::UnboundedSender<TaskMessage>,
66+
outcome_receiver: mpsc::UnboundedReceiver<TaskMessage>,
5667
update_receiver: mpsc::UnboundedReceiver<Update>,
5768
deadlines: BinaryHeap<Deadline>,
5869
operators: OperatorMap,
59-
last_task_handles: BTreeMap<ApplicationId, JoinHandle<()>>,
70+
in_flight_apps: BTreeSet<ApplicationId>,
6071
}
6172

6273
impl<Env: linera_core::Environment> TaskProcessor<Env> {
@@ -84,7 +95,7 @@ impl<Env: linera_core::Environment> TaskProcessor<Env> {
8495
update_receiver,
8596
deadlines: BinaryHeap::new(),
8697
operators,
87-
last_task_handles: BTreeMap::new(),
98+
in_flight_apps: BTreeSet::new(),
8899
}
89100
}
90101

@@ -105,9 +116,17 @@ impl<Env: linera_core::Environment> TaskProcessor<Env> {
105116
let application_ids = self.process_events();
106117
self.process_actions(application_ids).await;
107118
}
108-
Some((application_id, outcome)) = self.outcome_receiver.recv() => {
109-
if let Err(e) = self.submit_task_outcome(application_id, &outcome).await {
110-
error!("Error while processing task outcome {outcome:?}: {e}");
119+
Some(msg) = self.outcome_receiver.recv() => {
120+
match msg {
121+
TaskMessage::Outcome { application_id, outcome } => {
122+
if let Err(e) = self.submit_task_outcome(application_id, &outcome).await {
123+
error!("Error while processing task outcome {outcome:?}: {e}");
124+
}
125+
}
126+
TaskMessage::BatchComplete { application_id } => {
127+
self.in_flight_apps.remove(&application_id);
128+
self.process_actions(vec![application_id]).await;
129+
}
111130
}
112131
}
113132
Some(update) = self.update_receiver.recv() => {
@@ -138,9 +157,11 @@ impl<Env: linera_core::Environment> TaskProcessor<Env> {
138157
let new_app_set: BTreeSet<_> = update.application_ids.iter().cloned().collect();
139158
let old_app_set: BTreeSet<_> = self.application_ids.iter().cloned().collect();
140159

141-
// Retain only last_requested_callbacks for applications that are still active
160+
// Retain only last_requested_callbacks and in_flight_apps for applications that are still active
142161
self.last_requested_callbacks
143162
.retain(|app_id, _| new_app_set.contains(app_id));
163+
self.in_flight_apps
164+
.retain(|app_id| new_app_set.contains(app_id));
144165

145166
// Update the application_ids
146167
self.application_ids = update.application_ids;
@@ -176,6 +197,10 @@ impl<Env: linera_core::Environment> TaskProcessor<Env> {
176197

177198
async fn process_actions(&mut self, application_ids: Vec<ApplicationId>) {
178199
for application_id in application_ids {
200+
if self.in_flight_apps.contains(&application_id) {
201+
debug!("Skipping {application_id}: tasks already in flight");
202+
continue;
203+
}
179204
debug!("Processing actions for {application_id}");
180205
let now = Timestamp::now();
181206
let last_requested_callback =
@@ -201,10 +226,10 @@ impl<Env: linera_core::Environment> TaskProcessor<Env> {
201226
.push(Reverse((timestamp, Some(application_id))));
202227
}
203228
if !actions.execute_tasks.is_empty() {
229+
self.in_flight_apps.insert(application_id);
204230
let sender = self.outcome_sender.clone();
205231
let operators = self.operators.clone();
206-
let previous = self.last_task_handles.remove(&application_id);
207-
let handle = tokio::spawn(async move {
232+
tokio::spawn(async move {
208233
// Spawn all tasks concurrently and join them.
209234
let handles: Vec<_> = actions
210235
.execute_tasks
@@ -220,17 +245,17 @@ impl<Env: linera_core::Environment> TaskProcessor<Env> {
220245
})
221246
.collect();
222247
let results = future::join_all(handles).await;
223-
// Wait for the previous batch to finish sending outcomes first.
224-
if let Some(previous) = previous {
225-
if let Err(error) = previous.await {
226-
error!(%application_id, %error, "Task panicked");
227-
}
228-
}
229248
// Submit outcomes in the original order.
230249
for result in results {
231250
match result {
232251
Ok(Ok(outcome)) => {
233-
if sender.send((application_id, outcome)).is_err() {
252+
if sender
253+
.send(TaskMessage::Outcome {
254+
application_id,
255+
outcome,
256+
})
257+
.is_err()
258+
{
234259
error!("Outcome receiver dropped for {application_id}");
235260
break;
236261
}
@@ -243,8 +268,10 @@ impl<Env: linera_core::Environment> TaskProcessor<Env> {
243268
}
244269
}
245270
}
271+
// Signal that this batch is done so the main loop can process
272+
// the next batch for this application.
273+
let _ = sender.send(TaskMessage::BatchComplete { application_id });
246274
});
247-
self.last_task_handles.insert(application_id, handle);
248275
}
249276
}
250277
}

0 commit comments

Comments (0)