Merged
11 changes: 9 additions & 2 deletions golem-common/src/model/mod.rs
@@ -697,7 +697,13 @@ pub struct AgentStatusRecord {
/// The number of encountered error entries grouped by their 'retry_from' index, calculated from
/// the last invocation boundary.
pub current_retry_count: HashMap<OplogIndex, u32>,
pub last_snapshot_index: Option<OplogIndex>,
/// Index of the last manual update snapshot. The agent will call load_snapshot
/// on this payload before starting replay.
pub last_manual_update_snapshot_index: Option<OplogIndex>,
/// Index of the last automatic snapshot. Must be >= last_manual_update_snapshot_index.
/// The agent will call load_snapshot on this payload before starting replay. If load_snapshot
/// fails, this index is ignored and a full replay from last_manual_update_snapshot_index is performed.
pub last_automatic_snapshot_index: Option<OplogIndex>,
}
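Read together, the two new fields define a preference order for snapshot-based recovery. A minimal sketch of that ordering, using a plain `u64` stand-in for `OplogIndex` and an illustrative `replay_start_candidates` helper that is not part of the actual executor API:

```rust
// Toy model of the two snapshot fields: try the automatic snapshot first
// (it is always >= the manual one); if loading it fails, fall back to the
// manual update snapshot; with no snapshots at all, do a full replay.
type OplogIndex = u64; // stand-in for golem_common's OplogIndex

struct SnapshotIndices {
    last_manual_update_snapshot_index: Option<OplogIndex>,
    last_automatic_snapshot_index: Option<OplogIndex>,
}

impl SnapshotIndices {
    // Snapshot indices to attempt, in order; an empty list means full replay.
    fn replay_start_candidates(&self) -> Vec<OplogIndex> {
        let mut candidates = Vec::new();
        if let Some(auto) = self.last_automatic_snapshot_index {
            candidates.push(auto);
        }
        if let Some(manual) = self.last_manual_update_snapshot_index {
            if Some(manual) != self.last_automatic_snapshot_index {
                candidates.push(manual);
            }
        }
        candidates
    }
}

fn main() {
    let indices = SnapshotIndices {
        last_manual_update_snapshot_index: Some(10),
        last_automatic_snapshot_index: Some(42),
    };
    assert_eq!(indices.replay_start_candidates(), vec![42, 10]);
    println!("ok");
}
```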

impl Default for AgentStatusRecord {
@@ -721,7 +727,8 @@ impl Default for AgentStatusRecord {
deleted_regions: DeletedRegions::new(),
component_revision_for_replay: ComponentRevision::INITIAL,
current_retry_count: HashMap::new(),
last_snapshot_index: None,
last_manual_update_snapshot_index: None,
last_automatic_snapshot_index: None,
}
}
}
4 changes: 3 additions & 1 deletion golem-common/src/model/worker.rs
@@ -30,7 +30,9 @@ impl UntypedParsedWorkerCreationLocalAgentConfigEntry {

let value_type = component_metadata
.find_agent_type_by_name(agent_type_name)
.ok_or("did not find expected agent type in the metadata")?
.ok_or_else(|| {
format!("did not find expected agent type {agent_type_name} in the metadata")
})?
.config
.into_iter()
.find_map(|c| match c {
38 changes: 27 additions & 11 deletions golem-worker-executor/src/durable_host/mod.rs
@@ -1173,6 +1173,13 @@ impl<Ctx: WorkerCtx + DurableWorkerCtxView<Ctx>> DurableWorkerCtx<Ctx> {
OplogEntry::Snapshot {
data, mime_type, ..
} => (data, mime_type),
OplogEntry::PendingUpdate {
Contributor:
I understand this fixes the main issue, and now that we have the snapshot-based recovery this is probably a nice way to fix it, but I'm very confused that there was no machinery for this earlier.

I think it was supposed to be something like this:

  • once a manual update succeeds, it adds a skipped region from the beginning to the update point so that oplog part is ignored
  • so on next recovery, we reach the OplogEntry::PendingUpdate { SnapshotBased } again, and the recovery code calls load-snapshot so our state is restored
  • the rest of the oplog gets replayed

So I don't fully understand why this did not work, and whether it is a problem or not that now we have two ways to recover from a snapshot-based update. (As I think you did not touch the old one)
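The mechanism described in this comment can be sketched with toy types (not golem_common's real `OplogRegion` / `DeletedRegions`): after a manual update succeeds, the oplog from just after the first entry up to the update point is marked skipped, so replay re-applies only the entries after it.

```rust
// Illustrative sketch of the skipped-region idea: a single inclusive
// region covering everything after the very first oplog entry up to the
// manual update point. Replay consults `contains` to skip those entries.
type OplogIndex = u64;

#[derive(Debug)]
struct SkippedRegion {
    start: OplogIndex, // inclusive
    end: OplogIndex,   // inclusive
}

impl SkippedRegion {
    fn up_to_update_point(update_point: OplogIndex) -> Self {
        // skip everything after the very first entry up to the update point
        SkippedRegion { start: 2, end: update_point }
    }

    fn contains(&self, idx: OplogIndex) -> bool {
        self.start <= idx && idx <= self.end
    }
}

fn main() {
    let region = SkippedRegion::up_to_update_point(40);
    assert!(region.contains(2));
    assert!(region.contains(40));
    assert!(!region.contains(41)); // entries after the update point get replayed
    assert!(!region.contains(1));  // the initial entry is never skipped
    println!("ok");
}
```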

Contributor Author:
I believe this was always broken or at least broken for a very long time.

PendingUpdate is a hint entry and I don't see any logic in replay that would replay load-snapshot. This is surprising to me too, maybe I missed something

Contributor:
My understanding of how it work(ed) on main:

I think the part that's hard to see/understand is the skipped region logic that now 100% lives in the worker status calculation. Previously it was explicitly set in various points of the above update logic, making it easier to follow (but the current way of calculating everything directly from the oplog is definitely the correct way)

Contributor Author:
What you are saying is exactly correct, but it's not doing this part in any following replays, as pending_updates will not contain the update anymore (it now has a successful update entry).

Here we call load-snapshot (the agent sdk is supposed to internally initialize the agent as part of this): https://github.com/golemcloud/golem/blob/main/golem-worker-executor/src/durable_host/mod.rs#L1048-L1055

description:
UpdateDescription::SnapshotBased {
payload, mime_type, ..
},
..
} => (payload, mime_type),
_ => {
warn!(
"Expected Snapshot entry at oplog index {snapshot_index}, found different entry; falling back to full replay"
@@ -2325,11 +2332,25 @@ impl<Ctx: WorkerCtx + DurableWorkerCtxView<Ctx>> ExternalOperations<Ctx> for Dur
UpdateDescription::Automatic {
target_revision, ..
} => {
// snapshot update will be succeeded as part of the replay.
let result = Self::resume_replay(store, instance, false).await;
record_resume_worker(start.elapsed());
let replay_result = async {
if let SnapshotRecoveryResult::Failed =
Self::try_load_snapshot(store, instance).await
Contributor:
Maybe I misunderstand this part, but for automatic updates we should never use snapshots and should always replay from the beginning.

Contributor Author:
This is only using snapshots created by manual updates, not the automatic snapshots (the logic for that lives in the WorkerConfig creation).

I don't think we can, in general, replay the parts of the oplog that were skipped as part of a manual update with a new component version (as manual snapshot updates can freely break oplog backwards compatibility). So I think replaying only from the last successful manual snapshot update is correct here.

{
return Err(WorkerExecutorError::failed_to_resume_worker(
agent_id.clone(),
WorkerExecutorError::runtime("loading snapshot failed"),
));
};
// automatic update will be succeeded as part of the replay.
let result = Self::resume_replay(store, instance, false).await?;

record_resume_worker(start.elapsed());

Ok(result)
}
.await;

match result {
match replay_result {
Err(error) => {
// replay failed. There are two cases here:
// 1. We failed before the update has succeeded. In this case we fail the update and retry the replay.
@@ -2367,18 +2388,13 @@ impl<Ctx: WorkerCtx + DurableWorkerCtxView<Ctx>> ExternalOperations<Ctx> for Dur
_ => Err(error),
}
}
_ => result,
_ => replay_result,
}
}
}
}
None => match Self::try_load_snapshot(store, instance).await {
SnapshotRecoveryResult::Success => {
let result = Self::resume_replay(store, instance, false).await;
record_resume_worker(start.elapsed());
result
}
SnapshotRecoveryResult::NotAttempted => {
SnapshotRecoveryResult::Success | SnapshotRecoveryResult::NotAttempted => {
let result = Self::resume_replay(store, instance, false).await;
record_resume_worker(start.elapsed());
result
19 changes: 5 additions & 14 deletions golem-worker-executor/src/worker/invocation_loop.rs
@@ -327,22 +327,13 @@ impl<Ctx: WorkerCtx> InnerInvocationLoop<'_, Ctx> {
/// first pending_updates, then pending_invocations
async fn drain_pending_from_status(&mut self) -> CommandOutcome {
loop {
let status = self.parent.last_known_status.read().await.clone();
let status = self.parent.get_non_detached_last_known_status().await;

// First, try to process a pending update
if let Some(update) = status.pending_updates.front() {
let target_revision = *update.description.target_revision();
let mut store = self.store.lock().await;
let mut invocation = Invocation {
owned_agent_id: self.owned_agent_id.clone(),
parent: self.parent.clone(),
instance: self.instance,
store: store.deref_mut(),
};
match invocation.manual_update(target_revision).await {
CommandOutcome::Continue => continue,
other => break other,
}
if status.pending_updates.front().is_some() {
Contributor:
Are we sure that there is always an AgentInvocation::ManualUpdate enqueued and processed for saving the snapshot before we reach here? Probably this logic became a bit obscure through all the refactorings.
What if there are multiple manual updates enqueued? When we perform the first, and restart, what enqueues the thing in the command queue?

Contributor Author (@mschuwalow, Mar 10, 2026):
Yes. pending_updates is only based on the pending_update oplog entry, which for manual snapshot updates is only created during the following path:

agent receives update request via grpc -> agent writes a manual update pending_agent_invocation oplog entry -> agent processes the invocation -> agent writes a pending_update oplog entry

For automatic updates the pending_update oplog entry is written immediately

// if the update made it to pending_updates (instead of pending invocations), it is ready
// to be processed on next restart. So just restart here and let the recovery logic take over
break CommandOutcome::BreakInnerLoop(RetryDecision::Immediate);
}

// Then, try to process a pending invocation
37 changes: 28 additions & 9 deletions golem-worker-executor/src/worker/mod.rs
@@ -49,7 +49,7 @@ use golem_common::model::component::ComponentRevision;
use golem_common::model::component::{ComponentFilePath, PluginPriority};
use golem_common::model::invocation_context::InvocationContextStack;
use golem_common::model::oplog::{OplogEntry, OplogIndex, UpdateDescription};
use golem_common::model::regions::OplogRegion;
use golem_common::model::regions::{DeletedRegionsBuilder, OplogRegion};
use golem_common::model::worker::{RevertWorkerTarget, WorkerCreationLocalAgentConfigEntry};
use golem_common::model::AgentStatus;
use golem_common::model::RetryConfig;
@@ -2162,6 +2162,31 @@ impl RunningWorker {
.component_revision_for_replay,
);

let mut skipped_regions = worker_metadata.last_known_status.skipped_regions;
let mut last_snapshot_index = worker_metadata
.last_known_status
.last_manual_update_snapshot_index;

// automatic snapshots are only considered until the first failure.
// additionally, if there are updates, the automatic snapshot is temporarily ignored to catch issues earlier
if let Some(snapshot_idx) = worker_metadata
.last_known_status
.last_automatic_snapshot_index
{
if pending_update.is_none()
&& !parent.snapshot_recovery_disabled.load(Ordering::Acquire)
{
let snapshot_skip =
DeletedRegionsBuilder::from_regions(vec![OplogRegion::from_index_range(
OplogIndex::INITIAL.next()..=snapshot_idx,
)])
.build();
skipped_regions.set_override(snapshot_skip);

last_snapshot_index = Some(snapshot_idx);
}
}

let context = Ctx::create(
worker_metadata.created_by,
OwnedAgentId::new(worker_metadata.environment_id, &worker_metadata.agent_id),
@@ -2184,19 +2209,13 @@
parent.extra_deps(),
parent.config(),
AgentConfig::new(
worker_metadata.last_known_status.skipped_regions,
skipped_regions,
worker_metadata.last_known_status.total_linear_memory_size,
component_version_for_replay,
worker_metadata.created_by,
worker_metadata.config_vars,
worker_metadata.local_agent_config,
if pending_update.is_none()
&& !parent.snapshot_recovery_disabled.load(Ordering::Acquire)
{
worker_metadata.last_known_status.last_snapshot_index
} else {
None
},
last_snapshot_index,
),
parent.execution_status.clone(),
parent.file_loader(),
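The gate moved out of the `AgentConfig` construction above (snapshot index only honoured when there is no pending update and snapshot recovery is not disabled) can be isolated as a small pure function; the name and signature below are illustrative, not the executor's actual API:

```rust
// Hedged sketch of the snapshot-index gate: a recorded snapshot index is
// only passed to the worker's replay configuration when no update is
// pending and snapshot recovery has not been disabled.
type OplogIndex = u64;

fn snapshot_index_for_replay(
    last_snapshot_index: Option<OplogIndex>,
    has_pending_update: bool,
    snapshot_recovery_disabled: bool,
) -> Option<OplogIndex> {
    if !has_pending_update && !snapshot_recovery_disabled {
        last_snapshot_index
    } else {
        None // fall back to full replay
    }
}

fn main() {
    assert_eq!(snapshot_index_for_replay(Some(7), false, false), Some(7));
    assert_eq!(snapshot_index_for_replay(Some(7), true, false), None);
    assert_eq!(snapshot_index_for_replay(Some(7), false, true), None);
    println!("ok");
}
```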
79 changes: 35 additions & 44 deletions golem-worker-executor/src/worker/status.rs
@@ -153,13 +153,17 @@ pub async fn update_status_with_new_entries<T: HasOplogService + Sync>(
component_revision,
component_size,
component_revision_for_replay,
last_manual_update_snapshot_index,
last_automatic_snapshot_index,
) = calculate_update_fields(
last_known.pending_updates,
last_known.failed_updates,
last_known.successful_updates,
last_known.component_revision,
last_known.component_size,
last_known.component_revision_for_replay,
last_known.last_manual_update_snapshot_index,
last_known.last_automatic_snapshot_index,
&deleted_regions,
&new_entries,
);
@@ -182,12 +186,6 @@

let active_plugins = calculate_active_plugins(active_plugins, &deleted_regions, &new_entries);

let last_snapshot_index = calculate_last_snapshot_index(
last_known.last_snapshot_index,
&deleted_regions,
&new_entries,
);

let result = AgentStatusRecord {
oplog_idx: new_entries
.keys()
@@ -211,7 +209,8 @@
deleted_regions,
component_revision_for_replay,
current_retry_count,
last_snapshot_index,
last_manual_update_snapshot_index,
last_automatic_snapshot_index,
};

Some(result)
@@ -587,13 +586,16 @@ async fn calculate_pending_invocations<T: HasOplogService + Sync>(
result
}

#[allow(clippy::type_complexity)]
fn calculate_update_fields(
initial_pending_updates: VecDeque<TimestampedUpdateDescription>,
initial_failed_updates: Vec<FailedUpdateRecord>,
initial_successful_updates: Vec<SuccessfulUpdateRecord>,
initial_revision: ComponentRevision,
initial_component_size: u64,
initial_component_revision_for_replay: ComponentRevision,
initial_last_manual_update_snapshot_index: Option<OplogIndex>,
initial_last_automatic_snapshot_index: Option<OplogIndex>,
deleted_regions: &DeletedRegions,
entries: &BTreeMap<OplogIndex, OplogEntry>,
) -> (
@@ -603,13 +605,17 @@
ComponentRevision,
u64,
ComponentRevision,
Option<OplogIndex>,
Option<OplogIndex>,
) {
let mut pending_updates = initial_pending_updates;
let mut failed_updates = initial_failed_updates;
let mut successful_updates = initial_successful_updates;
let mut revision = initial_revision;
let mut size = initial_component_size;
let mut component_revision_for_replay = initial_component_revision_for_replay;
let mut last_manual_update_snapshot_index = initial_last_manual_update_snapshot_index;
let mut last_automatic_snapshot_index = initial_last_automatic_snapshot_index;

for (oplog_idx, entry) in entries {
// Skipping entries in deleted regions (by revert)
@@ -663,17 +669,20 @@
revision = *target_revision;
size = *new_component_size;

let applied_update = pending_updates.pop_front();
if matches!(
applied_update,
Some(TimestampedUpdateDescription {
description: UpdateDescription::SnapshotBased { .. },
..
})
) {
component_revision_for_replay = *target_revision
if let Some(TimestampedUpdateDescription {
description: UpdateDescription::SnapshotBased { .. },
oplog_index: applied_update_oplog_index,
..
}) = pending_updates.pop_front()
{
component_revision_for_replay = *target_revision;
last_manual_update_snapshot_index = Some(applied_update_oplog_index);
last_automatic_snapshot_index = None;
}
}
OplogEntry::Snapshot { .. } => {
last_automatic_snapshot_index = Some(*oplog_idx);
}
_ => {}
}
}
@@ -684,6 +693,8 @@
revision,
size,
component_revision_for_replay,
last_manual_update_snapshot_index,
last_automatic_snapshot_index,
)
}

Expand Down Expand Up @@ -830,31 +841,6 @@ fn calculate_active_plugins(
result
}

fn calculate_last_snapshot_index(
initial: Option<OplogIndex>,
deleted_regions: &DeletedRegions,
entries: &BTreeMap<OplogIndex, OplogEntry>,
) -> Option<OplogIndex> {
let mut result = initial;

if let Some(idx) = result {
if deleted_regions.is_in_deleted_region(idx) {
result = None;
}
}

for (idx, entry) in entries {
if deleted_regions.is_in_deleted_region(*idx) {
continue;
}

if matches!(entry, OplogEntry::Snapshot { .. }) {
result = Some(*idx);
}
}
result
}

fn is_worker_error_retriable(
retry_config: &RetryConfig,
error: &AgentError,
@@ -1627,7 +1613,7 @@ mod test {
mime_type: "application/octet-stream".to_string(),
},
move |mut status| {
status.last_snapshot_index = Some(oplog_idx);
status.last_automatic_snapshot_index = Some(oplog_idx);
status
},
)
@@ -1681,7 +1667,9 @@
status.failed_updates = old_status.failed_updates;
status.invocation_results = old_status.invocation_results;
status.component_revision_for_replay = old_status.component_revision_for_replay;
status.last_snapshot_index = old_status.last_snapshot_index;
status.last_manual_update_snapshot_index =
old_status.last_manual_update_snapshot_index;
status.last_automatic_snapshot_index = old_status.last_automatic_snapshot_index;

status
})
@@ -1765,7 +1753,7 @@
)
.rounded();
self.add(entry.clone(), move |mut status| {
let _ = status.pending_updates.pop_front();
let applied_update = status.pending_updates.pop_front();
status.successful_updates.push(SuccessfulUpdateRecord {
timestamp: entry.timestamp(),
target_revision: *update_description.target_revision(),
@@ -1785,6 +1773,9 @@
} = update_description
{
status.component_revision_for_replay = target_revision;
status.last_manual_update_snapshot_index =
applied_update.map(|au| au.oplog_index);
status.last_automatic_snapshot_index = None;
};

status