Skip to content

Commit ede09af

Browse files
committed
controller: Skip FPM truncation if TS is not enabled
Fast partition movement (FPM) works by setting the learner's raft start offset to a safe prefix truncation of the local log, based on initial retention configs and some calculations to determine an offset below which all data are recoverable from cloud storage. Raft data above this offset are transferred directly, whereas data below this offset (assumed to be recoverable from cloud) are not.

Prior to this change, this computation could produce an unsafe initial start offset for the learner if archival_metadata_stm::max_removable_local_log_offset exceeds the last uploaded offset, which can occur if archival is disabled. Note that this difference between max_removable and last uploaded is by design, since automatic truncation is usually predicated on cleanup policy, and archival being switched off should not block cleanup in the common case. FPM is a special case in the sense that the "truncation" is not for cleanup as such, but rather an omission of backed-up data from an expensive bulk transfer. Which is to say, it can and does occur even if the source topic is compact-only.

The net result of this is that archival_metadata_stm reports that it is safe to remove up to offset::max and, as a result, FPM truncates the raft transfer up to whatever removable offset is reported by the other STMs. Any data produced between the last cloud storage upload and this offset are lost.

This commit introduces checks to:
- prevent FPM from adjusting the learner start offset above the start of the local log for any partition with tiered storage disabled or with uploads paused
- prevent FPM from operating on internal topics
- ensure that FPM won't truncate past the safe offset in general, as reported by the archival metadata STM. This is mostly a safeguard, as in most cases max_removable_local_log_offset should return the correct thing.
Signed-off-by: Oren Leiman <oren.leiman@redpanda.com>

(cherry picked from commit 3dc8d34)

Conflicts:
- max_collectible_offset was renamed to max_removable_local_log_offset
- local naming in controller_backend
- config::cloud_storage_enable_segment_uploads not yet implemented
- turn off upload pause ducktape test
1 parent de76e18 commit ede09af

File tree

2 files changed

+249
-16
lines changed

2 files changed

+249
-16
lines changed

src/v/cluster/controller_backend.cc

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -585,12 +585,32 @@ controller_backend::calculate_learner_initial_offset(
585585
* Initial learner start offset only makes sense for partitions with cloud
586586
* storage data
587587
*/
588+
if (auto tp_cfg = p->get_topic_config();
589+
tp_cfg.has_value() && tp_cfg->get().is_internal()) {
590+
vlog(clusterlog.trace, "{} is part of an internal topic", p->ntp());
591+
return std::nullopt;
592+
}
593+
588594
if (!p->cloud_data_available()) {
589595
vlog(clusterlog.trace, "no cloud data available for: {}", p->ntp());
590596
return std::nullopt;
591597
}
592598

599+
if (p->get_cloud_storage_mode() != cluster::cloud_storage_mode::full) {
600+
vlog(
601+
clusterlog.trace,
602+
"cloud storage not fully enabled for: {}",
603+
p->ntp());
604+
return std::nullopt;
605+
}
606+
607+
if (p->archival_meta_stm() == nullptr) {
608+
vlog(clusterlog.trace, "no archival_meta_stm for {}", p->ntp());
609+
return std::nullopt;
610+
}
611+
593612
auto log = p->log();
613+
594614
/**
595615
* Calculate retention targets based on cluster and topic configuration
596616
*/
@@ -670,20 +690,39 @@ controller_backend::calculate_learner_initial_offset(
670690
return std::nullopt;
671691
}
672692

673-
auto const cloud_storage_safe_offset
693+
auto cloud_storage_safe_offset
674694
= p->archival_meta_stm()->max_collectible_offset();
695+
auto archival_safe_removable
696+
= p->archival_meta_stm()->cloud_recoverable_offset();
697+
675698
/**
676699
* Last offset uploaded to the cloud is target learner retention upper
677700
* bound. We can not start retention recover from the point which is not yet
678701
* uploaded to Cloud Storage.
702+
*
703+
* In general cloud_storage_safe_offset should not exceed
704+
* last_uploaded, but can if, for example, archival is disabled or paused.
679705
*/
706+
707+
if (cloud_storage_safe_offset > archival_safe_removable) {
708+
vlog(
709+
clusterlog.info,
710+
"[{}] cloud_storage_safe_offset {} exceeds last uploaded to "
711+
"cloud {}, clamping to {}",
712+
p->ntp(),
713+
cloud_storage_safe_offset,
714+
archival_safe_removable,
715+
archival_safe_removable);
716+
cloud_storage_safe_offset = archival_safe_removable;
717+
}
718+
680719
vlog(
681720
clusterlog.info,
682721
"[{}] calculated retention offset: {}, last uploaded to cloud: {}, "
683722
"manifest clean offset: {}, max_collectible_offset: {}",
684723
p->ntp(),
685724
*retention_offset,
686-
p->archival_meta_stm()->manifest().get_last_offset(),
725+
archival_safe_removable,
687726
p->archival_meta_stm()->get_last_clean_at(),
688727
cloud_storage_safe_offset);
689728

tests/rptest/tests/node_pool_migration_test.py

Lines changed: 208 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from concurrent.futures import ThreadPoolExecutor
1111
import random
1212
import re
13+
import json
1314

1415
import requests
1516
from rptest.clients.kafka_cat import KafkaCat
@@ -23,9 +24,13 @@
2324
from rptest.clients.types import TopicSpec
2425
from rptest.services.admin import Admin
2526
from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, SISettings
27+
from rptest.util import expect_exception
2628
from rptest.utils.mode_checks import cleanup_on_early_exit
2729
from rptest.utils.node_operations import NodeDecommissionWaiter
30+
from rptest.utils.mode_checks import skip_debug_mode
31+
from rptest.tests.redpanda_test import RedpandaTest
2832
from enum import Enum
33+
import ducktape.errors
2934

3035
TS_LOG_ALLOW_LIST = [
3136
re.compile(
@@ -45,21 +50,10 @@ def has_tiered_storage(self):
4550
return self.value == self.TIRED_STORAGE or self.value == self.FAST_MOVES
4651

4752

48-
class NodePoolMigrationTest(PreallocNodesTest):
49-
"""
50-
Basic nodes decommissioning test.
51-
"""
52-
def __init__(self, test_context):
53+
class NodePoolMigrationTestBase(PreallocNodesTest):
54+
def __init__(self, *args, **kwargs):
5355
self._topic = None
54-
55-
super(NodePoolMigrationTest, self).__init__(
56-
test_context=test_context,
57-
num_brokers=10,
58-
node_prealloc_count=1,
59-
si_settings=SISettings(test_context,
60-
cloud_storage_enable_remote_read=True,
61-
cloud_storage_enable_remote_write=True,
62-
fast_uploads=True))
56+
super(NodePoolMigrationTestBase, self).__init__(*args, **kwargs)
6357

6458
def setup(self):
6559
# defer starting redpanda to test body
@@ -269,6 +263,27 @@ def _replicas_per_node(self):
269263

270264
return node_replicas
271265

266+
267+
class NodePoolMigrationTest(NodePoolMigrationTestBase):
268+
"""
269+
Basic nodes decommissioning test.
270+
"""
271+
def __init__(self, test_context):
272+
self._topic = None
273+
274+
super(NodePoolMigrationTest, self).__init__(
275+
test_context=test_context,
276+
num_brokers=10,
277+
node_prealloc_count=1,
278+
si_settings=SISettings(test_context,
279+
cloud_storage_enable_remote_read=True,
280+
cloud_storage_enable_remote_write=True,
281+
fast_uploads=True))
282+
283+
def setup(self):
284+
# defer starting redpanda to test body
285+
pass
286+
272287
@cluster(num_nodes=11,
273288
log_allow_list=RESTART_LOG_ALLOW_LIST + TS_LOG_ALLOW_LIST)
274289
@matrix(balancing_mode=["off", 'node_add'],
@@ -399,3 +414,182 @@ def _quiescent_state():
399414
self.redpanda.stop_node(n)
400415

401416
self.verify()
417+
418+
419+
class DisableTestMode(str, Enum):
420+
DISABLE = "disable tiered storage"
421+
PAUSE = "pause uploads"
422+
423+
def do_disable(self, test: RedpandaTest, topic_name: str):
424+
if self.value == self.DISABLE:
425+
test.client().alter_topic_config(topic_name,
426+
'redpanda.remote.read', 'false')
427+
test.client().alter_topic_config(topic_name,
428+
'redpanda.remote.write', 'false')
429+
elif self.value == self.PAUSE:
430+
test.client().alter_topic_config(topic_name,
431+
'redpanda.remote.allowgaps',
432+
'true')
433+
test.redpanda.set_cluster_config(
434+
{"cloud_storage_enable_segment_uploads": False})
435+
436+
437+
class DisableTieredStorageTest(NodePoolMigrationTestBase):
438+
def __init__(self, test_context):
439+
self._topic = None
440+
441+
super(DisableTieredStorageTest, self).__init__(
442+
test_context=test_context,
443+
num_brokers=3,
444+
node_prealloc_count=1,
445+
si_settings=SISettings(test_context,
446+
cloud_storage_enable_remote_read=True,
447+
cloud_storage_enable_remote_write=True,
448+
fast_uploads=True))
449+
450+
def setup(self):
451+
# defer starting redpanda to test body
452+
pass
453+
454+
@cluster(num_nodes=4,
455+
log_allow_list=RESTART_LOG_ALLOW_LIST + TS_LOG_ALLOW_LIST)
456+
@matrix(disable_mode=[
457+
DisableTestMode.DISABLE,
458+
# Removed in backport, not yet implemented
459+
# DisableTestMode.PAUSE,
460+
])
461+
def test_disable_tiered_storage(self, disable_mode: DisableTestMode):
462+
'''
463+
This test performs the following actions:
464+
- Create a tiered storage topic
465+
- Produce some data and wait for cloud storage upload
466+
- Disable tiered storage on the topic
467+
- Produce some more data (note no additional upload)
468+
- Decommission leader to force leadership transfer
469+
- Check that start offset and high watermark on the new leader reflect
470+
the full content of the original leader's raft log prior to decom.
471+
'''
472+
473+
self.redpanda.start()
474+
cfg = {"partition_autobalancing_mode": 'node_add'}
475+
cfg["cloud_storage_enable_remote_write"] = True
476+
cfg["cloud_storage_enable_remote_read"] = True
477+
# we want data to be actually deleted
478+
cfg["retention_local_strict"] = True
479+
480+
# we need to configure a small amount of initial local retention,
481+
# otherwise we get the hwm, batch boundary adjustment fails, and we
482+
# fall back to setting the learner to start at offset 0
483+
self.redpanda.set_cluster_config({
484+
"initial_retention_local_target_bytes_default":
485+
self.segment_size * 2
486+
})
487+
488+
self.admin.patch_cluster_config(upsert=cfg)
489+
490+
spec = TopicSpec(
491+
name=f"migration-test",
492+
partition_count=1,
493+
replication_factor=1,
494+
cleanup_policy='compact',
495+
segment_bytes=self.segment_size,
496+
)
497+
self.client().create_topic(spec)
498+
self._topic = spec.name
499+
rpk = RpkTool(self.redpanda)
500+
501+
def describe_topic():
502+
info = None
503+
while info is None:
504+
for i in rpk.describe_topic(spec.name):
505+
info = i
506+
self.logger.debug(f"{info}")
507+
return info
508+
509+
self.start_producer()
510+
self.producer.wait(timeout_sec=60)
511+
512+
info = describe_topic()
513+
514+
initial_start_offset = info.start_offset
515+
initial_hwm = info.high_watermark
516+
517+
def pm_last_offset():
518+
v = self.admin.get_partition_manifest(spec.name, 0)['last_offset']
519+
return v
520+
521+
self.logger.debug("Wait until most of the topic is uploaded")
522+
523+
wait_until(lambda: pm_last_offset() >= initial_hwm,
524+
timeout_sec=30,
525+
backoff_sec=2,
526+
err_msg="Partition never uploaded")
527+
528+
self.logger.debug(
529+
f"Now {disable_mode} and produce some more to put HWM well above the last uploaded offset"
530+
)
531+
disable_mode.do_disable(self, spec.name)
532+
533+
last_uploaded = pm_last_offset()
534+
535+
self.start_producer()
536+
self.producer.wait(timeout_sec=60)
537+
538+
info = describe_topic()
539+
second_start_offset = info.start_offset
540+
second_hwm = info.high_watermark
541+
542+
assert pm_last_offset() == last_uploaded, \
543+
f"Unexpectedly uploaded more data {pm_last_offset()} > {last_uploaded}"
544+
545+
self.logger.debug(
546+
"Decommission the partition's leader and wait for leadership transfer"
547+
)
548+
549+
leader_id = self.admin.get_partition_leader(namespace='kafka',
550+
topic=spec.name,
551+
partition=0)
552+
553+
self._decommission(leader_id, decommissioned_ids=[leader_id])
554+
555+
def new_leader_id():
556+
partition_info = self.admin.get_partitions(topic=spec.name,
557+
partition=0,
558+
namespace='kafka',
559+
node=None)
560+
self.logger.debug(f"{partition_info=}")
561+
new_id = self.admin.get_partition_leader(namespace='kafka',
562+
topic=spec.name,
563+
partition=0)
564+
self.logger.debug(f"{new_id=}")
565+
return new_id
566+
567+
wait_until(lambda: new_leader_id() not in [leader_id, -1],
568+
timeout_sec=60,
569+
backoff_sec=2,
570+
err_msg="Partition didn't move")
571+
572+
if disable_mode == DisableTestMode.DISABLE:
573+
self.logger.debug(
574+
"With tiered storage disabled, we should skip FPM truncation and transfer the whole log via raft"
575+
)
576+
elif disable_mode == DisableTestMode.PAUSE:
577+
self.logger.debug(
578+
"With uploads paused, FPM should truncate only up to the last uploaded offset to avoid introducing a gap in the log"
579+
)
580+
581+
with expect_exception(ducktape.errors.TimeoutError, lambda e: True):
582+
wait_until(
583+
lambda: describe_topic().start_offset > initial_start_offset,
584+
timeout_sec=30,
585+
backoff_sec=2,
586+
err_msg="Start offset never jumped")
587+
588+
final_start_offset = describe_topic().start_offset
589+
final_hwm = describe_topic().high_watermark
590+
591+
assert final_start_offset == initial_start_offset, \
592+
f"Expected final_start_offset == {initial_start_offset}, got {final_start_offset=}"
593+
594+
assert final_hwm == second_hwm, \
595+
f"Expected final_hwm == {second_hwm}, got {final_hwm=}"

0 commit comments

Comments
 (0)