Merge pull request #26822 from oleiman/manual-backport-26803-v24.2.x-185

oleiman · web-flow · commit 754dc2c89858 · 2025-07-15T15:31:42.000-07:00
diff --git a/src/v/archival/archival_metadata_stm.cc b/src/v/archival/archival_metadata_stm.cc
@@ -1330,6 +1330,21 @@ archival_metadata_stm::take_local_snapshot(ssx::semaphore_units apply_units) {
       0, snapshot_offset, std::move(snap_data));
 }
 
+model::offset archival_metadata_stm::cloud_recoverable_offset() {
+    auto lo = get_last_offset();
+    if (_manifest->size() == 0 && lo == model::offset{0}) {
+        lo = model::offset::min();
+    }
+
+    // Do not collect past the offset we last uploaded manifest for: this is
+    // needed for correctness because the remote manifest is used in
+    // handle_eviction() - it is what a remote node doing snapshot-driven
+    // raft recovery will use to start from.
+    lo = std::min(lo, _last_clean_at);
+
+    return lo;
+}
+
 model::offset archival_metadata_stm::max_collectible_offset() {
     // From Redpanda 22.3 up, the ntp_config's impression of whether
     // archival is enabled is authoritative.
@@ -1352,18 +1367,7 @@ model::offset archival_metadata_stm::max_collectible_offset() {
         // need to interact with local retention.
         return model::offset::max();
     }
-    auto lo = get_last_offset();
-    if (_manifest->size() == 0 && lo == model::offset{0}) {
-        lo = model::offset::min();
-    }
-
-    // Do not collect past the offset we last uploaded manifest for: this is
-    // needed for correctness because the remote manifest is used in
-    // handle_eviction() - it is what a remote node doing snapshot-driven
-    // raft recovery will use to start from.
-    lo = std::min(lo, _last_clean_at);
-
-    return lo;
+    return cloud_recoverable_offset();
 }
 
 void archival_metadata_stm::maybe_notify_waiter(cluster::errc err) noexcept {
diff --git a/src/v/archival/archival_metadata_stm.h b/src/v/archival/archival_metadata_stm.h
@@ -273,6 +273,16 @@ class archival_metadata_stm final : public raft::persisted_stm<> {
 
     model::offset get_last_clean_at() const { return _last_clean_at; };
 
+    /// Returns the maximum offset which is guaranteed to be recoverable from
+    /// cloud storage.
+    ///
+    /// This is the lesser of the last offset uploaded to cloud storage and the
+    /// last offset we uploaded a manifest for.
+    ///
+    /// If the manifest is empty and the last uploaded offset is 0, returns
+    /// offset::min(), indicating that nothing is recoverable from cloud.
+    model::offset cloud_recoverable_offset();
+
     model::offset max_collectible_offset() override;
 
     ss::future<iobuf> take_snapshot(model::offset) final { co_return iobuf{}; }
diff --git a/src/v/cluster/controller_backend.cc b/src/v/cluster/controller_backend.cc
@@ -585,12 +585,32 @@ controller_backend::calculate_learner_initial_offset(
      * Initial learner start offset only makes sense for partitions with cloud
      * storage data
      */
+    if (auto tp_cfg = p->get_topic_config();
+        tp_cfg.has_value() && tp_cfg->get().is_internal()) {
+        vlog(clusterlog.trace, "{} is part of an internal topic", p->ntp());
+        return std::nullopt;
+    }
+
     if (!p->cloud_data_available()) {
         vlog(clusterlog.trace, "no cloud data available for: {}", p->ntp());
         return std::nullopt;
     }
 
+    if (p->get_cloud_storage_mode() != cluster::cloud_storage_mode::full) {
+        vlog(
+          clusterlog.trace,
+          "cloud storage not fully enabled for: {}",
+          p->ntp());
+        return std::nullopt;
+    }
+
+    if (p->archival_meta_stm() == nullptr) {
+        vlog(clusterlog.trace, "no archival_meta_stm for {}", p->ntp());
+        return std::nullopt;
+    }
+
     auto log = p->log();
+
     /**
      * Calculate retention targets based on cluster and topic configuration
      */
@@ -670,20 +690,39 @@ controller_backend::calculate_learner_initial_offset(
         return std::nullopt;
     }
 
-    auto const cloud_storage_safe_offset
+    auto cloud_storage_safe_offset
       = p->archival_meta_stm()->max_collectible_offset();
+    auto archival_safe_removable
+      = p->archival_meta_stm()->cloud_recoverable_offset();
+
     /**
      * Last offset uploaded to the cloud is target learner retention upper
      * bound. We can not start retention recover from the point which is not yet
      * uploaded to Cloud Storage.
+     *
+     * In general cloud_storage_safe_offset should not exceed the cloud
+     * recoverable offset, but it can if, e.g., archival is disabled or paused.
      */
+
+    if (cloud_storage_safe_offset > archival_safe_removable) {
+        vlog(
+          clusterlog.info,
+          "[{}] cloud_storage_safe_offset {} exceeds last uploaded to "
+          "cloud {}, clamping to {}",
+          p->ntp(),
+          cloud_storage_safe_offset,
+          archival_safe_removable,
+          archival_safe_removable);
+        cloud_storage_safe_offset = archival_safe_removable;
+    }
+
     vlog(
       clusterlog.info,
       "[{}] calculated retention offset: {}, last uploaded to cloud: {}, "
       "manifest clean offset: {}, max_collectible_offset: {}",
       p->ntp(),
       *retention_offset,
-      p->archival_meta_stm()->manifest().get_last_offset(),
+      archival_safe_removable,
       p->archival_meta_stm()->get_last_clean_at(),
       cloud_storage_safe_offset);
 
diff --git a/tests/rptest/tests/node_pool_migration_test.py b/tests/rptest/tests/node_pool_migration_test.py
@@ -10,6 +10,7 @@
 from concurrent.futures import ThreadPoolExecutor
 import random
 import re
+import json
 
 import requests
 from rptest.clients.kafka_cat import KafkaCat
@@ -23,9 +24,13 @@
 from rptest.clients.types import TopicSpec
 from rptest.services.admin import Admin
 from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, SISettings
+from rptest.util import expect_exception
 from rptest.utils.mode_checks import cleanup_on_early_exit
 from rptest.utils.node_operations import NodeDecommissionWaiter
+from rptest.utils.mode_checks import skip_debug_mode
+from rptest.tests.redpanda_test import RedpandaTest
 from enum import Enum
+import ducktape.errors
 
 TS_LOG_ALLOW_LIST = [
     re.compile(
@@ -45,21 +50,10 @@ def has_tiered_storage(self):
         return self.value == self.TIRED_STORAGE or self.value == self.FAST_MOVES
 
 
-class NodePoolMigrationTest(PreallocNodesTest):
-    """
-    Basic nodes decommissioning test.
-    """
-    def __init__(self, test_context):
+class NodePoolMigrationTestBase(PreallocNodesTest):
+    def __init__(self, *args, **kwargs):
         self._topic = None
-
-        super(NodePoolMigrationTest, self).__init__(
-            test_context=test_context,
-            num_brokers=10,
-            node_prealloc_count=1,
-            si_settings=SISettings(test_context,
-                                   cloud_storage_enable_remote_read=True,
-                                   cloud_storage_enable_remote_write=True,
-                                   fast_uploads=True))
+        super(NodePoolMigrationTestBase, self).__init__(*args, **kwargs)
 
     def setup(self):
         # defer starting redpanda to test body
@@ -269,6 +263,27 @@ def _replicas_per_node(self):
 
         return node_replicas
 
+
+class NodePoolMigrationTest(NodePoolMigrationTestBase):
+    """
+    Basic nodes decommissioning test.
+    """
+    def __init__(self, test_context):
+        self._topic = None
+
+        super(NodePoolMigrationTest, self).__init__(
+            test_context=test_context,
+            num_brokers=10,
+            node_prealloc_count=1,
+            si_settings=SISettings(test_context,
+                                   cloud_storage_enable_remote_read=True,
+                                   cloud_storage_enable_remote_write=True,
+                                   fast_uploads=True))
+
+    def setup(self):
+        # defer starting redpanda to test body
+        pass
+
     @cluster(num_nodes=11,
              log_allow_list=RESTART_LOG_ALLOW_LIST + TS_LOG_ALLOW_LIST)
     @matrix(balancing_mode=["off", 'node_add'],
@@ -399,3 +414,182 @@ def _quiescent_state():
             self.redpanda.stop_node(n)
 
         self.verify()
+
+
+class DisableTestMode(str, Enum):
+    DISABLE = "disable tiered storage"
+    PAUSE = "pause uploads"
+
+    def do_disable(self, test: RedpandaTest, topic_name: str):
+        if self.value == self.DISABLE:
+            test.client().alter_topic_config(topic_name,
+                                             'redpanda.remote.read', 'false')
+            test.client().alter_topic_config(topic_name,
+                                             'redpanda.remote.write', 'false')
+        elif self.value == self.PAUSE:
+            test.client().alter_topic_config(topic_name,
+                                             'redpanda.remote.allowgaps',
+                                             'true')
+            test.redpanda.set_cluster_config(
+                {"cloud_storage_enable_segment_uploads": False})
+
+
+class DisableTieredStorageTest(NodePoolMigrationTestBase):
+    def __init__(self, test_context):
+        self._topic = None
+
+        super(DisableTieredStorageTest, self).__init__(
+            test_context=test_context,
+            num_brokers=3,
+            node_prealloc_count=1,
+            si_settings=SISettings(test_context,
+                                   cloud_storage_enable_remote_read=True,
+                                   cloud_storage_enable_remote_write=True,
+                                   fast_uploads=True))
+
+    def setup(self):
+        # defer starting redpanda to test body
+        pass
+
+    @cluster(num_nodes=4,
+             log_allow_list=RESTART_LOG_ALLOW_LIST + TS_LOG_ALLOW_LIST)
+    @matrix(disable_mode=[
+        DisableTestMode.DISABLE,
+        # Removed in backport, not yet implemented
+        # DisableTestMode.PAUSE,
+    ])
+    def test_disable_tiered_storage(self, disable_mode: DisableTestMode):
+        '''
+        This test performs the following actions:
+          - Create a tiered storage topic
+          - Produce some data and wait for cloud storage upload
+          - Disable tiered storage on the topic
+          - Produce some more data (note no additional upload)
+          - Decommission leader to force leadership transfer
+          - Check that start offset and high watermark on the new leader reflect
+            the full content of the original leader's raft log prior to decom.
+        '''
+
+        self.redpanda.start()
+        cfg = {"partition_autobalancing_mode": 'node_add'}
+        cfg["cloud_storage_enable_remote_write"] = True
+        cfg["cloud_storage_enable_remote_read"] = True
+        # we want data to be actually deleted
+        cfg["retention_local_strict"] = True
+
+        # we need to configure a small amount of initial local retention,
+        # otherwise we get the hwm, batch boundary adjustment fails, and we
+        # fall back to setting the learner to start at offset 0
+        self.redpanda.set_cluster_config({
+            "initial_retention_local_target_bytes_default":
+            self.segment_size * 2
+        })
+
+        self.admin.patch_cluster_config(upsert=cfg)
+
+        spec = TopicSpec(
+            name=f"migration-test",
+            partition_count=1,
+            replication_factor=1,
+            cleanup_policy='compact',
+            segment_bytes=self.segment_size,
+        )
+        self.client().create_topic(spec)
+        self._topic = spec.name
+        rpk = RpkTool(self.redpanda)
+
+        def describe_topic():
+            info = None
+            while info is None:
+                for i in rpk.describe_topic(spec.name):
+                    info = i
+            self.logger.debug(f"{info}")
+            return info
+
+        self.start_producer()
+        self.producer.wait(timeout_sec=60)
+
+        info = describe_topic()
+
+        initial_start_offset = info.start_offset
+        initial_hwm = info.high_watermark
+
+        def pm_last_offset():
+            v = self.admin.get_partition_manifest(spec.name, 0)['last_offset']
+            return v
+
+        self.logger.debug("Wait until most of the topic is uploaded")
+
+        wait_until(lambda: pm_last_offset() >= initial_hwm,
+                   timeout_sec=30,
+                   backoff_sec=2,
+                   err_msg="Partition never uploaded")
+
+        self.logger.debug(
+            f"Now {disable_mode} and produce some more to put HWM well above the last uploaded offset"
+        )
+        disable_mode.do_disable(self, spec.name)
+
+        last_uploaded = pm_last_offset()
+
+        self.start_producer()
+        self.producer.wait(timeout_sec=60)
+
+        info = describe_topic()
+        second_start_offset = info.start_offset
+        second_hwm = info.high_watermark
+
+        assert pm_last_offset() == last_uploaded, \
+            f"Unexpectedly uploaded more data {pm_last_offset()} > {last_uploaded}"
+
+        self.logger.debug(
+            "Decommission the partition's leader and wait for leadership transfer"
+        )
+
+        leader_id = self.admin.get_partition_leader(namespace='kafka',
+                                                    topic=spec.name,
+                                                    partition=0)
+
+        self._decommission(leader_id, decommissioned_ids=[leader_id])
+
+        def new_leader_id():
+            partition_info = self.admin.get_partitions(topic=spec.name,
+                                                       partition=0,
+                                                       namespace='kafka',
+                                                       node=None)
+            self.logger.debug(f"{partition_info=}")
+            new_id = self.admin.get_partition_leader(namespace='kafka',
+                                                     topic=spec.name,
+                                                     partition=0)
+            self.logger.debug(f"{new_id=}")
+            return new_id
+
+        wait_until(lambda: new_leader_id() not in [leader_id, -1],
+                   timeout_sec=60,
+                   backoff_sec=2,
+                   err_msg="Partition didn't move")
+
+        if disable_mode == DisableTestMode.DISABLE:
+            self.logger.debug(
+                "With tiered storage disabled, we should skip FPM truncation and transfer the whole log via raft"
+            )
+        elif disable_mode == DisableTestMode.PAUSE:
+            self.logger.debug(
+                "With uploads paused, FPM should truncate only up to the last uploaded offset to avoid introducing a gap in the log"
+            )
+
+        with expect_exception(ducktape.errors.TimeoutError, lambda e: True):
+            wait_until(
+                lambda: describe_topic().start_offset > initial_start_offset,
+                timeout_sec=30,
+                backoff_sec=2,
+                err_msg="Start offset never jumped")
+
+        final_start_offset = describe_topic().start_offset
+        final_hwm = describe_topic().high_watermark
+
+        assert final_start_offset == initial_start_offset, \
+            f"Expected final_start_offset == {initial_start_offset}, got {final_start_offset=}"
+
+        assert final_hwm == second_hwm, \
+            f"Expected final_hwm == {second_hwm}, got {final_hwm=}"