redpanda-data
diff --git a/‎src/v/cloud_storage/tests/cloud_storage_e2e_test.cc‎
Lines changed: 3 additions & 0 deletions b/‎src/v/cloud_storage/tests/cloud_storage_e2e_test.cc‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/v/cluster/archival/tests/async_data_uploader_fixture.h‎
Lines changed: 1 addition & 0 deletions b/‎src/v/cluster/archival/tests/async_data_uploader_fixture.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/v/cluster/archival/tests/ntp_archiver_reupload_test.cc‎
Lines changed: 1 addition & 0 deletions b/‎src/v/cluster/archival/tests/ntp_archiver_reupload_test.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/v/cluster/cluster_utils.cc‎
Lines changed: 6 additions & 1 deletion b/‎src/v/cluster/cluster_utils.cc‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/v/cluster/data_migration_backend.h‎
Lines changed: 1 addition & 1 deletion b/‎src/v/cluster/data_migration_backend.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/v/cluster/rm_stm.cc‎
Lines changed: 65 additions & 2 deletions b/‎src/v/cluster/rm_stm.cc‎
Lines changed: 65 additions & 2 deletions
diff --git a/‎src/v/cluster/rm_stm.h‎
Lines changed: 10 additions & 1 deletion b/‎src/v/cluster/rm_stm.h‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎src/v/cluster/tests/manual_log_deletion_test.cc‎
Lines changed: 1 addition & 0 deletions b/‎src/v/cluster/tests/manual_log_deletion_test.cc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/v/cluster/tests/rm_stm_test_fixture.h‎
Lines changed: 3 additions & 1 deletion b/‎src/v/cluster/tests/rm_stm_test_fixture.h‎
Lines changed: 3 additions & 1 deletion
@@ -253,6 +253,7 @@ TEST_P(EndToEndFixture, TestProduceConsumeFromCloud) {
       1,
       log->stm_manager()->max_removable_local_log_offset(),
       log->stm_manager()->max_removable_local_log_offset(),
+      log->stm_manager()->max_removable_local_log_offset(),
       std::nullopt,
       std::nullopt,
       std::chrono::milliseconds{0},
@@ -700,6 +701,7 @@ TEST_P(CloudStorageEndToEndManualTest, TestTimequeryAfterArchivalGC) {
       1, // max_bytes_in_log
       log->stm_manager()->max_removable_local_log_offset(),
       log->stm_manager()->max_removable_local_log_offset(),
+      log->stm_manager()->max_removable_local_log_offset(),
       std::nullopt,
       std::nullopt,
       std::chrono::milliseconds{0},
@@ -1123,6 +1125,7 @@ TEST_P(EndToEndFixture, TestCloudStorageTimequery) {
       0,
       log->stm_manager()->max_removable_local_log_offset(),
       log->stm_manager()->max_removable_local_log_offset(),
+      log->stm_manager()->max_removable_local_log_offset(),
       std::nullopt,
       std::nullopt,
       std::chrono::milliseconds{0},
 
@@ -170,6 +170,7 @@ class async_data_uploader_fixture : public redpanda_thread_fixture {
           std::nullopt,
           max_collect_offset,
           max_collect_offset,
+          max_collect_offset,
           std::nullopt,
           std::nullopt,
           std::chrono::milliseconds{0},
 
@@ -255,6 +255,7 @@ struct reupload_fixture : public archiver_fixture {
               std::nullopt,
               max_removable,
               max_removable,
+              max_removable,
               std::nullopt,
               std::nullopt,
               std::chrono::milliseconds{0},
 
@@ -257,7 +257,8 @@ std::optional<shard_placement_target> placement_target_on_node(
 
 partition_state get_partition_state(ss::lw_shared_ptr<partition> partition) {
     partition_state state{};
-    if (unlikely(!partition)) {
+    if (!partition || !partition->log() || !partition->log()->stm_manager())
+      [[unlikely]] {
         return state;
     }
     state.start_offset = partition->raft_start_offset();
@@ -270,6 +271,8 @@ partition_state get_partition_state(ss::lw_shared_ptr<partition> partition) {
     state.revision_id = partition->get_revision_id();
     state.log_size_bytes = partition->size_bytes();
     state.non_log_disk_size_bytes = partition->non_log_disk_size_bytes();
+    state.max_tombstone_removable_offset
+      = partition->log()->stm_manager()->max_tombstone_remove_offset();
     state.is_read_replica_mode_enabled
       = partition->is_read_replica_mode_enabled();
     state.is_remote_fetch_enabled = partition->is_remote_fetch_enabled();
@@ -379,6 +382,8 @@ std::vector<partition_stm_state> get_partition_stm_state(consensus_ptr ptr) {
         state.last_applied_offset = stm->last_applied();
         state.max_removable_local_log_offset
           = stm->max_removable_local_log_offset();
+        state.last_local_snapshot_offset
+          = stm->last_locally_snapshotted_offset();
         result.push_back(std::move(state));
     }
     return result;
 
@@ -363,7 +363,7 @@ class backend {
      * Reconciliation-related data.
      *
      * When we are not the coordinator, _migration_states stores sought states
-     * and topics only, but no partititons, _node_states, _nodes_to_retry and
+     * and topics only, but no partitions, _node_states, _nodes_to_retry and
      * _topic_work_to_retry are empty. The same applies to the migration states
      * with topic scoped work only needed.
      *
 
@@ -257,6 +257,7 @@ ss::future<checked<model::term_id, tx::errc>> rm_stm::begin_tx(
   std::chrono::milliseconds transaction_timeout_ms,
   model::partition_id tm) {
     auto state_lock = co_await _state_lock.hold_read_lock();
+    auto lso_lock_holder = co_await _lso_lock.hold_write_lock();
     if (!co_await sync(_sync_timeout())) {
         vlog(
           _ctx_log.trace,
@@ -738,7 +739,7 @@ ss::future<tx::errc> rm_stm::do_abort_tx(
             // Or it may mean that a tx coordinator
             //   - lost its state
             //   - rolled back to previous op
-            //   - the previous op happend to be an abort
+            //   - the previous op happened to be an abort
             //   - the coordinator retried it
             //
             // In the first case the least impactful way to reject the request.
@@ -1264,6 +1265,9 @@ ss::future<result<kafka_result>> rm_stm::replicate_msg(
 }
 
 model::offset rm_stm::last_stable_offset() {
+    if (_as.abort_requested()) [[unlikely]] {
+        return model::invalid_lso;
+    }
     // There are two main scenarios we deal with here.
     // 1. stm is still bootstrapping
     // 2. stm is past bootstrapping.
@@ -1278,6 +1282,8 @@ model::offset rm_stm::last_stable_offset() {
     // We optimize for the case where there are no inflight transactional
     // batches to return the high water mark.
     auto last_applied = last_applied_offset();
+
+    // scenario 1: still bootstrapping
     if (unlikely(
           !_bootstrap_committed_offset
           || last_applied < _bootstrap_committed_offset.value())) {
@@ -1287,6 +1293,29 @@ model::offset rm_stm::last_stable_offset() {
         return model::invalid_lso;
     }
 
+    // scenario 2: past bootstrapping
+    auto read_units = _state_lock.try_hold_read_lock();
+    if (!read_units) {
+        // A reset in progress means the stm may not be in a consistent state
+        // for LSO calculation. In this case we return the last known LSO to be
+        // conservative.
+        vlog(
+          _ctx_log.trace,
+          "state machine is resetting, last_known_lso: {}, last_applied: {}",
+          _last_known_lso,
+          last_applied);
+        return _last_known_lso;
+    }
+    auto lso_read_units = _lso_lock.try_hold_read_lock();
+    if (!lso_read_units) {
+        // LSO calculation is in progress, return last known LSO
+        vlog(
+          _ctx_log.trace,
+          "lso update in progress, last_known_lso: {}, last_applied: {}",
+          _last_known_lso,
+          last_applied);
+        return _last_known_lso;
+    }
     // Check for any in-flight transactions.
     auto first_tx_start = model::offset::max();
     if (_is_tx_enabled && !_active_tx_producers.empty()) {
@@ -1313,6 +1342,39 @@ model::offset rm_stm::last_stable_offset() {
         // transactions.
         lso = std::min(first_tx_start, next_to_apply);
     } else if (synced_leader) {
+        ////////////////  WARNING ///////////
+        // there is a real bug lurking here that overestimates the LSO beyond
+        // an open transaction.
+        //
+
+        // The problem manifests when the LSO is requested after successful
+        // replication of begin_tx batch but before the stm has applied it.
+        // In this case the LSO may be advanced beyond the begin_tx batch offset
+        // because the leader doesn't yet 'know' about the begin_tx batch and
+        // may not consider it in LSO calculation.
+
+        // Another problem is that we do not let the LSO move backwards once
+        // computed (see _last_known_lso update below). So even if the
+        // stm applies the begin_tx later, we will not correct
+        // the LSO to reflect the begin_tx presence.
+
+        // There is a test that caught this issue in rm_stm_tests which
+        // is disabled for now until we can fix the underlying problem.
+
+        // The impact of this overestimation is that compaction may compact
+        // away an open transaction's begin marker, as it relies on the LSO.
+        // This is rare but not impossible :(. If at that point the replica
+        // restarts and there are no further updates in the transaction, the
+        // transaction has no record of ever beginning.
+
+        // We need a better way to track in-flight transactions for the purposes
+        // of LSO calculation.
+
+        // An obvious solution is to clamp LSO to next_to_apply in all cases
+        // but it was tried in the past and caused performance regressions
+        // in non transaction workloads like write_caching, acks=0/1. So that
+        // is not a viable solution.
+
         // no inflight transactions in (last_applied, last_visible_index]
         lso = model::next_offset(last_visible_index);
     } else {
@@ -1834,14 +1896,15 @@ model::offset rm_stm::to_log_offset(kafka::offset k_offset) const {
 
 ss::future<raft::local_snapshot_applied>
 rm_stm::apply_local_snapshot(raft::stm_snapshot_header hdr, iobuf&& tx_ss_buf) {
+    auto data_buf = std::move(tx_ss_buf);
     auto units = co_await _state_lock.hold_write_lock();
 
     vlog(
       _ctx_log.trace,
       "applying snapshot with last included offset: {}",
       hdr.offset);
     tx_snapshot_v6 data;
-    iobuf_parser data_parser(std::move(tx_ss_buf));
+    iobuf_parser data_parser(std::move(data_buf));
     if (hdr.version == tx_snapshot_v4::version) {
         tx_snapshot_v4 data_v4
           = co_await reflection::async_adl<tx_snapshot_v4>{}.from(data_parser);
 
@@ -117,7 +117,7 @@ namespace cluster {
  * This stm periodically checks if there is any pending transaction for
  * expiration. The expiration kicks in if the transaction is not committed/aborted
  * within the user set transaction timeout. A producer with an active
- * transaction cannot be evicted, so exipration ensures that with timely
+ * transaction cannot be evicted, so expiration ensures that with timely
  * expiration of open transactions, the producer states are candidates for
  * eviction.
  */
@@ -435,6 +435,15 @@ class rm_stm final : public raft::persisted_stm<> {
     model::producer_id _highest_producer_id;
     // for monotonicity of computed LSO.
     model::offset _last_known_lso{-1};
+    /**
+     * LSO lock protects the LSO from being exposed before transaction begin
+     * batch is applied.
+     *
+     * The lock is acquired in write mode when a begin transaction batch is
+     * being handled, preventing exposure of a potentially invalid LSO until
+     * the begin batch is applied.
+     */
+    ss::rwlock _lso_lock;
 
     friend struct ::rm_stm_test_fixture;
 };
 
@@ -98,6 +98,7 @@ struct manual_deletion_fixture : public raft::raft_fixture {
                 100_MiB,
                 model::offset::max(),
                 model::offset::max(),
+                model::offset::max(),
                 std::nullopt,
                 std::nullopt,
                 std::chrono::milliseconds{0},
 
@@ -20,11 +20,13 @@
 static ss::logger logger{"rm_stm-test"};
 static prefix_logger ctx_logger{logger, ""};
 
+static constexpr auto large_timeout = std::chrono::minutes(30);
+
 struct rm_stm_test_fixture : simple_raft_fixture {
     void create_stm_and_start_raft(
       storage::ntp_config::default_overrides overrides = {}) {
         max_concurent_producers.start(std::numeric_limits<size_t>::max()).get();
-        producer_expiration_ms.start(std::chrono::milliseconds::max()).get();
+        producer_expiration_ms.start(large_timeout).get();
         producer_state_manager
           .start(
             ss::sharded_parameter(
Original file line number	Diff line number	Diff line change
`@@ -363,7 +363,7 @@ class backend {`
`363`	`363`	`* Reconciliation-related data.`
`364`	`364`	`*`
`365`	`365`	`* When we are not the coordinator, _migration_states stores sought states`
`366`		`- * and topics only, but no partititons, _node_states, _nodes_to_retry and`
	`366`	`+ * and topics only, but no partitions, _node_states, _nodes_to_retry and`
`367`	`367`	`* _topic_work_to_retry are empty. The same applies to the migration states`
`368`	`368`	`* with topic scoped work only needed.`
`369`	`369`	`*`