Commit 4127e17

Merge pull request #29109 from bashtanov/even_greater_tx_compaction_backport
[v25.3.x] Even greater TX Compaction backport
2 parents 9179deb + 922d72a commit 4127e17

67 files changed: +2134, -779 lines

src/v/cloud_storage/tests/cloud_storage_e2e_test.cc

Lines changed: 3 additions & 0 deletions

@@ -253,6 +253,7 @@ TEST_P(EndToEndFixture, TestProduceConsumeFromCloud) {
   1,
   log->stm_manager()->max_removable_local_log_offset(),
   log->stm_manager()->max_removable_local_log_offset(),
+  log->stm_manager()->max_removable_local_log_offset(),
   std::nullopt,
   std::nullopt,
   std::chrono::milliseconds{0},

@@ -700,6 +701,7 @@ TEST_P(CloudStorageEndToEndManualTest, TestTimequeryAfterArchivalGC) {
   1, // max_bytes_in_log
   log->stm_manager()->max_removable_local_log_offset(),
   log->stm_manager()->max_removable_local_log_offset(),
+  log->stm_manager()->max_removable_local_log_offset(),
   std::nullopt,
   std::nullopt,
   std::chrono::milliseconds{0},

@@ -1123,6 +1125,7 @@ TEST_P(EndToEndFixture, TestCloudStorageTimequery) {
   0,
   log->stm_manager()->max_removable_local_log_offset(),
   log->stm_manager()->max_removable_local_log_offset(),
+  log->stm_manager()->max_removable_local_log_offset(),
   std::nullopt,
   std::nullopt,
   std::chrono::milliseconds{0},

src/v/cluster/archival/tests/async_data_uploader_fixture.h

Lines changed: 1 addition & 0 deletions

@@ -170,6 +170,7 @@ class async_data_uploader_fixture : public redpanda_thread_fixture {
   std::nullopt,
   max_collect_offset,
   max_collect_offset,
+  max_collect_offset,
   std::nullopt,
   std::nullopt,
   std::chrono::milliseconds{0},

src/v/cluster/archival/tests/ntp_archiver_reupload_test.cc

Lines changed: 1 addition & 0 deletions

@@ -255,6 +255,7 @@ struct reupload_fixture : public archiver_fixture {
   std::nullopt,
   max_removable,
   max_removable,
+  max_removable,
   std::nullopt,
   std::nullopt,
   std::chrono::milliseconds{0},

src/v/cluster/cluster_utils.cc

Lines changed: 6 additions & 1 deletion

@@ -257,7 +257,8 @@ std::optional<shard_placement_target> placement_target_on_node(

partition_state get_partition_state(ss::lw_shared_ptr<partition> partition) {
    partition_state state{};
-   if (unlikely(!partition)) {
+   if (!partition || !partition->log() || !partition->log()->stm_manager())
+     [[unlikely]] {
        return state;
    }
    state.start_offset = partition->raft_start_offset();

@@ -270,6 +271,8 @@ partition_state get_partition_state(ss::lw_shared_ptr<partition> partition) {
    state.revision_id = partition->get_revision_id();
    state.log_size_bytes = partition->size_bytes();
    state.non_log_disk_size_bytes = partition->non_log_disk_size_bytes();
+   state.max_tombstone_removable_offset
+     = partition->log()->stm_manager()->max_tombstone_remove_offset();
    state.is_read_replica_mode_enabled
      = partition->is_read_replica_mode_enabled();
    state.is_remote_fetch_enabled = partition->is_remote_fetch_enabled();

@@ -379,6 +382,8 @@ std::vector<partition_stm_state> get_partition_stm_state(consensus_ptr ptr) {
        state.last_applied_offset = stm->last_applied();
        state.max_removable_local_log_offset
          = stm->max_removable_local_log_offset();
+       state.last_local_snapshot_offset
+         = stm->last_locally_snapshotted_offset();
        result.push_back(std::move(state));
    }
    return result;

src/v/cluster/data_migration_backend.h

Lines changed: 1 addition & 1 deletion

@@ -363,7 +363,7 @@ class backend {
     * Reconciliation-related data.
     *
     * When we are not the coordinator, _migration_states stores sought states
-    * and topics only, but no partititons, _node_states, _nodes_to_retry and
+    * and topics only, but no partitions, _node_states, _nodes_to_retry and
     * _topic_work_to_retry are empty. The same applies to the migration states
     * with topic scoped work only needed.
     *

src/v/cluster/rm_stm.cc

Lines changed: 65 additions & 2 deletions

@@ -257,6 +257,7 @@ ss::future<checked<model::term_id, tx::errc>> rm_stm::begin_tx(
  std::chrono::milliseconds transaction_timeout_ms,
  model::partition_id tm) {
    auto state_lock = co_await _state_lock.hold_read_lock();
+   auto lso_lock_holder = co_await _lso_lock.hold_write_lock();
    if (!co_await sync(_sync_timeout())) {
        vlog(
          _ctx_log.trace,

@@ -738,7 +739,7 @@ ss::future<tx::errc> rm_stm::do_abort_tx(
    // Or it may mean that a tx coordinator
    // - lost its state
    // - rolled back to previous op
-   // - the previous op happend to be an abort
+   // - the previous op happened to be an abort
    // - the coordinator retried it
    //
    // In the first case the least impactful way to reject the request.

@@ -1264,6 +1265,9 @@ ss::future<result<kafka_result>> rm_stm::replicate_msg(
}

model::offset rm_stm::last_stable_offset() {
+   if (_as.abort_requested()) [[unlikely]] {
+       return model::invalid_lso;
+   }
    // There are two main scenarios we deal with here.
    // 1. stm is still bootstrapping
    // 2. stm is past bootstrapping.

@@ -1278,6 +1282,8 @@ model::offset rm_stm::last_stable_offset() {
    // We optimize for the case where there are no inflight transactional
    // batch to return the high water mark.
    auto last_applied = last_applied_offset();
+
+   // scenario 1: still bootstrapping
    if (unlikely(
          !_bootstrap_committed_offset
          || last_applied < _bootstrap_committed_offset.value())) {

@@ -1287,6 +1293,29 @@ model::offset rm_stm::last_stable_offset() {
        return model::invalid_lso;
    }

+   // scenario 2: past bootstrapping
+   auto read_units = _state_lock.try_hold_read_lock();
+   if (!read_units) {
+       // A reset in progress means the stm may not be in a consistent state
+       // for LSO calculation. In this case we return the last known LSO to be
+       // conservative.
+       vlog(
+         _ctx_log.trace,
+         "state machine is resetting, last_known_lso: {}, last_applied: {}",
+         _last_known_lso,
+         last_applied);
+       return _last_known_lso;
+   }
+   auto lso_read_units = _lso_lock.try_hold_read_lock();
+   if (!lso_read_units) {
+       // LSO calculation is in progress, return last known LSO
+       vlog(
+         _ctx_log.trace,
+         "lso update in progress, last_known_lso: {}, last_applied: {}",
+         _last_known_lso,
+         last_applied);
+       return _last_known_lso;
+   }
    // Check for any in-flight transactions.
    auto first_tx_start = model::offset::max();
    if (_is_tx_enabled && !_active_tx_producers.empty()) {

@@ -1313,6 +1342,39 @@ model::offset rm_stm::last_stable_offset() {
        // transactions.
        lso = std::min(first_tx_start, next_to_apply);
    } else if (synced_leader) {
+       //////////////// WARNING ///////////
+       // there is a real bug lurking here that overestimates the LSO beyond
+       // an open transaction.
+       //
+
+       // The problem manifests when the LSO is requested after successful
+       // replication of begin_tx batch but before the stm has applied it.
+       // In this case the LSO may be advanced beyond the begin_tx batch offset
+       // because the leader doesn't yet 'know' about the begin_tx batch and
+       // may not consider it in LSO calculation.
+
+       // Another problem is we do not let lso move backwards once
+       // computed (see _last_known_lso update below), So even if the
+       // stm has applied the begin_tx later, we will not correct
+       // the LSO to reflect the begin_tx presence.
+
+       // There is a test that caught this issue in rm_stm_tests which
+       // is disabled for now until we can fix the underlying problem.
+
+       // The impact of this overestimation is that compaction may compact
+       // away open transaction begin marker as it relies on LSO. The
+       // chances are rare but not impossible :(. if at that point the replica
+       // restarts and there are no further updates in the transaction, the
+       // transaction has no record of ever beginning.
+
+       // We need a better way to track in-flight transactions for the purposes
+       // of LSO calculation.
+
+       // An obvious solution is to clamp LSO to next_to_apply in all cases
+       // but it was tried in the past and caused performance regressions
+       // in non transaction workloads like write_caching, acks=0/1. So that
+       // is not a viable solution.
+
        // no inflight transactions in (last_applied, last_visible_index]
        lso = model::next_offset(last_visible_index);
    } else {

@@ -1834,14 +1896,15 @@ model::offset rm_stm::to_log_offset(kafka::offset k_offset) const {

ss::future<raft::local_snapshot_applied>
rm_stm::apply_local_snapshot(raft::stm_snapshot_header hdr, iobuf&& tx_ss_buf) {
+   auto data_buf = std::move(tx_ss_buf);
    auto units = co_await _state_lock.hold_write_lock();

    vlog(
      _ctx_log.trace,
      "applying snapshot with last included offset: {}",
      hdr.offset);
    tx_snapshot_v6 data;
-   iobuf_parser data_parser(std::move(tx_ss_buf));
+   iobuf_parser data_parser(std::move(data_buf));
    if (hdr.version == tx_snapshot_v4::version) {
        tx_snapshot_v4 data_v4
          = co_await reflection::async_adl<tx_snapshot_v4>{}.from(data_parser);

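Taken together, the rm_stm.cc changes turn last_stable_offset() into a non-blocking read path: it try-acquires _state_lock and _lso_lock in read mode and, if either is busy (an stm reset or a begin_tx in flight), re-publishes _last_known_lso rather than computing a value it cannot trust. The sketch below is a standalone illustration of that pattern, not Redpanda code: it assumes std::shared_mutex in place of Seastar's ss::rwlock, and the identifiers lso_tracker, with_begin_tx and freshly_computed are invented for the example.

// Standalone sketch of the non-blocking LSO read path, assuming
// std::shared_mutex in place of Seastar's ss::rwlock. All identifiers are
// illustrative; the real logic lives in rm_stm::last_stable_offset().
#include <algorithm>
#include <cstdint>
#include <mutex>
#include <shared_mutex>

struct lso_tracker {
    std::shared_mutex state_lock;  // stands in for _state_lock
    std::shared_mutex lso_lock;    // stands in for _lso_lock
    int64_t last_known_lso{-1};    // stands in for _last_known_lso

    // Writer side: hold the LSO lock exclusively while a begin_tx batch is
    // being handled, so readers cannot observe an LSO past the open tx.
    template <typename Fn>
    void with_begin_tx(Fn&& handle_begin) {
        std::unique_lock w{lso_lock};
        handle_begin();
    }

    // Reader side: never blocks. If either lock is unavailable, hand back
    // the last LSO we already published instead of recomputing.
    int64_t last_stable_offset(int64_t freshly_computed) {
        std::shared_lock state{state_lock, std::try_to_lock};
        if (!state.owns_lock()) {
            return last_known_lso; // stm reset in progress
        }
        std::shared_lock lso{lso_lock, std::try_to_lock};
        if (!lso.owns_lock()) {
            return last_known_lso; // begin_tx handling in progress
        }
        // the published LSO never moves backwards
        last_known_lso = std::max(last_known_lso, freshly_computed);
        return last_known_lso;
    }
};

int main() {
    lso_tracker t;
    t.with_begin_tx([] { /* replicate and apply the begin batch */ });
    return t.last_stable_offset(42) == 42 ? 0 : 1;
}

The design choice mirrored here is that the reader prefers a stale but safe answer over waiting: LSO is queried on hot paths, so blocking on the state machine would be worse than briefly serving the previously published offset.
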
src/v/cluster/rm_stm.h

Lines changed: 10 additions & 1 deletion

@@ -117,7 +117,7 @@ namespace cluster {
 * This stm periodically checks if there is any pending transaction for
 * expiration. The expiration kicks in the transaction is not committed/aborted
 * within the user set transaction timeout. A producer with an active
-* transaction cannot be evicted, so exipration ensures that with timely
+* transaction cannot be evicted, so expiration ensures that with timely
 * expiration of open transactions, the producer states are candidates for
 * eviction.
 */

@@ -435,6 +435,15 @@ class rm_stm final : public raft::persisted_stm<> {
    model::producer_id _highest_producer_id;
    // for monotonicity of computed LSO.
    model::offset _last_known_lso{-1};
+   /**
+    * LSO lock protects the LSO from being exposed before transaction begin
+    * batch is applied.
+    *
+    * The lock is acquired in write mode when a begin transaction batch is
+    * being handled protecting exposure of potentially invalid LSO until the
+    * begin batch is applied.
+    */
+   ss::rwlock _lso_lock;

    friend struct ::rm_stm_test_fixture;
};

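The header comment above describes the intended interplay: begin_tx handles the begin batch under the write side of _lso_lock, so a concurrent LSO reader either sees fully handled state or falls back to the previously published value. The sketch in the rm_stm.cc section never actually exercises that fallback, so here is a deterministic, self-contained two-thread illustration of the contention case. It again substitutes std::shared_mutex for Seastar's ss::rwlock, and the concrete numbers (7 as the previously published LSO, 100 as a freshly computed one) are made up for the demo.

// Two-thread demo: while the "begin_tx" side holds the LSO lock exclusively,
// the reader cannot take the shared side and re-publishes the old LSO.
// std::shared_mutex stands in for ss::rwlock; all values are invented.
#include <atomic>
#include <cassert>
#include <shared_mutex>
#include <thread>

int main() {
    std::shared_mutex lso_lock;            // stands in for _lso_lock
    std::atomic<bool> begin_in_flight{false};
    std::atomic<bool> reader_done{false};
    const long long last_known_lso = 7;    // previously published LSO

    std::thread writer([&] {
        std::unique_lock w{lso_lock};      // begin_tx: write side of the lock
        begin_in_flight = true;
        while (!reader_done) {             // keep the begin batch "in flight"
            std::this_thread::yield();
        }
    });                                    // lock released: begin handling done

    while (!begin_in_flight) {
        std::this_thread::yield();
    }

    // last_stable_offset(): the try-lock fails, so return the last known LSO
    // instead of a freshly computed (and potentially invalid) one.
    long long lso;
    {
        std::shared_lock r{lso_lock, std::try_to_lock};
        lso = r.owns_lock() ? 100 : last_known_lso;
    }
    reader_done = true;
    writer.join();

    assert(lso == 7); // the reader never saw an LSO past the open transaction
    return 0;
}
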
src/v/cluster/tests/manual_log_deletion_test.cc

Lines changed: 1 addition & 0 deletions

@@ -98,6 +98,7 @@ struct manual_deletion_fixture : public raft::raft_fixture {
   100_MiB,
   model::offset::max(),
   model::offset::max(),
+  model::offset::max(),
   std::nullopt,
   std::nullopt,
   std::chrono::milliseconds{0},

src/v/cluster/tests/rm_stm_test_fixture.h

Lines changed: 3 additions & 1 deletion

@@ -20,11 +20,13 @@
static ss::logger logger{"rm_stm-test"};
static prefix_logger ctx_logger{logger, ""};

+static constexpr auto large_timeout = std::chrono::minutes(30);
+
struct rm_stm_test_fixture : simple_raft_fixture {
    void create_stm_and_start_raft(
      storage::ntp_config::default_overrides overrides = {}) {
        max_concurent_producers.start(std::numeric_limits<size_t>::max()).get();
-       producer_expiration_ms.start(std::chrono::milliseconds::max()).get();
+       producer_expiration_ms.start(large_timeout).get();
        producer_state_manager
          .start(
            ss::sharded_parameter(

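The fixture swaps producer_expiration_ms from std::chrono::milliseconds::max() to a 30-minute constant. The diff does not state why, but a plausible reason (my assumption, not the author's words) is the usual chrono pitfall: duration::max() overflows as soon as it is converted to a finer unit or added to a clock time point, which on common implementations uses a 64-bit nanosecond representation. The snippet below only demonstrates that representability gap; it is not Redpanda code.

// Shows why milliseconds::max() is a risky "effectively infinite" timeout:
// it cannot be represented once converted to 64-bit nanoseconds.
#include <chrono>
#include <iostream>

int main() {
    using namespace std::chrono;
    std::cout << "milliseconds::max():        " << milliseconds::max().count()
              << " ms (~292 million years)\n";
    std::cout << "largest ms that fits in ns: "
              << duration_cast<milliseconds>(nanoseconds::max()).count()
              << " ms (~292 years)\n";
    std::cout << "minutes(30) in ms:          "
              << duration_cast<milliseconds>(minutes(30)).count() << " ms\n";
}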