Skip to content

Commit c96f7dd

Browse files
committed
[#29215] DocDB: Fix txn_start_us DFATAL for DDL entering tablet wait-queue
Summary: Earlier, ysql was propagating `pg_txn_start_us` for regular transactions ('kPlain`) alone, which was being used at the tablet wait-queue for prioritizing waiter resumption. This was being done for DMLs alone with the assumption that DDLs don't enter the wait-queue and instead abort active transactions (with object locking disabled). Refer https://phorge.dev.yugabyte.com/D28756 for details. But this assumption doesn't seem to hold true with colocated database with index, as the following drop table results in a write query to the index table. ``` $CREATE DATABASE db with COLOCATION = true; $\c db; $CREATE TABLE test (k INT, v INT, PRIMARY KEY (k ASC)); $CREATE UNIQUE INDEX idx_1 ON test (v ASC); $DROP INDEX idx_1; ``` and when there are active DML transactions operating at the table, the drop DDL enters the wait-queue, which leads to the below DFATAL ``` [ts-1] F1101 11:02:59.304080 661742 wait_queue.cc:268] TxnId: 6557ecea-23bc-44b9-8d55-6c264ae0b958, ReqId: 612 Expected non-zero txn_start_us ``` This revision addresses the issue by propagating the pg_txn_start_us value from ysql to docdb, and initializes `YBTransaction` with the same so that the write rpc picks this up, and the tablet wait-queue eventually honors the waiter resumption logic. Note: With object locking enabled, the DFATAL wasn't getting triggered earlier, since the drop table gets gated the the object lock manager layer itself, and doesn't proceed to the tablet layer until the active DMLs finish. And once the active DMLs finish, it wouldn't enter the tablet wait-queue anyways. Jira: DB-18983 Test Plan: Jenkins ``` ./yb_build.sh --cxx-test='TEST_F(PgWaitQueuesTestWithoutObjectLocking, TestDDLHaveWaitStartTimeSet)' ``` Reviewers: rthallam, sergei, pjain, patnaik.balivada Reviewed By: pjain, patnaik.balivada Subscribers: yql, ybase Differential Revision: https://phorge.dev.yugabyte.com/D47969
1 parent cb00394 commit c96f7dd

File tree

3 files changed

+37
-4
lines changed

3 files changed

+37
-4
lines changed

src/yb/tserver/pg_client_session.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3082,7 +3082,7 @@ class PgClientSession::Impl {
30823082
EnsureSession(kind, deadline);
30833083
RETURN_NOT_OK(GetDdlTransactionMetadata(
30843084
true /* use_transaction */, false /* use_regular_transaction_block */, deadline,
3085-
options.priority()));
3085+
options.priority(), options.pg_txn_start_us()));
30863086
} else {
30873087
DCHECK(kind == PgClientSessionKind::kPlain);
30883088
auto& session = EnsureSession(kind, deadline);
@@ -3332,7 +3332,8 @@ class PgClientSession::Impl {
33323332
// All DDLs use kHighestPriority unless specified otherwise.
33333333
Result<const TransactionMetadata*> GetDdlTransactionMetadata(
33343334
bool use_transaction, bool use_regular_transaction_block, CoarseTimePoint deadline,
3335-
uint64_t priority = kHighPriTxnUpperBound) {
3335+
uint64_t priority = kHighPriTxnUpperBound,
3336+
uint64_t pg_txn_start_us = 0) {
33363337
if (!use_transaction) {
33373338
return nullptr;
33383339
}
@@ -3360,6 +3361,8 @@ class PgClientSession::Impl {
33603361
const auto isolation = FLAGS_ysql_serializable_isolation_for_ddl_txn
33613362
? IsolationLevel::SERIALIZABLE_ISOLATION : IsolationLevel::SNAPSHOT_ISOLATION;
33623363
txn = transaction_provider_.Take<PgClientSessionKind::kDdl>(deadline);
3364+
RETURN_NOT_OK(txn->SetPgTxnStart(
3365+
pg_txn_start_us ? pg_txn_start_us : MonoTime::Now().ToUint64()));
33633366
RETURN_NOT_OK(txn->Init(isolation));
33643367
txn->SetPriority(priority);
33653368
txn->SetLogPrefixTag(kTxnLogPrefixTag, id_);

src/yb/yql/pggate/pg_txn_manager.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,7 @@ Status PgTxnManager::SetupPerformOptions(
744744
options->set_txn_serial_no(serial_no_.txn());
745745
options->set_active_sub_transaction_id(active_sub_transaction_id_);
746746
options->set_xcluster_target_ddl_bypass(yb_xcluster_target_ddl_bypass);
747+
options->set_pg_txn_start_us(pg_txn_start_us_);
747748

748749
if (use_saved_priority_) {
749750
options->set_use_existing_priority(true);
@@ -770,8 +771,6 @@ Status PgTxnManager::SetupPerformOptions(
770771
options->set_read_time_manipulation(
771772
GetActualReadTimeManipulator(isolation_level_, read_time_manipulation_, read_time_action));
772773
read_time_manipulation_ = tserver::ReadTimeManipulation::NONE;
773-
// pg_txn_start_us is similarly only used for kPlain transactions.
774-
options->set_pg_txn_start_us(pg_txn_start_us_);
775774
// Only clamp read-only txns/stmts.
776775
// Do not clamp in the serializable case since
777776
// - SERIALIZABLE reads do not pick read time until later.

src/yb/yql/pgwrapper/pg_wait_on_conflict-test.cc

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1880,5 +1880,36 @@ TEST_F(PgWaitQueuesWithRetriesTest, TestDeadlockRetries) {
18801880
thread_holder.WaitAndStop(30s * kTimeMultiplier);
18811881
}
18821882

1883+
#ifndef NDEBUG
1884+
TEST_F(PgWaitQueuesTestWithoutObjectLocking, TestDDLHaveWaitStartTimeSet) {
1885+
constexpr auto kDbName = "colodb";
1886+
auto setup_conn = ASSERT_RESULT(Connect());
1887+
ASSERT_OK(setup_conn.ExecuteFormat("CREATE DATABASE $0 with COLOCATION = true", kDbName));
1888+
1889+
auto conn = ASSERT_RESULT(ConnectToDB(kDbName));
1890+
ASSERT_OK(conn.Execute(
1891+
"CREATE TABLE test (k INT, v INT, PRIMARY KEY (k ASC))"));
1892+
ASSERT_OK(conn.Execute("CREATE UNIQUE INDEX idx_1 ON test (v ASC)"));
1893+
ASSERT_OK(conn.StartTransaction(IsolationLevel::SNAPSHOT_ISOLATION));
1894+
ASSERT_OK(conn.Execute("INSERT INTO test VALUES (1, -1)"));
1895+
1896+
yb::SyncPoint::GetInstance()->LoadDependency({
1897+
{"WaitQueue::Impl::SetupWaiterUnlocked:1", "TestDDLHaveWaitStartTimeSet"}});
1898+
yb::SyncPoint::GetInstance()->ClearTrace();
1899+
yb::SyncPoint::GetInstance()->EnableProcessing();
1900+
1901+
TestThreadHolder thread_holder;
1902+
thread_holder.AddThreadFunctor([this, &stop = thread_holder.stop_flag()] {
1903+
auto conn = ASSERT_RESULT(ConnectToDB(kDbName));
1904+
ASSERT_OK(conn.Execute("SET yb_max_query_layer_retries=10"));
1905+
ASSERT_OK(conn.Execute("DROP INDEX idx_1"));
1906+
});
1907+
DEBUG_ONLY_TEST_SYNC_POINT("TestDDLHaveWaitStartTimeSet");
1908+
ASSERT_OK(conn.CommitTransaction());
1909+
1910+
thread_holder.WaitAndStop(10s * kTimeMultiplier);
1911+
}
1912+
#endif
1913+
18831914
} // namespace pgwrapper
18841915
} // namespace yb

0 commit comments

Comments
 (0)