Skip to content

Commit 2555770

Browse files
authored
Allow to configure the backtrace tracing for failed allocation (#13376)
1 parent 1edb570 commit 2555770

File tree

9 files changed

+57
-33
lines changed

9 files changed

+57
-33
lines changed

ydb/core/kqp/compute_actor/kqp_compute_actor_factory.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,7 @@ struct TMemoryQuotaManager : public NYql::NDq::TGuaranteeQuotaManager {
5959
}
6060

6161
TString MemoryConsumptionDetails() const override {
62-
// NOTE: don't forget to disable verbosity in stable branches.
63-
return Tx->ToString(true);
62+
return Tx->ToString();
6463
}
6564

6665
void TerminateHandler(bool success, const NYql::TIssues& issues) {

ydb/core/kqp/executer_actor/kqp_data_executer.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ class TKqpDataExecuter : public TKqpExecuterBase<TKqpDataExecuter, EExecType::Da
145145
, AllowOlapDataQuery(tableServiceConfig.GetAllowOlapDataQuery())
146146
, BlockTrackingMode(tableServiceConfig.GetBlockTrackingMode())
147147
, WaitCAStatsTimeout(TDuration::MilliSeconds(tableServiceConfig.GetQueryLimits().GetWaitCAStatsTimeoutMs()))
148+
, VerboseMemoryLimitException(tableServiceConfig.GetResourceManager().GetVerboseMemoryLimitException())
148149
{
149150
if (tableServiceConfig.HasArrayBufferMinFillPercentage()) {
150151
ArrayBufferMinFillPercentage = tableServiceConfig.GetArrayBufferMinFillPercentage();
@@ -2721,6 +2722,7 @@ class TKqpDataExecuter : public TKqpExecuterBase<TKqpDataExecuter, EExecType::Da
27212722
.CaFactory_ = Request.CaFactory_,
27222723
.BlockTrackingMode = BlockTrackingMode,
27232724
.ArrayBufferMinFillPercentage = ArrayBufferMinFillPercentage,
2725+
.VerboseMemoryLimitException = VerboseMemoryLimitException,
27242726
});
27252727

27262728
auto err = Planner->PlanExecution();
@@ -3045,6 +3047,7 @@ class TKqpDataExecuter : public TKqpExecuterBase<TKqpDataExecuter, EExecType::Da
30453047
const NKikimrConfig::TTableServiceConfig::EBlockTrackingMode BlockTrackingMode;
30463048
const TDuration WaitCAStatsTimeout;
30473049
TMaybe<ui8> ArrayBufferMinFillPercentage;
3050+
const bool VerboseMemoryLimitException;
30483051
};
30493052

30503053
} // namespace

ydb/core/kqp/executer_actor/kqp_planner.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ TKqpPlanner::TKqpPlanner(TKqpPlanner::TArgs&& args)
112112
, CaFactory_(args.CaFactory_)
113113
, BlockTrackingMode(args.BlockTrackingMode)
114114
, ArrayBufferMinFillPercentage(args.ArrayBufferMinFillPercentage)
115+
, VerboseMemoryLimitException(args.VerboseMemoryLimitException)
115116
{
116117
if (GUCSettings) {
117118
SerializedGUCSettings = GUCSettings->SerializeToString();
@@ -479,7 +480,7 @@ TString TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize)
479480

480481
TxInfo = MakeIntrusive<NRm::TTxState>(
481482
TxId, TInstant::Now(), ResourceManager_->GetCounters(),
482-
UserRequestContext->PoolId, memoryPoolPercent, Database);
483+
UserRequestContext->PoolId, memoryPoolPercent, Database, VerboseMemoryLimitException);
483484
}
484485

485486
if (ArrayBufferMinFillPercentage) {

ydb/core/kqp/executer_actor/kqp_planner.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class TKqpPlanner {
7171
const std::shared_ptr<NKikimr::NKqp::NComputeActor::IKqpNodeComputeActorFactory>& CaFactory_;
7272
const NKikimrConfig::TTableServiceConfig::EBlockTrackingMode BlockTrackingMode;
7373
const TMaybe<ui8> ArrayBufferMinFillPercentage;
74+
const bool VerboseMemoryLimitException;
7475
};
7576

7677
TKqpPlanner(TKqpPlanner::TArgs&& args);
@@ -150,6 +151,7 @@ class TKqpPlanner {
150151
TVector<TProgressStat> LastStats;
151152
const NKikimrConfig::TTableServiceConfig::EBlockTrackingMode BlockTrackingMode;
152153
const TMaybe<ui8> ArrayBufferMinFillPercentage;
154+
const bool VerboseMemoryLimitException;
153155

154156
public:
155157
static bool UseMockEmptyPlanner; // for tests: if true then use TKqpMockEmptyPlanner that leads to the error

ydb/core/kqp/executer_actor/kqp_scan_executer.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,9 @@ class TKqpScanExecuter : public TKqpExecuterBase<TKqpScanExecuter, EExecType::Sc
314314
.GUCSettings = nullptr,
315315
.MayRunTasksLocally = false,
316316
.ResourceManager_ = Request.ResourceManager_,
317-
.CaFactory_ = Request.CaFactory_
317+
.CaFactory_ = Request.CaFactory_,
318318
// TODO: BlockTrackingMode is not set!
319+
.VerboseMemoryLimitException = false,
319320
});
320321

321322
LOG_D("Execute scan tx, PendingComputeTasks: " << TasksGraph.GetTasks().size());

ydb/core/kqp/node_service/kqp_node_service.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
236236
TIntrusivePtr<NRm::TTxState> txInfo = MakeIntrusive<NRm::TTxState>(
237237
txId, TInstant::Now(), ResourceManager_->GetCounters(),
238238
msg.GetSchedulerGroup(), msg.GetMemoryPoolPercent(),
239-
msg.GetDatabase());
239+
msg.GetDatabase(), Config.GetVerboseMemoryLimitException());
240240

241241
const ui32 tasksCount = msg.GetTasks().size();
242242
for (auto& dqTask: *msg.MutableTasks()) {

ydb/core/kqp/rm_service/kqp_rm_service.h

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -118,39 +118,48 @@ class TTxState : public TAtomicRefCount<TTxState> {
118118
const TString PoolId;
119119
const double MemoryPoolPercent;
120120
const TString Database;
121+
const bool CollectBacktrace;
121122

122123
private:
123124
std::atomic<ui64> TxScanQueryMemory = 0;
124125
std::atomic<ui64> TxExternalDataQueryMemory = 0;
125126
std::atomic<ui32> TxExecutionUnits = 0;
126-
std::atomic<ui64> TxMaxAllocation = 0;
127-
std::atomic<ui64> TxFailedAllocation = 0;
127+
std::atomic<ui64> TxMaxAllocationSize = 0;
128128

129129
// TODO(ilezhankin): it's better to use std::atomic<std::shared_ptr<>> which is not supported at the moment.
130130
std::atomic<TBackTrace*> TxMaxAllocationBacktrace = nullptr;
131-
std::atomic<TBackTrace*> TxFailedAllocationBacktrace = nullptr;
131+
132+
// NOTE: it's hard to maintain atomic pointer in case of tracking the last failed allocation backtrace,
133+
// because while we try to print one - the new last may emerge and delete previous.
134+
mutable std::mutex BacktraceMutex;
135+
std::atomic<ui64> TxFailedAllocationSize = 0; // protected by BacktraceMutex (only if CollectBacktrace == true)
136+
TBackTrace TxFailedAllocationBacktrace; // protected by BacktraceMutex
137+
std::atomic<bool> HasFailedAllocationBacktrace = false;
132138

133139
public:
134140
explicit TTxState(ui64 txId, TInstant now, TIntrusivePtr<TKqpCounters> counters, const TString& poolId, const double memoryPoolPercent,
135-
const TString& database)
141+
const TString& database, bool collectBacktrace)
136142
: TxId(txId)
137143
, CreatedAt(now)
138144
, Counters(std::move(counters))
139145
, PoolId(poolId)
140146
, MemoryPoolPercent(memoryPoolPercent)
141147
, Database(database)
148+
, CollectBacktrace(collectBacktrace)
142149
{}
143150

144151
~TTxState() {
145152
delete TxMaxAllocationBacktrace.load();
146-
delete TxFailedAllocationBacktrace.load();
147153
}
148154

149155
std::pair<TString, TString> MakePoolId() const {
150156
return std::make_pair(Database, PoolId);
151157
}
152158

153-
TString ToString(bool verbose = false) const {
159+
TString ToString() const {
160+
// use unique_lock to safely unlock mutex in case of exceptions
161+
std::unique_lock backtraceLock(BacktraceMutex, std::defer_lock);
162+
154163
auto res = TStringBuilder() << "TxResourcesInfo { "
155164
<< "TxId: " << TxId
156165
<< ", Database: " << Database;
@@ -160,19 +169,28 @@ class TTxState : public TAtomicRefCount<TTxState> {
160169
<< ", MemoryPoolPercent: " << Sprintf("%.2f", MemoryPoolPercent > 0 ? MemoryPoolPercent : 100);
161170
}
162171

172+
if (CollectBacktrace) {
173+
backtraceLock.lock();
174+
}
175+
163176
res << ", tx initially granted memory: " << HumanReadableSize(TxExternalDataQueryMemory.load(), SF_BYTES)
164177
<< ", tx total memory allocations: " << HumanReadableSize(TxScanQueryMemory.load(), SF_BYTES)
165-
<< ", tx largest successful memory allocation: " << HumanReadableSize(TxMaxAllocation.load(), SF_BYTES)
166-
<< ", tx last failed memory allocation: " << HumanReadableSize(TxFailedAllocation.load(), SF_BYTES)
178+
<< ", tx largest successful memory allocation: " << HumanReadableSize(TxMaxAllocationSize.load(), SF_BYTES)
179+
<< ", tx last failed memory allocation: " << HumanReadableSize(TxFailedAllocationSize.load(), SF_BYTES)
167180
<< ", tx total execution units: " << TxExecutionUnits.load()
168181
<< ", started at: " << CreatedAt
169182
<< " }" << Endl;
170183

171-
if (verbose && TxMaxAllocationBacktrace.load()) {
172-
res << "TxMaxAllocationBacktrace:" << Endl << TxMaxAllocationBacktrace.load()->PrintToString();
184+
if (CollectBacktrace && HasFailedAllocationBacktrace.load()) {
185+
res << "TxFailedAllocationBacktrace:" << Endl << TxFailedAllocationBacktrace.PrintToString();
186+
}
187+
188+
if (CollectBacktrace) {
189+
backtraceLock.unlock();
173190
}
174-
if (verbose && TxFailedAllocationBacktrace.load()) {
175-
res << "TxFailedAllocationBacktrace:" << Endl << TxFailedAllocationBacktrace.load()->PrintToString();
191+
192+
if (CollectBacktrace && TxMaxAllocationBacktrace.load()) {
193+
res << "TxMaxAllocationBacktrace:" << Endl << TxMaxAllocationBacktrace.load()->PrintToString();
176194
}
177195

178196
return res;
@@ -183,22 +201,19 @@ class TTxState : public TAtomicRefCount<TTxState> {
183201
}
184202

185203
void AckFailedMemoryAlloc(ui64 memory) {
186-
auto* oldBacktrace = TxFailedAllocationBacktrace.load();
187-
ui64 lastAlloc = TxFailedAllocation.load();
188-
bool exchanged = false;
204+
// use unique_lock to safely unlock mutex in case of exceptions
205+
std::unique_lock backtraceLock(BacktraceMutex, std::defer_lock);
189206

190-
while(!exchanged) {
191-
exchanged = TxFailedAllocation.compare_exchange_weak(lastAlloc, memory);
207+
if (CollectBacktrace) {
208+
backtraceLock.lock();
192209
}
193210

194-
if (exchanged) {
195-
auto* newBacktrace = new TBackTrace();
196-
newBacktrace->Capture();
197-
if (TxFailedAllocationBacktrace.compare_exchange_strong(oldBacktrace, newBacktrace)) {
198-
delete oldBacktrace;
199-
} else {
200-
delete newBacktrace;
201-
}
211+
TxFailedAllocationSize = memory;
212+
213+
if (CollectBacktrace) {
214+
TxFailedAllocationBacktrace.Capture();
215+
HasFailedAllocationBacktrace = true;
216+
backtraceLock.unlock();
202217
}
203218
}
204219

@@ -239,17 +254,18 @@ class TTxState : public TAtomicRefCount<TTxState> {
239254
}
240255

241256
auto* oldBacktrace = TxMaxAllocationBacktrace.load();
242-
ui64 maxAlloc = TxMaxAllocation.load();
257+
ui64 maxAlloc = TxMaxAllocationSize.load();
243258
bool exchanged = false;
244259

245260
while(maxAlloc < resources.Memory && !exchanged) {
246-
exchanged = TxMaxAllocation.compare_exchange_weak(maxAlloc, resources.Memory);
261+
exchanged = TxMaxAllocationSize.compare_exchange_weak(maxAlloc, resources.Memory);
247262
}
248263

249264
if (exchanged) {
250265
auto* newBacktrace = new TBackTrace();
251266
newBacktrace->Capture();
252267
if (TxMaxAllocationBacktrace.compare_exchange_strong(oldBacktrace, newBacktrace)) {
268+
// XXX(ilezhankin): technically it's possible to have a race with `ToString()`, but it's very unlikely.
253269
delete oldBacktrace;
254270
} else {
255271
delete newBacktrace;

ydb/core/kqp/rm_service/kqp_rm_ut.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ class KqpRm : public TTestBase {
184184
}
185185

186186
TIntrusivePtr<NRm::TTxState> MakeTx(ui64 txId, std::shared_ptr<NRm::IKqpResourceManager> rm) {
187-
return MakeIntrusive<NRm::TTxState>(txId, TInstant::Now(), rm->GetCounters(), "", (double)100, "");
187+
return MakeIntrusive<NRm::TTxState>(txId, TInstant::Now(), rm->GetCounters(), "", (double)100, "", false);
188188
}
189189

190190
TIntrusivePtr<NRm::TTaskState> MakeTask(ui64 taskId, TIntrusivePtr<NRm::TTxState> tx) {

ydb/core/protos/table_service_config.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ message TTableServiceConfig {
5151
optional uint64 MaxNonParallelTopStageExecutionLimit = 26 [default = 1];
5252
optional bool PreferLocalDatacenterExecution = 27 [ default = true ];
5353
optional uint64 MaxNonParallelDataQueryTasksLimit = 28 [default = 1000];
54+
55+
optional bool VerboseMemoryLimitException = 29 [default = false];
5456
}
5557

5658
message TSpillingServiceConfig {

0 commit comments

Comments
 (0)