Skip to content

Commit 01a6423

Browse files
authored
[enhance](test) Add injection points to block FE/BE (#59165)
1 parent 6210dec commit 01a6423

File tree

7 files changed

+51
-1
lines changed

7 files changed

+51
-1
lines changed

be/src/cloud/cloud_stream_load_executor.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ Status CloudStreamLoadExecutor::operate_txn_2pc(StreamLoadContext* ctx) {
104104

105105
Status CloudStreamLoadExecutor::commit_txn(StreamLoadContext* ctx) {
106106
DBUG_EXECUTE_IF("StreamLoadExecutor.commit_txn.block", DBUG_BLOCK);
107+
DBUG_EXECUTE_IF("StreamLoadExecutor.commit_txn.crash", {
108+
LOG(INFO) << "debug point " << DP_NAME << " trigger crash";
109+
volatile int* p = nullptr;
110+
*p = 1;
111+
});
107112
// forward to fe to excute commit transaction for MoW table
108113
if (ctx->is_mow_table() || !config::enable_stream_load_commit_txn_on_be ||
109114
ctx->load_type == TLoadType::ROUTINE_LOAD) {

be/src/cloud/cloud_tablet_mgr.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "common/status.h"
3030
#include "olap/lru_cache.h"
3131
#include "runtime/memory/cache_policy.h"
32+
#include "util/debug_points.h"
3233
#include "util/stack_util.h"
3334

3435
namespace doris {
@@ -165,6 +166,7 @@ Result<std::shared_ptr<CloudTablet>> CloudTabletMgr::get_tablet(int64_t tablet_i
165166
SyncRowsetStats* sync_stats,
166167
bool force_use_only_cached,
167168
bool cache_on_miss) {
169+
DBUG_EXECUTE_IF("CloudTabletMgr::get_tablet.block", DBUG_BLOCK);
168170
// LRU value type. `Value`'s lifetime MUST NOT be longer than `CloudTabletMgr`
169171
class Value : public LRUCacheValueBase {
170172
public:

be/src/cloud/injection_point_action.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,17 @@ void set_return_error(const std::string& point, HttpRequest* req) {
279279
HttpChannel::send_reply(req, HttpStatus::OK, "OK");
280280
}
281281

282+
void set_segfault(const std::string& point, HttpRequest* req) {
283+
auto sp = SyncPoint::get_instance();
284+
sp->set_call_back(point, [point](auto&&) {
285+
LOG(INFO) << "injection point hit, point=" << point << " trigger segfault";
286+
// Intentional null dereference to crash the BE for testing.
287+
volatile int* p = nullptr;
288+
*p = 1;
289+
});
290+
HttpChannel::send_reply(req, HttpStatus::OK, "OK");
291+
}
292+
282293
void handle_set(HttpRequest* req) {
283294
auto& point = req->param("name");
284295
if (point.empty()) {
@@ -302,6 +313,9 @@ void handle_set(HttpRequest* req) {
302313
} else if (behavior == "return_error") {
303314
set_return_error(point, req);
304315
return;
316+
} else if (behavior == "segfault") {
317+
set_segfault(point, req);
318+
return;
305319
}
306320
HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, "unknown behavior: " + behavior);
307321
}
@@ -377,13 +391,15 @@ InjectionPointAction::InjectionPointAction() = default;
377391
// which is an int, valid values can be found in status.h, e.g. -235 or -230,
378392
// if `code` is not present return Status::InternalError. Optional `probability`
379393
// determines the percentage of times to inject the error (default 100).
394+
// * segfault: dereference a null pointer to crash BE intentionally
380395
// ```
381396
// curl "be_ip:http_port/api/injection_point/set?name=${injection_point_name}&behavior=sleep&duration=${x_millsec}" # sleep x millisecs
382397
// curl "be_ip:http_port/api/injection_point/set?name=${injection_point_name}&behavior=return" # return void
383398
// curl "be_ip:http_port/api/injection_point/set?name=${injection_point_name}&behavior=return_ok" # return ok
384399
// curl "be_ip:http_port/api/injection_point/set?name=${injection_point_name}&behavior=return_error" # internal error
385400
// curl "be_ip:http_port/api/injection_point/set?name=${injection_point_name}&behavior=return_error&code=${code}" # -235
386401
// curl "be_ip:http_port/api/injection_point/set?name=${injection_point_name}&behavior=return_error&code=${code}&probability=50" # inject with 50% probability
402+
// curl "be_ip:http_port/api/injection_point/set?name=${injection_point_name}&behavior=segfault" # crash BE
387403
// ```
388404
void InjectionPointAction::handle(HttpRequest* req) {
389405
LOG(INFO) << "handle InjectionPointAction " << req->debug_string();

fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
import org.apache.doris.common.publish.TopicPublisherThread;
6969
import org.apache.doris.common.publish.WorkloadGroupPublisher;
7070
import org.apache.doris.common.util.Daemon;
71+
import org.apache.doris.common.util.DebugPointUtil;
7172
import org.apache.doris.common.util.DynamicPartitionUtil;
7273
import org.apache.doris.common.util.HttpURLUtil;
7374
import org.apache.doris.common.util.MasterDaemon;
@@ -1093,6 +1094,29 @@ private void unlock() {
10931094
}
10941095
}
10951096

1097+
// Block the caller while holding the global env lock when the given debug point is enabled.
1098+
// Used to simulate a stuck import path that drags other operations waiting on the same lock.
1099+
public void debugBlockAllOnGlobalLock(String debugPointName) {
1100+
if (!DebugPointUtil.isEnable(debugPointName)) {
1101+
return;
1102+
}
1103+
try {
1104+
lock.lock();
1105+
LOG.info("debug point {} enabled, block and hold env lock", debugPointName);
1106+
while (DebugPointUtil.isEnable(debugPointName)) {
1107+
Thread.sleep(1000);
1108+
}
1109+
LOG.info("debug point {} cleared, release env lock", debugPointName);
1110+
} catch (InterruptedException e) {
1111+
LOG.warn("debug point {} interrupted while blocking env lock", debugPointName);
1112+
Thread.currentThread().interrupt();
1113+
} finally {
1114+
if (lock.isHeldByCurrentThread()) {
1115+
lock.unlock();
1116+
}
1117+
}
1118+
}
1119+
10961120
public String getBdbDir() {
10971121
return bdbDir;
10981122
}
@@ -7389,4 +7413,3 @@ protected void checkClusterSnapshot(File dir) {}
73897413

73907414
protected void cloneClusterSnapshot() throws Exception {}
73917415
}
7392-

fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,7 @@ private void commitTransactionWithoutLock(long dbId, List<Table> tableList, long
597597
throw new TransactionCommitFailedException(
598598
"disable_load_job is set to true, all load jobs are not allowed");
599599
}
600+
Env.getCurrentEnv().debugBlockAllOnGlobalLock("FE.BLOCK_IMPORT_LOCK");
600601

601602
if (!mowTableList.isEmpty()) {
602603
List<Long> mowTableIds = mowTableList.stream().map(Table::getId).collect(Collectors.toList());

fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/MetricsAction.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.doris.httpv2.rest;
1919

20+
import org.apache.doris.catalog.Env;
2021
import org.apache.doris.common.Config;
2122
import org.apache.doris.metric.JsonMetricVisitor;
2223
import org.apache.doris.metric.MetricRepo;
@@ -46,6 +47,7 @@ public void execute(HttpServletRequest request, HttpServletResponse response) {
4647
if (Config.enable_all_http_auth) {
4748
executeCheckPassword(request, response);
4849
}
50+
Env.getCurrentEnv().debugBlockAllOnGlobalLock("FE.BLOCK_IMPORT_LOCK");
4951

5052
String type = request.getParameter(TYPE_PARAM);
5153
MetricVisitor visitor = null;

fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,7 @@ private boolean checkTransactionStateBeforeCommit(Database db, List<Table> table
774774
public void commitTransaction(List<Table> tableList, long transactionId, List<TabletCommitInfo> tabletCommitInfos,
775775
TxnCommitAttachment txnCommitAttachment, Boolean is2PC)
776776
throws UserException {
777+
env.debugBlockAllOnGlobalLock("FE.BLOCK_IMPORT_LOCK");
777778
// check status
778779
// the caller method already own tables' write lock
779780
Database db = env.getInternalCatalog().getDbOrMetaException(dbId);

0 commit comments

Comments
 (0)