Skip to content

Commit 40eaf50

Browse files
fix: zk reconnect in db sdk (#3656)
reconnect if needed in the CheckZk thread
1 parent e7538bd commit 40eaf50

File tree

4 files changed

+48
-18
lines changed

4 files changed

+48
-18
lines changed

src/sdk/db_sdk.cc

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -195,12 +195,19 @@ ClusterSDK::~ClusterSDK() {
195195
}
196196

197197
void ClusterSDK::CheckZk() {
198-
if (session_id_ == 0) {
199-
WatchNotify();
200-
} else if (session_id_ != zk_client_->GetSessionTerm()) {
201-
LOG(WARNING) << "session changed, re-watch notify";
202-
WatchNotify();
198+
// ensure that zk client is alive
199+
if (zk_client_->EnsureConnected()) {
200+
if (session_id_ == 0) {
201+
WatchNotify();
202+
} else if (session_id_ != zk_client_->GetSessionTerm()) {
203+
LOG(WARNING) << "session changed, re-watch notify";
204+
WatchNotify();
205+
}
206+
} else {
207+
// log once per 150 checks (about every 5 min, since CheckZk reschedules itself every 2000 ms)
208+
LOG_EVERY_N(WARNING, 150) << "zk client is not connected, reconnect later";
203209
}
210+
204211
pool_.DelayTask(2000, [this] { CheckZk(); });
205212
}
206213

@@ -383,7 +390,7 @@ bool ClusterSDK::InitTabletClient() {
383390
std::vector<std::string> tablets;
384391
bool ok = zk_client_->GetNodes(tablets);
385392
if (!ok) {
386-
LOG(WARNING) << "fail to get tablet";
393+
LOG(WARNING) << "fail to get tablets from zk";
387394
return false;
388395
}
389396
std::map<std::string, std::string> real_ep_map;

src/sdk/db_sdk.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,8 @@ class ClusterSDK : public DBSDK {
174174
std::string globalvar_changed_notify_path_;
175175
std::string leader_path_;
176176
std::string taskmanager_leader_path_;
177-
177+
// CheckZk will be called periodically, so we don't need to check zk_client_ before using it
178+
// if failed, just retry
178179
::openmldb::zk::ZkClient* zk_client_;
179180
::baidu::common::ThreadPool pool_;
180181
};

src/zk/zk_client.cc

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ void NodeWatcher(zhandle_t* zh, int type, int state, const char* path, void* wat
5555
}
5656

5757
void ItemWatcher(zhandle_t* zh, int type, int state, const char* path, void* watcher_ctx) {
58-
PDLOG(INFO, "node watcher with event type %d, state %d", type, state);
58+
PDLOG(INFO, "item watcher with event type %d, state %d", type, state);
5959
if (zoo_get_context(zh)) {
6060
ZkClient* client = const_cast<ZkClient*>(reinterpret_cast<const ZkClient*>(zoo_get_context(zh)));
6161
std::string path_str(path);
@@ -64,8 +64,8 @@ void ItemWatcher(zhandle_t* zh, int type, int state, const char* path, void* wat
6464
}
6565

6666
ZkClient::ZkClient(const std::string& hosts, const std::string& real_endpoint, int32_t session_timeout,
67-
const std::string& endpoint, const std::string& zk_root_path,
68-
const std::string& auth_schema, const std::string& cert)
67+
const std::string& endpoint, const std::string& zk_root_path, const std::string& auth_schema,
68+
const std::string& cert)
6969
: hosts_(hosts),
7070
session_timeout_(session_timeout),
7171
endpoint_(endpoint),
@@ -92,8 +92,8 @@ ZkClient::ZkClient(const std::string& hosts, const std::string& real_endpoint, i
9292
}
9393

9494
ZkClient::ZkClient(const std::string& hosts, int32_t session_timeout, const std::string& endpoint,
95-
const std::string& zk_root_path, const std::string& zone_path,
96-
const std::string& auth_schema, const std::string& cert)
95+
const std::string& zk_root_path, const std::string& zone_path, const std::string& auth_schema,
96+
const std::string& cert)
9797
: hosts_(hosts),
9898
session_timeout_(session_timeout),
9999
endpoint_(endpoint),
@@ -296,8 +296,7 @@ bool ZkClient::CreateNode(const std::string& node, const std::string& value, int
296296
}
297297
uint32_t size = node.size() + 11;
298298
char path_buffer[size]; // NOLINT
299-
int ret =
300-
zoo_create(zk_, node.c_str(), value.c_str(), value.size(), &acl_vector_, flags, path_buffer, size);
299+
int ret = zoo_create(zk_, node.c_str(), value.c_str(), value.size(), &acl_vector_, flags, path_buffer, size);
301300
if (ret == ZOK) {
302301
assigned_path_name.assign(path_buffer, size - 1);
303302
PDLOG(INFO, "create node %s ok and real node name %s", node.c_str(), assigned_path_name.c_str());
@@ -583,8 +582,13 @@ void ZkClient::LogEvent(int type, int state, const char* path) {
583582
if (type == ZOO_SESSION_EVENT) {
584583
if (state == ZOO_CONNECTED_STATE) {
585584
Connected();
585+
} else if (state == ZOO_CONNECTING_STATE || state == ZOO_ASSOCIATING_STATE) {
586+
// just wait
586587
} else if (state == ZOO_EXPIRED_SESSION_STATE) {
587588
connected_ = false;
589+
} else {
590+
// unknown state, should retry
591+
connected_ = false;
588592
}
589593
}
590594
}
@@ -630,5 +634,18 @@ bool ZkClient::Mkdir(const std::string& path) {
630634
return MkdirNoLock(path);
631635
}
632636

637+
bool ZkClient::EnsureConnected() {
638+
if (!IsConnected()) {
639+
LOG(WARNING) << "reconnect zk";
640+
if (Reconnect()) {
641+
LOG(INFO) << "reconnect zk ok";
642+
} else {
643+
LOG(WARNING) << "reconnect zk failed";
644+
return false;
645+
}
646+
}
647+
return true;
648+
}
649+
633650
} // namespace zk
634651
} // namespace openmldb

src/zk/zk_client.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@ class ZkClient {
4646
// session_timeout, the session timeout
4747
// endpoint, the client endpoint
4848
ZkClient(const std::string& hosts, const std::string& real_endpoint, int32_t session_timeout,
49-
const std::string& endpoint, const std::string& zk_root_path,
50-
const std::string& auth_schema, const std::string& cert);
49+
const std::string& endpoint, const std::string& zk_root_path, const std::string& auth_schema,
50+
const std::string& cert);
5151

5252
ZkClient(const std::string& hosts, int32_t session_timeout, const std::string& endpoint,
53-
const std::string& zk_root_path, const std::string& zone_path,
54-
const std::string& auth_schema, const std::string& cert);
53+
const std::string& zk_root_path, const std::string& zone_path, const std::string& auth_schema,
54+
const std::string& cert);
5555
~ZkClient();
5656

5757
// init zookeeper connections
@@ -138,6 +138,11 @@ class ZkClient {
138138
// when reconnect, need Register and Watchnodes again
139139
bool Reconnect();
140140

141+
// ensure that zk client is connected:
142+
// if not, try to reconnect, return false if reconnect failed
143+
// DON'T use zk client if this function returns false
144+
bool EnsureConnected();
145+
141146
private:
142147
void Connected();
143148

0 commit comments

Comments
 (0)