-
Notifications
You must be signed in to change notification settings - Fork 27
RSDK-10842 - shut down cpp modules properly when viam-server is hard killed #448
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
8509daa
82c577a
1c791cc
9e0da78
953b850
b43f522
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -144,11 +144,16 @@ RobotClient::~RobotClient() { | |
|
|
||
| void RobotClient::close() { | ||
| should_refresh_.store(false); | ||
| should_check_connection_.store(false); | ||
|
|
||
| if (refresh_thread_.joinable()) { | ||
| refresh_thread_.join(); | ||
| } | ||
|
|
||
| if (check_connection_thread_.joinable()) { | ||
| check_connection_thread_.join(); | ||
| } | ||
|
|
||
| stop_all(); | ||
|
|
||
| viam_channel_.close(); | ||
|
|
@@ -231,6 +236,62 @@ void RobotClient::refresh_every() { | |
| } | ||
| }; | ||
|
|
||
| /// @brief Body of the connectivity-monitoring thread. | ||
| /// | ||
| /// While `should_check_connection_` is set, probes the server with a `ResourceNames` call (up to | ||
| /// three attempts, sleeping `check_every` seconds before each). If every probe throws, logs the | ||
| /// last error, closes the channel, and makes up to three attempts to redial and refresh, sleeping | ||
| /// `reconnect_every` seconds after each failed attempt. If reconnection fails, calls `close()`. | ||
| /// | ||
| /// A `check_every_interval_` of 0 falls back to `reconnect_every_interval_`; if both are 0 the | ||
| /// flag is cleared and the loop below never runs. | ||
| void RobotClient::check_connection() { | ||
| unsigned int check_every = check_every_interval_; | ||
| const unsigned int reconnect_every = reconnect_every_interval_; | ||
| if (check_every == 0) { | ||
| check_every = reconnect_every; | ||
| } | ||
| if (check_every == 0 && reconnect_every == 0) { | ||
| should_check_connection_.store(false); | ||
| } | ||
| bool connected(true); | ||
| while (should_check_connection_) { | ||
| // Keep the message text rather than a copy of the exception object: copying into a | ||
| // `std::exception` slices the derived type and loses the real `what()` string. | ||
| std::string connection_error; | ||
|
||
| for (int i = 0; i < 3; ++i) { | ||
| try { | ||
| std::this_thread::sleep_for(std::chrono::seconds{check_every}); | ||
| impl::client_helper(impl_, &RobotService::Stub::ResourceNames).invoke([](auto&) { | ||
| return; | ||
| }); | ||
| connected = true; | ||
| break; | ||
| } catch (const std::exception& e) { | ||
| connected = false; | ||
| connection_error = e.what(); | ||
| std::this_thread::sleep_for(std::chrono::milliseconds{100}); | ||
| } | ||
| } | ||
| if (connected) { | ||
| continue; | ||
| } | ||
| const auto* uri = viam_channel_.get_channel_addr(); | ||
| VIAM_SDK_LOG(error) << "Lost connection to machine at address " << uri << " with error " | ||
| << connection_error << ". Attempting to reconnect every " | ||
| << reconnect_every << " second(s)"; | ||
| viam_channel_.close(); | ||
|
|
||
| for (int i = 0; i < 3; ++i) { | ||
| try { | ||
| // NOTE(review): `channel` is local to this scope and `viam_channel_` is not reassigned | ||
| // here -- confirm the new connection stays usable after this block exits. | ||
| auto channel = ViamChannel::dial(uri, {}); | ||
| auto impl = | ||
| std::make_unique<RobotClient::impl>(RobotService::NewStub(channel.channel())); | ||
| impl_.reset(); | ||
| impl_.swap(impl); | ||
|
||
| refresh(); | ||
| connected = true; | ||
| // Reconnected: stop retrying instead of redialing on every remaining iteration. | ||
| break; | ||
| } catch (const std::exception&) { | ||
| viam_channel_.close(); | ||
| std::this_thread::sleep_for(std::chrono::seconds{reconnect_every}); | ||
| } | ||
| } | ||
| if (!connected) { | ||
| // NOTE(review): `close()` joins `check_connection_thread_`, and `with_channel` runs this | ||
| // function on that thread, so this looks like a self-join -- confirm this is intended. | ||
| // NOLINTNEXTLINE | ||
| close(); | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @lia-viam here's somewhere that your C++ expertise would be greatly appreciated!
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the issue calling another thought would be to have a flag variable that says if we've been disconnected, and have
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also maybe cc @acmorrow
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm not sure I entirely understand this, but I'm a little concerned about it. We'd need to at some point reset the value to
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fwiw, the python SDK at this point just calls
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. per offline discussion--this is actually fine bc when the |
||
| } | ||
| } | ||
| } | ||
|
|
||
/// Moves `channel` into `viam_channel_` and creates the `RobotService` stub from that channel.
// NOTE(review): the `impl_` initializer reads `viam_channel_`, so it relies on `viam_channel_`
// being declared before `impl_` in the class -- confirm against the header.
| RobotClient::RobotClient(ViamChannel channel) | ||
| : viam_channel_(std::move(channel)), | ||
| impl_(std::make_unique<impl>(RobotService::NewStub(viam_channel_.channel()))) {} | ||
|
|
@@ -262,8 +323,8 @@ void RobotClient::log(const std::string& name, | |
| ClientContext ctx; | ||
| const auto response = impl_->stub->Log(ctx, req, &resp); | ||
| if (is_error_response(response)) { | ||
| // Manually override to force this to get logged to console so we don't set off an infinite | ||
| // loop | ||
| // Manually override to force this to get logged to console so we don't set off an | ||
| // infinite loop | ||
| VIAM_SDK_LOG(error) << boost::log::add_value(sdk::impl::attr_console_force_type{}, true) | ||
| << "Error sending log message over grpc: " << response.error_message() | ||
| << response.error_details(); | ||
|
|
@@ -279,6 +340,13 @@ std::shared_ptr<RobotClient> RobotClient::with_channel(ViamChannel channel, | |
| robot->refresh_thread_ = std::thread{&RobotClient::refresh_every, robot.get()}; | ||
| } | ||
|
|
||
| robot->should_check_connection_ = true; | ||
|
|
||
| robot->check_every_interval_ = options.check_every_interval(); | ||
| robot->reconnect_every_interval_ = options.reconnect_every_interval(); | ||
|
|
||
| robot->check_connection_thread_ = std::thread{&RobotClient::check_connection, robot.get()}; | ||
|
|
||
| robot->refresh(); | ||
| return robot; | ||
| }; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -187,9 +187,12 @@ ViamChannel ViamChannel::dial(const char* uri, const boost::optional<DialOptions | |
| } | ||
| address += proxy_path; | ||
|
|
||
| return ViamChannel(sdk::impl::create_viam_channel(address, grpc::InsecureChannelCredentials()), | ||
| proxy_path, | ||
| ptr); | ||
| auto chan = | ||
| ViamChannel(sdk::impl::create_viam_channel(address, grpc::InsecureChannelCredentials()), | ||
| proxy_path, | ||
| ptr); | ||
| chan.uri_ = uri; | ||
| return chan; | ||
| } | ||
|
|
||
| const std::shared_ptr<grpc::Channel>& ViamChannel::channel() const { | ||
|
|
@@ -200,6 +203,26 @@ void ViamChannel::close() { | |
| pimpl_.reset(); | ||
| } | ||
|
|
||
/// Returns the stored `uri_` pointer. `dial` assigns the caller's `uri` pointer to `uri_`
/// without copying the string.
// NOTE(review): presumably the caller's string must outlive this channel -- confirm.
| const char* ViamChannel::get_channel_addr() const { | ||
| return uri_; | ||
| } | ||
/// Stores `interval` (seconds) as the connectivity-check interval and returns `*this`.
| Options& Options::set_check_every_interval(unsigned int interval) { | ||
|
||
| check_every_interval_ = interval; | ||
| return *this; | ||
| } | ||
/// Stores `interval` (seconds) as the reconnect-attempt interval and returns `*this`.
| Options& Options::set_reconnect_every_interval(unsigned int interval) { | ||
| reconnect_every_interval_ = interval; | ||
| return *this; | ||
| } | ||
|
|
||
/// Returns the configured connectivity-check interval, in seconds.
| unsigned int Options::check_every_interval() const { | ||
| return check_every_interval_; | ||
| } | ||
|
|
||
/// Returns the configured reconnect-attempt interval, in seconds.
| unsigned int Options::reconnect_every_interval() const { | ||
| return reconnect_every_interval_; | ||
| } | ||
|
|
||
/// Returns the configured refresh interval, in seconds.
| unsigned int Options::refresh_interval() const { | ||
| return refresh_interval_; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,7 +45,10 @@ class ViamChannel { | |
|
|
||
| void close(); | ||
|
|
||
| const char* get_channel_addr() const; | ||
|
|
||
| private: | ||
| const char* uri_; | ||
| struct impl; | ||
|
|
||
| std::shared_ptr<GrpcChannel> channel_; | ||
|
|
@@ -117,12 +120,32 @@ class Options { | |
| : refresh_interval_(std::move(refresh_interval)), dial_options_(std::move(dial_options)) {} | ||
|
|
||
| unsigned int refresh_interval() const; | ||
| unsigned int check_every_interval() const; | ||
| unsigned int reconnect_every_interval() const; | ||
|
|
||
| /// @brief Sets the frequency (in seconds) to verify connectivity | ||
| Options& set_check_every_interval(unsigned int interval); | ||
|
|
||
| /// @brief Sets the frequency (in seconds) to attempt to reconnect when connectivity is lost | ||
| Options& set_reconnect_every_interval(unsigned int interval); | ||
| const boost::optional<DialOptions>& dial_options() const; | ||
|
|
||
| private: | ||
| /// @brief How often to refresh the status/parts of the robot, in seconds. If set to 0, the | ||
| /// robot will not automatically refresh. | ||
| unsigned int refresh_interval_; | ||
|
|
||
| /// @brief How often to verify connectivity to the robot, in seconds. If set to 0, falls back | ||
| /// to the `reconnect_every_interval_` value; if both are 0, connectivity is not checked. | ||
| /// Defaults to 0. | ||
| /// @note Setting to a non-zero value is useful in modules but may result in delays shutting | ||
| /// down client code | ||
| unsigned int check_every_interval_ = 0; | ||
|
||
|
|
||
| /// @brief How often to attempt to reconnect to the robot when disconnected. If set to 0, | ||
| /// will not attempt to reconnect. Defaults to 0. | ||
| /// @note Setting to a non-zero value is useful in modules but may result in delays shutting | ||
| /// down client code | ||
| unsigned int reconnect_every_interval_ = 0; | ||
|
||
| boost::optional<DialOptions> dial_options_; | ||
| }; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@lia-viam another ignorant C++ question,
should_check_connection_ is set to false when we want to shut down (makes sense!) but also above when check_every and reconnect_every are both set to zero. In that case this thread will return almost immediately, but the client code could keep running for an indefinite amount of time. Are there any safety concerns here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unless I'm missing something, that should be fine--the thread would return and we would just have the same behavior as before this PR.