Skip to content

Commit d51bbf4

Browse files
trondndaverigby
authored andcommitted
MB-46017: Refactor: Use ProcessMonitor from platform
The functionality in ParentMonitor was moved into a more generic library function ProcessMonitor and moved to platform so that we can reuse the functionality elsewhere (to spawn and monitor memcached process in testapp and cluster_test). Extend cluster_test to detect when the underlying memcached process crash, and if it takes more than a minute to run (and dump the last 8k of the log file if it does). To try to give more information on why the program hangs every now and then on windows. Change-Id: I48e8b51d852099ccc82b09fbbd498e1b4b1fd0a0 Reviewed-on: http://review.couchbase.org/c/kv_engine/+/154054 Tested-by: Trond Norbye <[email protected]> Reviewed-by: Dave Rigby <[email protected]>
1 parent ac6e9fc commit d51bbf4

File tree

10 files changed

+128
-433
lines changed

10 files changed

+128
-433
lines changed

cluster_framework/node.cc

Lines changed: 63 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,10 @@
1313
#include <folly/portability/Unistd.h>
1414
#include <nlohmann/json.hpp>
1515
#include <platform/dirutils.h>
16+
#include <platform/process_monitor.h>
1617
#include <platform/strerror.h>
1718
#include <protocol/connection/client_connection_map.h>
18-
#ifndef WIN32
19-
#include <sys/wait.h>
20-
#include <csignal>
21-
#endif
19+
#include <atomic>
2220
#include <chrono>
2321
#include <fstream>
2422
#include <iostream>
@@ -49,6 +47,8 @@ class NodeImpl : public Node {
4947
nlohmann::json config;
5048
ConnectionMap connectionMap;
5149
const std::string id;
50+
std::unique_ptr<ProcessMonitor> child;
51+
std::atomic_bool allow_child_death{false};
5252
};
5353

5454
NodeImpl::NodeImpl(boost::filesystem::path directory, std::string id)
@@ -59,10 +59,11 @@ NodeImpl::NodeImpl(boost::filesystem::path directory, std::string id)
5959
const auto errmaps =
6060
source_root / "etc" / "couchbase" / "kv" / "error_maps";
6161
const auto rbac = source_root / "tests" / "testapp_cluster" / "rbac.json";
62-
const auto log_filename = NodeImpl::directory / "memcached_log";
62+
const auto log_filename = NodeImpl::directory / "log" / "memcached_log";
6363
const auto portnumber_file = NodeImpl::directory / "memcached.ports.json";
6464
const auto minidump_dir = NodeImpl::directory / "crash";
6565
create_directories(minidump_dir);
66+
create_directories(log_filename.parent_path());
6667

6768
config = {{"max_connections", 1000},
6869
{"system_connections", 250},
@@ -104,108 +105,71 @@ NodeImpl::NodeImpl(boost::filesystem::path directory, std::string id)
104105
}
105106

106107
void NodeImpl::startMemcachedServer() {
107-
#ifdef WIN32
108-
STARTUPINFO sinfo{};
109-
PROCESS_INFORMATION pinfo{};
110-
sinfo.cb = sizeof(sinfo);
111-
112-
char commandline[1024];
113-
sprintf(commandline,
114-
"memcached.exe -C %s",
115-
configfile.generic_string().c_str());
116-
if (!CreateProcess("memcached.exe", // lpApplicationName
117-
commandline, // lpCommandLine
118-
nullptr, // lpProcessAttributes
119-
nullptr, // lpThreadAttributes
120-
false, // bInheritHandles
121-
0, // dwCreationFlags
122-
nullptr, // lpEnvironment
123-
nullptr, // lpCurrentDirectory
124-
&sinfo, // lpStartupInfo
125-
&pinfo)) { // lpProcessInfoqrmation
126-
throw std::system_error(GetLastError(),
127-
std::system_category(),
128-
"Failed to execute memcached");
129-
}
130-
131-
child = pinfo.hProcess;
132-
#else
133-
child = fork();
134-
if (child == -1) {
135-
throw std::system_error(
136-
errno, std::system_category(), "Failed to start client");
108+
boost::filesystem::path exe{boost::filesystem::current_path() /
109+
"memcached"};
110+
exe = exe.generic_path();
111+
if (!boost::filesystem::exists(exe)) {
112+
exe = exe.generic_string() + ".exe";
113+
if (!boost::filesystem::exists(exe)) {
114+
throw std::runtime_error(
115+
"NodeImpl::startMemcachedServer(): Failed to locate "
116+
"memcached");
117+
}
137118
}
138-
139-
if (child == 0) {
140-
std::string binary(OBJECT_ROOT);
141-
const auto memcached_json = configfile.generic_string();
142-
binary.append("/memcached");
143-
144-
std::vector<const char*> argv;
145-
argv.emplace_back(binary.c_str());
146-
argv.emplace_back("-C");
147-
argv.emplace_back(memcached_json.c_str());
148-
argv.emplace_back(nullptr);
149-
execvp(argv[0], const_cast<char**>(argv.data()));
150-
throw std::system_error(
151-
errno, std::system_category(), "Failed to execute memcached");
119+
std::vector<std::string> argv = {
120+
{exe.generic_string(), "-C", configfile.generic_string()}};
121+
child = ProcessMonitor::create(argv, [this](const auto& ec) {
122+
if (!allow_child_death) {
123+
std::cerr << "memcached process on " << directory.generic_string()
124+
<< " terminated: " << ec.to_string() << std::endl;
125+
std::cerr << "Terminating process" << std::endl;
126+
std::_Exit(EXIT_FAILURE);
127+
}
128+
});
129+
130+
if (getenv("MEMCACHED_UNIT_TESTS")) {
131+
// The test _SHOULD_ complete under 60 seconds..
132+
child->setTimeoutHander(std::chrono::seconds{60}, [this]() {
133+
static std::mutex mutex;
134+
std::lock_guard<std::mutex> guard(mutex);
135+
std::cerr << "memcached process on " << directory.generic_string()
136+
<< " might be stuck." << std::endl;
137+
138+
// We've set the cycle size to be 200M so we should expect
139+
// only a single log file (but for simplicity just iterate
140+
// over them all and print the last 8k of each file
141+
std::cerr << "Last 8k of the log files" << std::endl
142+
<< "========================" << std::endl;
143+
for (const auto& p :
144+
boost::filesystem::directory_iterator(directory / "log")) {
145+
if (is_regular_file(p)) {
146+
auto content = cb::io::loadFile(p.path().generic_string());
147+
if (content.size() > 8192) {
148+
content = content.substr(
149+
content.find('\n', content.size() - 8192));
150+
}
151+
std::cerr << p.path().generic_string() << std::endl
152+
<< content << std::endl
153+
<< "-----------------------------" << std::endl;
154+
}
155+
}
156+
// Wait 2 secs before terminating so that the other nodes also
157+
// may have their timouts fire and dump the logs..
158+
std::this_thread::sleep_for(std::chrono::seconds{2});
159+
std::exit(EXIT_FAILURE);
160+
});
152161
}
153-
#endif
154-
155162
// wait and read the portnumber file
156163
parsePortnumberFile();
157164
}
158165

159166
NodeImpl::~NodeImpl() {
160-
if (isRunning()) {
161-
#ifdef WIN32
162-
// @todo This should be made a bit more robust
163-
if (!TerminateProcess(child, 0)) {
164-
std::cerr << "TerminateProcess failed!" << std::endl;
165-
std::cerr.flush();
166-
_exit(EXIT_FAILURE);
167-
}
168-
DWORD status;
169-
if ((status = WaitForSingleObject(child, 60000)) != WAIT_OBJECT_0) {
170-
std::cerr << "Unexpected return value from WaitForSingleObject: "
171-
<< status << std::endl;
172-
std::cerr.flush();
173-
_exit(EXIT_FAILURE);
174-
}
175-
if (!GetExitCodeProcess(child, &status)) {
176-
std::cerr << "GetExitCodeProcess failed: " << GetLastError()
177-
<< std::endl;
178-
std::cerr.flush();
179-
_exit(EXIT_FAILURE);
180-
}
181-
#else
182-
// Start by giving it a slow and easy start...
183-
const auto timeout =
184-
std::chrono::steady_clock::now() + std::chrono::seconds(15);
185-
kill(child, SIGTERM);
186-
187-
do {
188-
std::this_thread::sleep_for(std::chrono::milliseconds(10));
189-
} while (isRunning() && std::chrono::steady_clock::now() < timeout);
190-
191-
if (isRunning()) {
192-
// no mercy!
193-
kill(child, SIGKILL);
194-
195-
int status;
196-
pid_t ret;
197-
while (true) {
198-
ret = waitpid(child, &status, 0);
199-
if (ret == reinterpret_cast<pid_t>(-1) && errno == EINTR) {
200-
// Just loop again
201-
continue;
202-
}
203-
break;
204-
}
205-
}
206-
#endif
167+
if (child) {
168+
allow_child_death = true;
169+
child->terminate();
207170
}
208-
171+
// make sure we reap the thread
172+
child.reset();
209173
if (!configfile.empty()) {
210174
try {
211175
remove(configfile);
@@ -222,54 +186,6 @@ void NodeImpl::parsePortnumberFile() {
222186
cb::io::rmrf(config["portnumber_file"]);
223187
}
224188

225-
#ifdef WIN32
226-
bool Node::isRunning() const {
227-
if (child != INVALID_HANDLE_VALUE) {
228-
DWORD status;
229-
230-
if (!GetExitCodeProcess(child, &status)) {
231-
throw std::system_error(
232-
GetLastError(),
233-
std::system_category(),
234-
"NodeImpl::isRunning: GetExitCodeProcess failed");
235-
236-
std::cerr << "GetExitCodeProcess: failed: " << cb_strerror()
237-
<< std::endl;
238-
exit(EXIT_FAILURE);
239-
}
240-
241-
if (status == STILL_ACTIVE) {
242-
return true;
243-
}
244-
245-
CloseHandle(child);
246-
child = INVALID_HANDLE_VALUE;
247-
}
248-
return false;
249-
}
250-
#else
251-
bool Node::isRunning() const {
252-
if (child != 0) {
253-
int status;
254-
auto next = waitpid(child, &status, WNOHANG);
255-
if (next == static_cast<pid_t>(-1)) {
256-
throw std::system_error(errno,
257-
std::system_category(),
258-
"NodeImpl::isRunning: waitpid failed");
259-
}
260-
261-
if (next == child) {
262-
child = 0;
263-
return false;
264-
}
265-
266-
return true;
267-
}
268-
269-
return false;
270-
}
271-
#endif
272-
273189
std::unique_ptr<MemcachedConnection> NodeImpl::getConnection() const {
274190
auto ret = connectionMap.getConnection().clone();
275191
ret->setAutoRetryTmpfail(true);

cluster_framework/node.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ class Node {
3030
public:
3131
virtual ~Node();
3232

33-
bool isRunning() const;
34-
3533
virtual std::unique_ptr<MemcachedConnection> getConnection() const = 0;
3634

3735
const boost::filesystem::path directory;
@@ -57,12 +55,6 @@ class Node {
5755

5856
protected:
5957
explicit Node(boost::filesystem::path dir);
60-
61-
#ifdef WIN32
62-
mutable HANDLE child = INVALID_HANDLE_VALUE;
63-
#else
64-
mutable pid_t child = 0;
65-
#endif
6658
};
6759

6860
} // namespace cb::test

daemon/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,6 @@ add_library(memcached_daemon STATIC
6262
network_interface_manager.h
6363
opentelemetry.cc
6464
opentelemetry.h
65-
parent_monitor.cc
66-
parent_monitor.h
6765
protocol/mcbp/adjust_timeofday_executor.cc
6866
protocol/mcbp/appendprepend_context.cc
6967
protocol/mcbp/appendprepend_context.h

daemon/memcached.cc

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
#include "network_interface_manager.h"
2929
#include "nobucket_taskable.h"
3030
#include "opentelemetry.h"
31-
#include "parent_monitor.h"
3231
#include "protocol/mcbp/engine_wrapper.h"
3332
#include "runtime.h"
3433
#include "server_core_api.h"
@@ -53,6 +52,7 @@
5352
#include <platform/backtrace.h>
5453
#include <platform/dirutils.h>
5554
#include <platform/interrupt.h>
55+
#include <platform/process_monitor.h>
5656
#include <platform/scope_timer.h>
5757
#include <platform/strerror.h>
5858
#include <platform/sysinfo.h>
@@ -855,7 +855,7 @@ int memcached_main(int argc, char** argv) {
855855
environment.max_file_descriptors = cb::io::maximizeFileDescriptors(
856856
std::numeric_limits<uint32_t>::max());
857857

858-
std::unique_ptr<ParentMonitor> parent_monitor;
858+
std::unique_ptr<ProcessMonitor> parent_monitor;
859859

860860
try {
861861
cb::logger::createConsoleLogger();
@@ -1077,7 +1077,15 @@ int memcached_main(int argc, char** argv) {
10771077
const int parent = Settings::instance().getParentIdentifier();
10781078
if (parent != -1) {
10791079
LOG_INFO_RAW("Starting parent monitor");
1080-
parent_monitor = std::make_unique<ParentMonitor>(parent);
1080+
parent_monitor = ProcessMonitor::create(
1081+
parent,
1082+
[parent](auto&) {
1083+
std::cerr << "Parent process " << parent
1084+
<< " died. Exiting" << std::endl;
1085+
std::cerr.flush();
1086+
std::_Exit(EXIT_FAILURE);
1087+
},
1088+
"mc:parent_mon");
10811089
}
10821090
}
10831091

0 commit comments

Comments
 (0)