Skip to content

Commit 3cb8569

Browse files
xinhaoyuan authored and copybara-github committed
No public description
PiperOrigin-RevId: 781152254
1 parent d5857a5 commit 3cb8569

File tree

3 files changed

+121
-8
lines changed

3 files changed

+121
-8
lines changed

centipede/command.cc

Lines changed: 27 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -333,9 +333,10 @@ int Command::Execute() {
333333
struct pollfd poll_fd = {};
334334
int poll_ret = -1;
335335
auto poll_deadline = absl::Now() + options_.timeout;
336-
// The `poll()` syscall can get interrupted: it sets errno==EINTR in that
337-
// case. We should tolerate that.
336+
bool sigterm_sent = false;
337+
bool try_again = false;
338338
do {
339+
try_again = false;
339340
// NOTE: `poll_fd` has to be reset every time.
340341
poll_fd = {
341342
/*fd=*/fork_server_->pipe_[1], // The file descriptor to wait for.
@@ -344,15 +345,36 @@ int Command::Execute() {
344345
const int poll_timeout_ms = static_cast<int>(absl::ToInt64Milliseconds(
345346
std::max(poll_deadline - absl::Now(), absl::Milliseconds(1))));
346347
poll_ret = poll(&poll_fd, 1, poll_timeout_ms);
347-
} while (poll_ret < 0 && errno == EINTR);
348+
// The `poll()` syscall can get interrupted: it sets errno==EINTR in that
349+
// case. We should tolerate that.
350+
if (poll_ret < 0 && errno == EINTR) {
351+
try_again = true;
352+
continue;
353+
}
354+
if (poll_ret == 0 && !sigterm_sent) {
355+
LogProblemInfo(
356+
absl::StrCat("Timeout while waiting for fork server: timeout is ",
357+
absl::FormatDuration(options_.timeout)));
358+
CHECK_NE(fork_server_->pid_, -1);
359+
LOG(INFO) << "Sending SIGTERM to the fork server PID "
360+
<< fork_server_->pid_ << " and waiting for 60s";
361+
kill(fork_server_->pid_, SIGTERM);
362+
sigterm_sent = true;
363+
poll_deadline += absl::Seconds(60);
364+
try_again = true;
365+
continue;
366+
}
367+
} while (try_again);
348368

349369
if (poll_ret != 1 || (poll_fd.revents & POLLIN) == 0) {
350370
// The fork server errored out or timed out, or some other error occurred,
351371
// e.g. the syscall was interrupted.
352372
if (poll_ret == 0) {
373+
CHECK(sigterm_sent);
353374
LogProblemInfo(
354-
absl::StrCat("Timeout while waiting for fork server: timeout is ",
355-
absl::FormatDuration(options_.timeout)));
375+
"Fork server did not respond within 60s after SIGTERM was sent");
376+
// TODO: xinhaoyuan - the right thing to do is to either properly
377+
// recover or request early exit.
356378
} else {
357379
LogProblemInfo(absl::StrCat(
358380
"Error while waiting for fork server: poll() returned ", poll_ret));

centipede/command_test.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -190,7 +190,7 @@ TEST(CommandTest, ForkServer) {
190190
cmd_options.timeout = absl::Seconds(2);
191191
Command cmd{helper, std::move(cmd_options)};
192192
ASSERT_TRUE(cmd.StartForkServer(test_tmpdir, "ForkServer"));
193-
EXPECT_EQ(cmd.Execute(), EXIT_FAILURE);
193+
EXPECT_EQ(cmd.Execute(), SIGTERM);
194194
std::string log_contents;
195195
ReadFromLocalFile(log, log_contents);
196196
EXPECT_EQ(log_contents, absl::Substitute("Got input: $0", input));

centipede/runner_fork_server.cc

Lines changed: 93 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@
5757
#else // __APPLE__
5858
#include <linux/limits.h> // ARG_MAX
5959
#endif // __APPLE__
60+
#include <signal.h>
6061
#include <sys/wait.h>
6162
#include <unistd.h>
6263

@@ -205,6 +206,46 @@ __attribute__((constructor(150))) void ForkServerCallMeVeryEarly() {
205206
if (pipe1 < 0) Exit("###open pipe1 failed\n");
206207
Log("###Centipede fork server ready\n");
207208

209+
struct sigaction old_sigterm_act{};
210+
struct sigaction sigterm_act{};
211+
sigterm_act.sa_handler = [](int) {};
212+
if (sigaction(SIGTERM, &sigterm_act, &old_sigterm_act) != 0) {
213+
Exit("###sigaction failed on SIGTERM for the fork server");
214+
}
215+
216+
struct sigaction old_sigchld_act{};
217+
struct sigaction sigchld_act{};
218+
sigchld_act.sa_handler = [](int) {};
219+
if (sigaction(SIGCHLD, &sigchld_act, &old_sigchld_act) != 0) {
220+
Exit("###sigaction failed on SIGCHLD for the fork server");
221+
}
222+
223+
sigset_t old_sigset;
224+
sigset_t server_sigset;
225+
if (sigprocmask(SIG_SETMASK, nullptr, &server_sigset) != 0) {
226+
Exit("###sigprocmask() failed to get the existing sigset\n");
227+
}
228+
if (sigaddset(&server_sigset, SIGTERM) != 0) {
229+
Exit("###sigaddset() failed to add SIGTERM\n");
230+
}
231+
if (sigaddset(&server_sigset, SIGCHLD) != 0) {
232+
Exit("###sigaddset() failed to add SIGCHLD\n");
233+
}
234+
if (sigprocmask(SIG_SETMASK, &server_sigset, &old_sigset) != 0) {
235+
Exit("###sigprocmask() failed to set the fork server sigset\n");
236+
}
237+
238+
sigset_t wait_sigset;
239+
if (sigemptyset(&wait_sigset) != 0) {
240+
Exit("###sigemptyset() failed\n");
241+
}
242+
if (sigaddset(&wait_sigset, SIGTERM) != 0) {
243+
Exit("###sigaddset() failed to add SIGTERM to the wait sigset\n");
244+
}
245+
if (sigaddset(&wait_sigset, SIGCHLD) != 0) {
246+
Exit("###sigaddset() failed to add SIGCHLD to the wait sigset\n");
247+
}
248+
208249
// Loop.
209250
while (true) {
210251
Log("###Centipede fork server blocking on pipe0\n");
@@ -216,6 +257,15 @@ __attribute__((constructor(150))) void ForkServerCallMeVeryEarly() {
216257
if (pid < 0) {
217258
Exit("###fork failed\n");
218259
} else if (pid == 0) {
260+
if (sigaction(SIGTERM, &old_sigterm_act, nullptr) != 0) {
261+
Exit("###sigaction failed on SIGTERM for the child");
262+
}
263+
if (sigaction(SIGCHLD, &old_sigchld_act, nullptr) != 0) {
264+
Exit("###sigaction failed on SIGCHLD for the child");
265+
}
266+
if (sigprocmask(SIG_SETMASK, &old_sigset, nullptr) != 0) {
267+
Exit("###sigprocmask() failed to restore the previous sigset\n");
268+
}
219269
// Child process. Reset stdout/stderr and let it run normally.
220270
for (int fd = 1; fd <= 2; fd++) {
221271
lseek(fd, 0, SEEK_SET);
@@ -227,7 +277,28 @@ __attribute__((constructor(150))) void ForkServerCallMeVeryEarly() {
227277
} else {
228278
// Parent process.
229279
int status = -1;
230-
if (waitpid(pid, &status, 0) < 0) Exit("###waitpid failed\n");
280+
while (true) {
281+
int sig = -1;
282+
if (sigwait(&wait_sigset, &sig) != 0) {
283+
Exit("###sigwait() failed\n");
284+
}
285+
if (sig == SIGCHLD) {
286+
Log("###Got SIGCHLD\n");
287+
const pid_t ret = waitpid(pid, &status, WNOHANG);
288+
if (ret < 0) {
289+
Exit("###waitpid failed\n");
290+
}
291+
if (ret == pid && (WIFEXITED(status) || WIFSIGNALED(status))) {
292+
Log("###Got exit status\n");
293+
break;
294+
}
295+
} else if (sig == SIGTERM) {
296+
Log("###Got SIGTERM\n");
297+
kill(pid, SIGTERM);
298+
} else {
299+
Exit("###Unknown signal from sigwait\n");
300+
}
301+
}
231302
if (WIFEXITED(status)) {
232303
if (WEXITSTATUS(status) == EXIT_SUCCESS)
233304
Log("###Centipede fork returned EXIT_SUCCESS\n");
@@ -239,8 +310,28 @@ __attribute__((constructor(150))) void ForkServerCallMeVeryEarly() {
239310
Log("###Centipede fork crashed\n");
240311
}
241312
Log("###Centipede fork writing status to pipe1\n");
242-
if (write(pipe1, &status, sizeof(status)) == -1)
313+
if (write(pipe1, &status, sizeof(status)) == -1) {
243314
Exit("###write to pipe1 failed\n");
315+
}
316+
// Deplete any remaining signals before the next execution. Controller
317+
// won't send more signals after write succeeded.
318+
{
319+
sigset_t pending;
320+
while (true) {
321+
if (sigpending(&pending) != 0) {
322+
Exit("###sigpending() failed\n");
323+
}
324+
if (sigismember(&pending, SIGTERM) ||
325+
sigismember(&pending, SIGCHLD)) {
326+
int unused_sig;
327+
if (sigwait(&wait_sigset, &unused_sig) != 0) {
328+
Exit("###sigwait() failed\n");
329+
}
330+
} else {
331+
break;
332+
}
333+
}
334+
}
244335
}
245336
}
246337
// The only way out of the loop is via Exit() or return.

0 commit comments

Comments
 (0)