Skip to content

Commit 1fb9871

Browse files
nshylocker
authored andcommitted
fiber: don't hang on uncancellable client fiber
Similar to handling of uncancellable iproto request let's also panic on uncancellable client fiber on shutdown. Now `sql-tap/in2.test` start to fail under ASAN. `metrics_collector` fiber of feedback_daemon is not cancelled in due time on shutdown. Server panics and test fails. Turned out the fiber is cancelled during metrics collection which is done under pcall. So cancelling is ignored and we start the loop again going to sleep. fiber.sleep() will detect cancel state after sleep but we panic before that moment. We tried to make fiber.sleep() cancellation point but 2 tests start to spin in busy loop (one of them is the test added in the patch). So this approach looks dangeous. Part of tarantool#8423 NO_CHANGELOG=internal NO_DOC=internal
1 parent 22d507d commit 1fb9871

File tree

6 files changed

+54
-9
lines changed

6 files changed

+54
-9
lines changed

src/box/box.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6005,7 +6005,10 @@ box_storage_shutdown()
60056005
* we won't need to handle requests from client fibers after/during
60066006
* subsystem shutdown.
60076007
*/
6008-
fiber_shutdown();
6008+
if (fiber_shutdown(box_shutdown_timeout) != 0) {
6009+
diag_log();
6010+
panic("cannot gracefully shutdown client fibers");
6011+
}
60096012
replication_shutdown();
60106013
gc_shutdown();
60116014
engine_shutdown();

src/box/lua/feedback_daemon.lua

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,9 @@ local function metrics_collect_loop(self)
426426
if new_metric ~= nil then
427427
insert_metric(self, new_metric)
428428
end
429+
if not pcall(fiber.testcancel) then
430+
break
431+
end
429432
end
430433
self.shutdown:put("stopped")
431434
end

src/lib/core/fiber.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2377,8 +2377,8 @@ fiber_set_system(struct fiber *f, bool yesno)
23772377
}
23782378
}
23792379

2380-
void
2381-
fiber_shutdown(void)
2380+
int
2381+
fiber_shutdown(double timeout)
23822382
{
23832383
assert(cord()->shutdown_fiber == NULL);
23842384
cord()->is_shutdown = true;
@@ -2388,7 +2388,13 @@ fiber_shutdown(void)
23882388
fiber_cancel(fiber);
23892389
}
23902390
cord()->shutdown_fiber = fiber();
2391-
while (cord()->client_fiber_count != 0)
2392-
fiber_yield();
2391+
double deadline = ev_monotonic_now(loop()) + timeout;
2392+
while (cord()->client_fiber_count != 0) {
2393+
if (fiber_yield_deadline(deadline)) {
2394+
diag_set(TimedOut);
2395+
break;
2396+
}
2397+
}
23932398
cord()->shutdown_fiber = NULL;
2399+
return cord()->client_fiber_count == 0 ? 0 : -1;
23942400
}

src/lib/core/fiber.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1254,9 +1254,17 @@ fiber_lua_state(struct fiber *f);
12541254
void
12551255
fiber_set_system(struct fiber *f, bool yesno);
12561256

1257-
/** Cancel all client (non system) fibers and wait until they finished. */
1258-
void
1259-
fiber_shutdown(void);
1257+
/**
1258+
* Cancel all client (non system) fibers and wait until they finished.
1259+
*
1260+
* If not finished in given timeout then failure result code is returned.
1261+
*
1262+
* Return:
1263+
* 0 - success
1264+
* -1 - failure (diag is set)
1265+
*/
1266+
int
1267+
fiber_shutdown(double timeout);
12601268

12611269
#if defined(__cplusplus)
12621270
} /* extern "C" */

test/box-luatest/shutdown_test.lua

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,3 +119,27 @@ g.test_shutdown_of_hanging_iproto_request = function(cg)
119119
t.assert(cg.server:grep_log('cannot gracefully shutdown iproto', nil,
120120
{filename = log}))
121121
end
122+
123+
-- Test shutdown does not hang if there is client fiber that
124+
-- cannot be cancelled.
125+
g.test_shutdown_of_hanging_client_fiber = function(cg)
126+
cg.server:exec(function()
127+
local log = require('log')
128+
local fiber = require('fiber')
129+
local tweaks = require('internal.tweaks')
130+
tweaks.box_shutdown_timeout = 1.0
131+
fiber.new(function()
132+
log.info('going to sleep for test')
133+
while true do
134+
pcall(fiber.sleep, 1000)
135+
end
136+
end)
137+
end)
138+
t.helpers.retrying({}, function()
139+
t.assert(cg.server:grep_log('going to sleep for test'))
140+
end)
141+
local log = fio.pathjoin(cg.server.workdir, cg.server.alias .. '.log')
142+
test_no_hang_on_shutdown(cg.server)
143+
t.assert(cg.server:grep_log('cannot gracefully shutdown client fibers', nil,
144+
{filename = log}))
145+
end

test/unit/fiber.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -722,7 +722,8 @@ fiber_test_shutdown(void)
722722
fail_unless(fiber4 != NULL);
723723
fiber_set_joinable(fiber4, true);
724724

725-
fiber_shutdown();
725+
int rc = fiber_shutdown(1000.0);
726+
fail_unless(rc == 0);
726727

727728
fail_unless((fiber1->flags & FIBER_IS_DEAD) != 0);
728729
fail_unless((fiber2->flags & FIBER_IS_DEAD) == 0);

0 commit comments

Comments
 (0)