Skip to content

Commit a2c04e2

Browse files
committed
MB-31402: Back off logging of stuck connections
Change logging of stuck connections during deletion to: 1. Don't log anything the first 30 seconds. Then dump the state of all stuck connections. 2. Don't log anything for the next 30 seconds. Then dump the state of all stuck connections which changed since the previous dump. 3. goto 2. Change-Id: Ia0520ae28bb0f6b799f52c9e8902607c95d9485b Reviewed-on: http://review.couchbase.org/103254 Tested-by: Build Bot <[email protected]> Reviewed-by: Dave Rigby <[email protected]>
1 parent 8e0e60f commit a2c04e2

File tree

6 files changed

+117
-135
lines changed

6 files changed

+117
-135
lines changed

daemon/connections.cc

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -64,26 +64,19 @@ static Connection* allocate_connection(SOCKET sfd,
6464
static void release_connection(Connection* c);
6565

6666
/** External functions *******************************************************/
67-
int signal_idle_clients(FrontEndThread* me, int bucket_idx, bool logging) {
68-
// We've got a situation right now where we're seeing that
69-
// some of the connections is "stuck". Let's dump all
70-
// information until we solve the bug.
71-
logging = true;
7267

68+
int signal_idle_clients(FrontEndThread& me) {
7369
int connected = 0;
74-
std::lock_guard<std::mutex> lock(connections.mutex);
75-
for (auto* c : connections.conns) {
76-
if (c->getThread() == me) {
77-
++connected;
78-
if (bucket_idx == -1 || c->getBucketIndex() == bucket_idx) {
79-
if (!c->signalIfIdle() && logging) {
80-
auto details = c->toJSON().dump();
81-
LOG_INFO("Worker thread {}: {}", me->index, details);
70+
const auto index = me.index;
71+
72+
iterate_thread_connections(
73+
&me, [index, &connected](Connection& connection) {
74+
++connected;
75+
if (!connection.signalIfIdle()) {
76+
auto details = connection.toJSON().dump();
77+
LOG_INFO("Worker thread {}: {}", index, details);
8278
}
83-
}
84-
}
85-
}
86-
79+
});
8780
return connected;
8881
}
8982

daemon/connections.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
struct FrontEndThread;
2929
class ListeningPort;
30+
class Bucket;
3031

3132
/* Destroy all connections and reset connection management */
3233
void destroy_connections();
@@ -82,17 +83,15 @@ Connection* conn_new(const SOCKET sfd,
8283
ListeningPort *get_listening_port_instance(const in_port_t port);
8384

8485
/**
85-
* Signal (set writable) all idle clients bound to either a specific
86-
* bucket specified by its index, or any bucket (specified as -1).
87-
* Due to the threading model we're only going to look at the clients
88-
* connected to the thread represented by me.
86+
* Signal all of the idle clients in the system.
8987
*
90-
* @param me the thread to inspect
91-
* @param bucket_idx the bucket we'd like to signal (set to -1 to signal
92-
* all buckets)
93-
* @return the number of client connections bound to this thread.
88+
* Due to the locking model we need to have the FrontEndThread locked
89+
* when calling the method.
90+
*
91+
* @param me The connections to inspect must be bound to this thread
92+
* @return The number of client connections bound to this thread
9493
*/
95-
int signal_idle_clients(FrontEndThread* me, int bucket_idx, bool logging);
94+
int signal_idle_clients(FrontEndThread& me);
9695

9796
/**
9897
* Iterate over all of the connections and call the callback function

daemon/front_end_thread.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -124,17 +124,6 @@ struct FrontEndThread {
124124
*/
125125
Subdoc::Operation subdoc_op;
126126

127-
/**
128-
* When we're deleting buckets we need to disconnect idle
129-
* clients. This variable is incremented for every delete bucket
130-
* thread running and decremented when it's done. When this
131-
* variable is set we'll try to look through all connections and
132-
* update them with a write event if they're in an "idle"
133-
* state. That should cause them to be rescheduled and cause the
134-
* client to disconnect.
135-
*/
136-
int deleting_buckets = 0;
137-
138127
/**
139128
* Shared validator used by all connections serviced by this thread
140129
* when they need to validate a JSON document
@@ -147,4 +136,3 @@ struct FrontEndThread {
147136

148137
void notify_thread(FrontEndThread& thread);
149138
void notify_dispatcher();
150-
void notify_thread_bucket_deletion(FrontEndThread& me);

daemon/memcached.cc

Lines changed: 97 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,7 @@ void event_handler(evutil_socket_t fd, short which, void *arg) {
765765

766766
if (memcached_shutdown) {
767767
// Someone requested memcached to shut down.
768-
if (signal_idle_clients(thr, -1, false) == 0) {
768+
if (signal_idle_clients(*thr) == 0) {
769769
cb_assert(thr != nullptr);
770770
LOG_INFO("Stopping worker thread {}", thr->index);
771771
c->eventBaseLoopbreak();
@@ -781,7 +781,7 @@ void event_handler(evutil_socket_t fd, short which, void *arg) {
781781
if (memcached_shutdown) {
782782
// Someone requested memcached to shut down. If we don't have
783783
// any connections bound to this thread we can just shut down
784-
int connected = signal_idle_clients(thr, -1, true);
784+
int connected = signal_idle_clients(*thr);
785785
if (connected == 0) {
786786
LOG_INFO("Stopping worker thread {}", thr->index);
787787
event_base_loopbreak(thr->base);
@@ -1789,22 +1789,6 @@ void CreateBucketThread::run()
17891789
task->makeRunnable();
17901790
}
17911791

1792-
void notify_thread_bucket_deletion(FrontEndThread& me) {
1793-
for (size_t ii = 0; ii < all_buckets.size(); ++ii) {
1794-
bool destroy = false;
1795-
{
1796-
std::lock_guard<std::mutex> guard(all_buckets[ii].mutex);
1797-
if (all_buckets[ii].state == BucketState::Destroying) {
1798-
destroy = true;
1799-
}
1800-
}
1801-
1802-
if (destroy) {
1803-
signal_idle_clients(&me, gsl::narrow<int>(ii), false);
1804-
}
1805-
}
1806-
}
1807-
18081792
void DestroyBucketThread::destroy() {
18091793
ENGINE_ERROR_CODE ret = ENGINE_KEY_ENOENT;
18101794
std::unique_lock<std::mutex> all_bucket_lock(buckets_lock);
@@ -1861,58 +1845,114 @@ void DestroyBucketThread::destroy() {
18611845

18621846
perform_callbacks(ON_DELETE_BUCKET, nullptr, &all_buckets[idx]);
18631847

1864-
LOG_INFO("{} Delete bucket [{}]. Wait for clients to disconnect",
1865-
connection_id,
1866-
name);
1867-
18681848
/* If this thread is connected to the requested bucket... release it */
18691849
if (connection != nullptr && idx == size_t(connection->getBucketIndex())) {
18701850
disassociate_bucket(*connection);
18711851
}
18721852

1873-
/* Let all of the worker threads start invalidating connections */
1874-
threads_initiate_bucket_deletion();
1875-
1853+
// Wait until all users disconnected...
18761854
auto& bucket = all_buckets[idx];
1877-
1878-
/* Wait until all users disconnected... */
18791855
{
18801856
std::unique_lock<std::mutex> guard(bucket.mutex);
1857+
if (bucket.clients > 0) {
1858+
LOG_INFO("{} Delete bucket [{}]. Wait for {} clients to disconnect",
1859+
connection_id,
1860+
name,
1861+
bucket.clients);
18811862

1882-
while (bucket.clients > 0) {
1883-
LOG_INFO(
1884-
"{} Delete bucket [{}]. Still waiting: {} clients "
1885-
"connected",
1886-
connection_id.c_str(),
1887-
name.c_str(),
1888-
bucket.clients);
1889-
/* drop the lock and notify the worker threads */
1863+
// Signal clients bound to the bucket before waiting
18901864
guard.unlock();
1891-
threads_notify_bucket_deletion();
1865+
iterate_all_connections([&bucket](Connection& connection) {
1866+
if (&connection.getBucket() == &bucket) {
1867+
connection.signalIfIdle();
1868+
}
1869+
});
18921870
guard.lock();
1893-
1894-
bucket.cond.wait_for(guard,
1895-
std::chrono::milliseconds(1000),
1896-
[&bucket] { return bucket.clients == 0; });
18971871
}
1898-
}
18991872

1900-
/* Tell the worker threads to stop trying to invalidating connections */
1901-
threads_complete_bucket_deletion();
1873+
using std::chrono::seconds;
1874+
using std::chrono::steady_clock;
19021875

1903-
/*
1904-
* We cannot call assert_no_assocations(idx) because it iterates
1905-
* over all connections and calls c->getBucketIndex(). The problem
1906-
* is that a worker thread can call associate_initial_bucket() or
1907-
* associate_bucket() at the same time. This could lead to a call
1908-
* to c->setBucketIndex(0) (the "no bucket"), which although safe,
1909-
* raises a threadsanitizer warning.
1910-
1911-
* Note, if associate_bucket() attempts to associate a connection
1912-
* with a bucket that has been destroyed, or is in the process of
1913-
* being destroyed, the association will fail because
1914-
* BucketState != Ready. See associate_bucket() for more details.
1915-
*/
1876+
auto nextLog = steady_clock::now() + seconds(30);
1877+
nlohmann::json prevDump;
1878+
1879+
// We need to disconnect all of the clients before we can delete the
1880+
// bucket. We try to log stuff while we're waiting in order to
1881+
// know why stuff isn't complete. We'll do it in the following way:
1882+
//
1883+
// 1. Don't log anything the first 30 seconds. Then
1884+
// dump the state of all stuck connections.
1885+
// 2. Don't log anything for the next 30 seconds. Then
1886+
// dump the state of all stuck connections which
1887+
// changed since the previous dump.
1888+
// 3. goto 2.
1889+
//
1890+
while (bucket.clients > 0) {
1891+
bucket.cond.wait_for(guard, seconds(1), [&bucket] {
1892+
return bucket.clients == 0;
1893+
});
1894+
1895+
if (bucket.clients == 0) {
1896+
break;
1897+
}
1898+
1899+
if (steady_clock::now() < nextLog) {
1900+
guard.unlock();
1901+
iterate_all_connections([&bucket](Connection& connection) {
1902+
if (&connection.getBucket() == &bucket) {
1903+
connection.signalIfIdle();
1904+
}
1905+
});
1906+
guard.lock();
1907+
continue;
1908+
}
1909+
1910+
nextLog = steady_clock::now() + seconds(30);
1911+
1912+
// drop the lock and notify the worker threads
1913+
guard.unlock();
1914+
1915+
nlohmann::json json;
1916+
iterate_all_connections([&bucket, &json](Connection& connection) {
1917+
if (&connection.getBucket() == &bucket) {
1918+
if (!connection.signalIfIdle()) {
1919+
json[std::to_string(connection.getId())] =
1920+
connection.toJSON();
1921+
}
1922+
}
1923+
});
1924+
1925+
// remove all connections which didn't change
1926+
for (auto it = prevDump.begin(); it != prevDump.end(); ++it) {
1927+
auto entry = json.find(it.key());
1928+
if (entry != json.end()) {
1929+
auto old = it.value().dump();
1930+
auto current = entry->dump();
1931+
if (old == current) {
1932+
json.erase(entry);
1933+
}
1934+
}
1935+
}
1936+
1937+
prevDump = std::move(json);
1938+
if (prevDump.empty()) {
1939+
LOG_INFO(
1940+
R"({} Delete bucket [{}]. Still waiting: {} clients connected (state is unchanged).)",
1941+
connection_id,
1942+
name,
1943+
bucket.clients);
1944+
} else {
1945+
LOG_INFO(
1946+
R"({} Delete bucket [{}]. Still waiting: {} clients connected: {})",
1947+
connection_id,
1948+
name,
1949+
bucket.clients,
1950+
prevDump.dump());
1951+
}
1952+
1953+
guard.lock();
1954+
}
1955+
}
19161956

19171957
LOG_INFO(
19181958
"{} Delete bucket [{}]. Shut down the bucket", connection_id, name);
@@ -1923,7 +1963,7 @@ void DestroyBucketThread::destroy() {
19231963
connection_id,
19241964
name);
19251965

1926-
/* Clean up the stats... */
1966+
// Clean up the stats...
19271967
threadlocal_stats_reset(bucket.stats);
19281968

19291969
// Clear any registered event handlers

daemon/memcached.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,6 @@ const char* get_server_version(void);
110110
*/
111111
void update_topkeys(const Cookie& cookie);
112112

113-
void threads_notify_bucket_deletion();
114-
void threads_complete_bucket_deletion();
115-
void threads_initiate_bucket_deletion();
116-
117113
SERVER_HANDLE_V1* get_server_api();
118114

119115
void shutdown_server();

daemon/thread.cc

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ static void thread_libevent_process(evutil_socket_t fd, short, void* arg) {
278278
return;
279279
}
280280

281-
if (signal_idle_clients(&me, -1, false) == 0) {
281+
if (signal_idle_clients(me) == 0) {
282282
LOG_INFO("Stopping worker thread {}", me.index);
283283
event_base_loopbreak(me.base);
284284
return;
@@ -342,22 +342,14 @@ static void thread_libevent_process(evutil_socket_t fd, short, void* arg) {
342342
run_event_loop(c, EV_READ | EV_WRITE);
343343
}
344344

345-
/*
346-
* I could look at all of the connection objects bound to dying buckets
347-
*/
348-
if (me.deleting_buckets) {
349-
notify_thread_bucket_deletion(me);
350-
}
351-
352345
if (memcached_shutdown) {
353346
// Someone requested memcached to shut down. If we don't have
354347
// any connections bound to this thread we can just shut down
355-
auto connected = signal_idle_clients(&me, -1, true);
348+
auto connected = signal_idle_clients(me);
356349
if (connected == 0) {
357350
LOG_INFO("Stopping worker thread {}", me.index);
358351
event_base_loopbreak(me.base);
359352
} else {
360-
// @todo Change loglevel once MB-16255 is resolved
361353
LOG_INFO("Waiting for {} connected clients on worker thread {}",
362354
connected,
363355
me.index);
@@ -515,32 +507,6 @@ FrontEndThread::~FrontEndThread() {
515507
}
516508
}
517509

518-
void threads_notify_bucket_deletion() {
519-
for (auto& thr : threads) {
520-
notify_thread(thr);
521-
}
522-
}
523-
524-
void threads_complete_bucket_deletion() {
525-
for (auto& thr : threads) {
526-
TRACE_LOCKGUARD_TIMED(thr.mutex,
527-
"mutex",
528-
"threads_complete_bucket_deletion::threadLock",
529-
SlowMutexThreshold);
530-
thr.deleting_buckets--;
531-
}
532-
}
533-
534-
void threads_initiate_bucket_deletion() {
535-
for (auto& thr : threads) {
536-
TRACE_LOCKGUARD_TIMED(thr.mutex,
537-
"mutex",
538-
"threads_initiate_bucket_deletion::threadLock",
539-
SlowMutexThreshold);
540-
thr.deleting_buckets++;
541-
}
542-
}
543-
544510
void notify_thread(FrontEndThread& thread) {
545511
if (cb::net::send(thread.notify[1], "", 1, 0) != 1 &&
546512
!cb::net::is_blocking(cb::net::get_socket_error())) {

0 commit comments

Comments
 (0)