Commit 2b830e5

refactor: Use EventLoopRef instead of addClient/removeClient
Use EventLoopRef to avoid reference counting bugs and be more exception safe
1 parent 315ff53 commit 2b830e5
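
The core of this change is replacing manually paired addClient()/removeClient() calls with a scoped reference object, so the EventLoop client count is decremented exactly once even when a scope exits early or throws. Below is a minimal, hypothetical sketch of that RAII pattern, for illustration only, reusing the addClient/removeClient/m_num_clients/m_mutex/reset() names that appear in this diff; it is not the actual EventLoopRef declared in include/mp/proxy.h, which differs in detail.

// Hypothetical, simplified sketch -- not the actual classes in this repo.
#include <cassert>
#include <mutex>

struct EventLoop
{
    std::mutex m_mutex;
    int m_num_clients{0};
    void addClient(std::unique_lock<std::mutex>&) { m_num_clients += 1; }
    bool removeClient(std::unique_lock<std::mutex>&)
    {
        assert(m_num_clients > 0);
        m_num_clients -= 1;
        return m_num_clients == 0; // true when the last client is gone
    }
};

class EventLoopRef
{
public:
    //! Optionally reuse a lock on loop.m_mutex that the caller already holds,
    //! mirroring the `EventLoopRef ref{*this, &lock}` call sites in this commit.
    explicit EventLoopRef(EventLoop& loop, std::unique_lock<std::mutex>* lock = nullptr)
        : m_loop(&loop), m_lock(lock)
    {
        if (m_lock) {
            m_loop->addClient(*m_lock);
        } else {
            std::unique_lock<std::mutex> local{m_loop->m_mutex};
            m_loop->addClient(local);
        }
    }
    EventLoopRef(const EventLoopRef&) = delete;
    EventLoopRef& operator=(const EventLoopRef&) = delete;
    ~EventLoopRef() { reset(); }

    //! Drop the reference early; returns true if this was the last client.
    bool reset()
    {
        bool done = false;
        if (EventLoop* loop = m_loop) {
            m_loop = nullptr;
            if (m_lock) {
                done = loop->removeClient(*m_lock);
            } else {
                std::unique_lock<std::mutex> local{loop->m_mutex};
                done = loop->removeClient(local);
            }
        }
        return done;
    }

    EventLoop& operator*() const { return *m_loop; }
    EventLoop* operator->() const { return m_loop; }

private:
    EventLoop* m_loop;
    std::unique_lock<std::mutex>* m_lock;
};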

3 files changed: +29 -49 lines changed

include/mp/proxy-io.h

Lines changed: 5 additions & 31 deletions
@@ -313,21 +313,13 @@ class Connection
     Connection(EventLoop& loop, kj::Own<kj::AsyncIoStream>&& stream_)
         : m_loop(loop), m_stream(kj::mv(stream_)),
           m_network(*m_stream, ::capnp::rpc::twoparty::Side::CLIENT, ::capnp::ReaderOptions()),
-          m_rpc_system(::capnp::makeRpcClient(m_network))
-    {
-        std::unique_lock<std::mutex> lock(m_loop.m_mutex);
-        m_loop.addClient(lock);
-    }
+          m_rpc_system(::capnp::makeRpcClient(m_network)) {}
     Connection(EventLoop& loop,
         kj::Own<kj::AsyncIoStream>&& stream_,
         const std::function<::capnp::Capability::Client(Connection&)>& make_client)
         : m_loop(loop), m_stream(kj::mv(stream_)),
           m_network(*m_stream, ::capnp::rpc::twoparty::Side::SERVER, ::capnp::ReaderOptions()),
-          m_rpc_system(::capnp::makeRpcServer(m_network, make_client(*this)))
-    {
-        std::unique_lock<std::mutex> lock(m_loop.m_mutex);
-        m_loop.addClient(lock);
-    }
+          m_rpc_system(::capnp::makeRpcServer(m_network, make_client(*this))) {}
 
     //! Run cleanup functions. Must be called from the event loop thread. First
     //! calls synchronous cleanup functions while blocked (to free capnp
@@ -356,12 +348,12 @@ class Connection
         // to the EventLoop TaskSet to avoid "Promise callback destroyed itself"
         // error in cases where f deletes this Connection object.
         m_on_disconnect.add(m_network.onDisconnect().then(
-            [f = std::forward<F>(f), this]() mutable { m_loop.m_task_set->add(kj::evalLater(kj::mv(f))); }));
+            [f = std::forward<F>(f), this]() mutable { m_loop->m_task_set->add(kj::evalLater(kj::mv(f))); }));
     }
 
-    EventLoop& m_loop;
+    EventLoopRef m_loop;
     kj::Own<kj::AsyncIoStream> m_stream;
-    LoggingErrorHandler m_error_handler{m_loop};
+    LoggingErrorHandler m_error_handler{*m_loop};
     kj::TaskSet m_on_disconnect{m_error_handler};
     ::capnp::TwoPartyVatNetwork m_network;
     std::optional<::capnp::RpcSystem<::capnp::rpc::twoparty::VatId>> m_rpc_system;
@@ -404,21 +396,12 @@ ProxyClientBase<Interface, Impl>::ProxyClientBase(typename Interface::Client cli
     : m_client(std::move(client)), m_context(connection)
 
 {
-    {
-        std::unique_lock<std::mutex> lock(m_context.connection->m_loop.m_mutex);
-        m_context.connection->m_loop.addClient(lock);
-    }
-
     // Handler for the connection getting destroyed before this client object.
     auto cleanup_it = m_context.connection->addSyncCleanup([this]() {
         // Release client capability by move-assigning to temporary.
         {
             typename Interface::Client(std::move(m_client));
         }
-        {
-            std::unique_lock<std::mutex> lock(m_context.connection->m_loop.m_mutex);
-            m_context.connection->m_loop.removeClient(lock);
-        }
         m_context.connection = nullptr;
     });
 
@@ -451,11 +434,6 @@ ProxyClientBase<Interface, Impl>::ProxyClientBase(typename Interface::Client cli
     {
         typename Interface::Client(std::move(m_client));
     }
-    {
-        std::unique_lock<std::mutex> lock(m_context.connection->m_loop.m_mutex);
-        m_context.connection->m_loop.removeClient(lock);
-    }
-
     if (destroy_connection) {
         delete m_context.connection;
         m_context.connection = nullptr;
@@ -477,8 +455,6 @@ ProxyServerBase<Interface, Impl>::ProxyServerBase(std::shared_ptr<Impl> impl, Co
     : m_impl(std::move(impl)), m_context(&connection)
 {
     assert(m_impl);
-    std::unique_lock<std::mutex> lock(m_context.connection->m_loop.m_mutex);
-    m_context.connection->m_loop.addClient(lock);
 }
 
 //! ProxyServer destructor, called from the EventLoop thread by Cap'n Proto
@@ -512,8 +488,6 @@ ProxyServerBase<Interface, Impl>::~ProxyServerBase()
         });
     }
     assert(m_context.cleanup_fns.empty());
-    std::unique_lock<std::mutex> lock(m_context.connection->m_loop.m_mutex);
-    m_context.connection->m_loop.removeClient(lock);
 }
 
 //! If the capnp interface defined a special "destroy" method, as described the

include/mp/proxy.h

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ class EventLoopRef
 struct ProxyContext
 {
     Connection* connection;
-    EventLoop* loop;
+    EventLoopRef loop;
     CleanupList cleanup_fns;
 
     ProxyContext(Connection* connection);

src/mp/proxy.cpp

Lines changed: 23 additions & 17 deletions
@@ -65,7 +65,7 @@ bool EventLoopRef::reset()
     return done;
 }
 
-ProxyContext::ProxyContext(Connection* connection) : connection(connection), loop{&connection->m_loop} {}
+ProxyContext::ProxyContext(Connection* connection) : connection(connection), loop{*connection->m_loop} {}
 
 Connection::~Connection()
 {
@@ -122,18 +122,17 @@ Connection::~Connection()
         m_sync_cleanup_fns.pop_front();
     }
     while (!m_async_cleanup_fns.empty()) {
-        const std::unique_lock<std::mutex> lock(m_loop.m_mutex);
-        m_loop.m_async_fns.emplace_back(std::move(m_async_cleanup_fns.front()));
+        const std::unique_lock<std::mutex> lock(m_loop->m_mutex);
+        m_loop->m_async_fns.emplace_back(std::move(m_async_cleanup_fns.front()));
         m_async_cleanup_fns.pop_front();
     }
-    std::unique_lock<std::mutex> lock(m_loop.m_mutex);
-    m_loop.startAsyncThread(lock);
-    m_loop.removeClient(lock);
+    std::unique_lock<std::mutex> lock(m_loop->m_mutex);
+    m_loop->startAsyncThread(lock);
 }
 
 CleanupIt Connection::addSyncCleanup(std::function<void()> fn)
 {
-    const std::unique_lock<std::mutex> lock(m_loop.m_mutex);
+    const std::unique_lock<std::mutex> lock(m_loop->m_mutex);
     // Add cleanup callbacks to the front of list, so sync cleanup functions run
     // in LIFO order. This is a good approach because sync cleanup functions are
     // added as client objects are created, and it is natural to clean up
@@ -147,13 +146,13 @@ CleanupIt Connection::addSyncCleanup(std::function<void()> fn)
 
 void Connection::removeSyncCleanup(CleanupIt it)
 {
-    const std::unique_lock<std::mutex> lock(m_loop.m_mutex);
+    const std::unique_lock<std::mutex> lock(m_loop->m_mutex);
     m_sync_cleanup_fns.erase(it);
 }
 
 void Connection::addAsyncCleanup(std::function<void()> fn)
 {
-    const std::unique_lock<std::mutex> lock(m_loop.m_mutex);
+    const std::unique_lock<std::mutex> lock(m_loop->m_mutex);
     // Add async cleanup callbacks to the back of the list. Unlike the sync
     // cleanup list, this list order is more significant because it determines
     // the order server objects are destroyed when there is a sudden disconnect,
@@ -244,7 +243,7 @@ void EventLoop::post(const std::function<void()>& fn)
         return;
     }
     std::unique_lock<std::mutex> lock(m_mutex);
-    addClient(lock);
+    EventLoopRef ref(*this, &lock);
     m_cv.wait(lock, [this] { return m_post_fn == nullptr; });
     m_post_fn = &fn;
     int post_fd{m_post_fd};
@@ -253,20 +252,22 @@ void EventLoop::post(const std::function<void()>& fn)
         KJ_SYSCALL(write(post_fd, &buffer, 1));
     });
     m_cv.wait(lock, [this, &fn] { return m_post_fn != &fn; });
-    removeClient(lock);
 }
 
 void EventLoop::addClient(std::unique_lock<std::mutex>& lock) { m_num_clients += 1; }
 
 bool EventLoop::removeClient(std::unique_lock<std::mutex>& lock)
 {
+    assert(m_num_clients > 0);
     m_num_clients -= 1;
     if (done(lock)) {
         m_cv.notify_all();
         int post_fd{m_post_fd};
         lock.unlock();
         char buffer = 0;
         KJ_SYSCALL(write(post_fd, &buffer, 1)); // NOLINT(bugprone-suspicious-semicolon)
+        // Do not try to relock `lock` after writing, because the event loop
+        // could wake up and destroy itself and the mutex might no longer exist.
         return true;
     }
     return false;
@@ -275,20 +276,25 @@ bool EventLoop::removeClient(std::unique_lock<std::mutex>& lock)
 void EventLoop::startAsyncThread(std::unique_lock<std::mutex>& lock)
 {
     if (m_async_thread.joinable()) {
+        // Notify to wake up the async thread if it is already running.
         m_cv.notify_all();
     } else if (!m_async_fns.empty()) {
         m_async_thread = std::thread([this] {
             std::unique_lock<std::mutex> lock(m_mutex);
-            while (true) {
+            while (!done(lock)) {
                 if (!m_async_fns.empty()) {
-                    addClient(lock);
+                    EventLoopRef ref{*this, &lock};
                     const std::function<void()> fn = std::move(m_async_fns.front());
                     m_async_fns.pop_front();
                     Unlock(lock, fn);
-                    if (removeClient(lock)) break;
+                    // Reset ref and break if that returns true instead of
+                    // passively letting ref go out of scope. This is important
+                    // because the ref destructor would leave m_mutex unlocked
+                    // when done() returns true, causing undefined behavior if
+                    // the loop continued to execute.
+                    if (ref.reset()) break;
+                    // Continue without waiting in case there are more async_fns
                     continue;
-                } else if (m_num_clients == 0) {
-                    break;
                 }
                 m_cv.wait(lock);
             }
@@ -394,7 +400,7 @@ kj::Promise<void> ProxyServer<ThreadMap>::makeThread(MakeThreadContext context)
     const std::string from = context.getParams().getName();
     std::promise<ThreadContext*> thread_context;
     std::thread thread([&thread_context, from, this]() {
-        g_thread_context.thread_name = ThreadName(m_connection.m_loop.m_exe_name) + " (from " + from + ")";
+        g_thread_context.thread_name = ThreadName(m_connection.m_loop->m_exe_name) + " (from " + from + ")";
         g_thread_context.waiter = std::make_unique<Waiter>();
         thread_context.set_value(&g_thread_context);
         std::unique_lock<std::mutex> lock(g_thread_context.waiter->m_mutex);
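
As a usage note, the call-site change in EventLoop::post() above reduces to the following before/after comparison. This is a standalone, hypothetical sketch that reuses the simplified EventLoop/EventLoopRef types from the sketch near the top of this page; the real post() also coordinates with the event loop through m_cv and m_post_fd, which is elided here.

#include <functional>

// Before: manual pairing. If fn() throws or the function returns early,
// removeClient() is never called and the client count stays raised forever.
void post_manual(EventLoop& loop, const std::function<void()>& fn)
{
    std::unique_lock<std::mutex> lock(loop.m_mutex);
    loop.addClient(lock);
    fn();
    loop.removeClient(lock);
}

// After: RAII. `ref` releases the count when it goes out of scope, on the
// normal path and on the exceptional path alike.
void post_raii(EventLoop& loop, const std::function<void()>& fn)
{
    std::unique_lock<std::mutex> lock(loop.m_mutex);
    EventLoopRef ref(loop, &lock);
    fn();
}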
