Skip to content

Commit 305343b

Browse files
SBonesmeta-codesync[bot]
authored andcommitted
fix EdenMount channel_ race
Summary: In this diff, change EdenMount `channel_` access to better handle concurrency and shutdown. When `waitForPendingWrites` was moved into `FsChannel` in 2023, we went from a single-thread FUSE dispatcher `FuseChannel` to a multiple thrift handlers trying to access `channel_`. This created the potential for a race condition where `channel_` could be destroyed by the time we try to call it - for example: ``` if (channel_) { return channel_->waitForPendingWrites(); ``` Explicitly handle concurrency by changing from `FsChannelPtr` (unique_ptr) to `AtomicReadMostlyMainPtr<FsChannel>` This allows a safer pattern to wait for writes: ``` auto ch = channel_.load(); if (ch) { return ch->waitForPendingWrites(); } ``` This works better in the active shutdown case because we can stop channel loads by setting a nullptr `channel_.store(std::shared_ptr<FsChannel>());` in `EdenMount::destroy()` Reviewed By: muirdm Differential Revision: D95870053 fbshipit-source-id: 2e6daeca15b21ce6acb0112aa33c88b9aadbbf24
1 parent 81b461b commit 305343b

File tree

4 files changed

+75
-50
lines changed

4 files changed

+75
-50
lines changed

eden/fs/inodes/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ cpp_library(
252252
"//folly:stop_watch",
253253
"//folly:synchronized",
254254
"//folly:thread_local",
255+
"//folly/concurrency/memory:atomic_read_mostly_main_ptr",
255256
"//folly/concurrency/memory:read_mostly_shared_ptr",
256257
"//folly/container:evicting_cache_map",
257258
"//folly/coro/safe:now_task",

eden/fs/inodes/EdenMount.cpp

Lines changed: 65 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -966,6 +966,9 @@ ImmediateFuture<SetPathObjectIdResultAndTimes> EdenMount::setPathsToObjectIds(
966966

967967
void EdenMount::destroy() {
968968
auto oldState = state_.exchange(State::DESTROYING, std::memory_order_acq_rel);
969+
// Clear channel_ so concurrent readers see null and bail out.
970+
// Existing ReadMostlySharedPtr holders keep the channel alive.
971+
channel_.store(std::shared_ptr<FsChannel>());
969972
XLOGF(DBG4, "attempting to destroy EdenMount {}", getPath());
970973
switch (oldState) {
971974
case State::UNINITIALIZED:
@@ -1071,12 +1074,15 @@ folly::SemiFuture<folly::Unit> EdenMount::unmount(UnmountOptions options) {
10711074
mountingUnmountingState->fsChannelMountPromise->getFuture();
10721075
mountingUnmountingState.unlock();
10731076

1077+
auto channelHolder =
1078+
std::make_shared<folly::ReadMostlySharedPtr<FsChannel>>();
10741079
return std::move(mountFuture)
1075-
.thenTry([this, options](Try<Unit>&& mountResult) {
1080+
.thenTry([this, options, channelHolder](Try<Unit>&& mountResult) {
10761081
if (mountResult.hasException()) {
10771082
return folly::makeSemiFuture();
10781083
}
1079-
if (!channel_) {
1084+
auto ch = channel_.load();
1085+
if (!ch) {
10801086
throw std::runtime_error(
10811087
"attempting to unmount() an EdenMount without an FsChannel");
10821088
}
@@ -1089,18 +1095,25 @@ folly::SemiFuture<folly::Unit> EdenMount::unmount(UnmountOptions options) {
10891095
// is in the process of starting? Or can we assume that
10901096
// mountResult.hasException() above covers that case?
10911097

1092-
return channel_->unmount(options);
1098+
// Keep channel alive until the unmount future completes.
1099+
*channelHolder = std::move(ch);
1100+
return (*channelHolder)->unmount(options);
10931101
})
1094-
.thenTry([this](Try<Unit>&& result) noexcept -> folly::Future<Unit> {
1095-
auto unmountState = mountingUnmountingState_.wlock();
1096-
XDCHECK(unmountState->fsChannelUnmountPromise.has_value());
1097-
folly::SharedPromise<folly::Unit>* unsafeUnmountPromise =
1098-
&*unmountState->fsChannelUnmountPromise;
1099-
unmountState.unlock();
1100-
1101-
unsafeUnmountPromise->setTry(Try<Unit>{result});
1102-
return folly::makeFuture<folly::Unit>(std::move(result));
1103-
});
1102+
.thenTry(
1103+
[this,
1104+
channelHolder](Try<Unit>&& result) noexcept -> folly::Future<Unit> {
1105+
// Release the channel reference now that unmount has completed.
1106+
channelHolder->reset();
1107+
1108+
auto unmountState = mountingUnmountingState_.wlock();
1109+
XDCHECK(unmountState->fsChannelUnmountPromise.has_value());
1110+
folly::SharedPromise<folly::Unit>* unsafeUnmountPromise =
1111+
&*unmountState->fsChannelUnmountPromise;
1112+
unmountState.unlock();
1113+
1114+
unsafeUnmountPromise->setTry(Try<Unit>{result});
1115+
return folly::makeFuture<folly::Unit>(std::move(result));
1116+
});
11041117
}
11051118

11061119
const shared_ptr<UnboundedQueueExecutor>& EdenMount::getServerThreadPool()
@@ -1141,31 +1154,40 @@ InodeMetadataTable* EdenMount::getInodeMetadataTable() const {
11411154
#endif
11421155

11431156
FsChannel* EdenMount::getFsChannel() const {
1144-
return channel_.get();
1157+
return channel_.load().get();
11451158
}
11461159

11471160
Nfsd3* FOLLY_NULLABLE EdenMount::getNfsdChannel() const {
1148-
return dynamic_cast<Nfsd3*>(channel_.get());
1161+
return dynamic_cast<Nfsd3*>(channel_.load().get());
11491162
}
11501163

11511164
FuseChannel* FOLLY_NULLABLE EdenMount::getFuseChannel() const {
11521165
#ifndef _WIN32
1153-
return dynamic_cast<FuseChannel*>(channel_.get());
1166+
return dynamic_cast<FuseChannel*>(channel_.load().get());
11541167
#else
11551168
return nullptr;
11561169
#endif
11571170
}
11581171

11591172
PrjfsChannel* FOLLY_NULLABLE EdenMount::getPrjfsChannel() const {
11601173
#ifdef _WIN32
1161-
return dynamic_cast<PrjfsChannel*>(channel_.get());
1174+
return dynamic_cast<PrjfsChannel*>(channel_.load().get());
11621175
#else
11631176
return nullptr;
11641177
#endif
11651178
}
11661179

1180+
void EdenMount::setChannel(FsChannelPtr channel) {
1181+
if (channel) {
1182+
channel_.store(
1183+
std::shared_ptr<FsChannel>(channel.release(), FsChannelDeleter{}));
1184+
} else {
1185+
channel_.store(std::shared_ptr<FsChannel>());
1186+
}
1187+
}
1188+
11671189
void EdenMount::setTestFsChannel(FsChannelPtr channel) {
1168-
channel_ = std::move(channel);
1190+
setChannel(std::move(channel));
11691191
}
11701192

11711193
bool EdenMount::isNfsdChannel() const {
@@ -1220,11 +1242,12 @@ EdenMount::ReadLocation EdenMount::getReadLocationForMaterializedFiles() const {
12201242
}
12211243

12221244
ProcessAccessLog& EdenMount::getProcessAccessLog() const {
1223-
if (!channel_) {
1245+
auto ch = channel_.load();
1246+
if (!ch) {
12241247
EDEN_BUG() << "cannot call getProcessAccessLog() before "
12251248
"EdenMount has started or unmounted";
12261249
}
1227-
return channel_->getProcessAccessLog();
1250+
return ch->getProcessAccessLog();
12281251
}
12291252

12301253
const AbsolutePath& EdenMount::getPath() const {
@@ -1411,20 +1434,21 @@ ImmediateFuture<folly::Unit> EdenMount::waitForPendingWrites() const {
14111434
return serverState_->getFaultInjector()
14121435
.checkAsync("waitForPendingWrites", "")
14131436
.thenValue([this](auto&&) -> ImmediateFuture<folly::Unit> {
1414-
// TODO: This is a race condition since channel_ can be destroyed
1415-
// concurrently. We need to change EdenMount to never unset channel_.
1416-
if (channel_) {
1417-
return channel_->waitForPendingWrites();
1437+
// Snapshot the channel pointer. The ReadMostlySharedPtr keeps the
1438+
// channel alive for the duration of the call, even if the mount
1439+
// is being torn down concurrently.
1440+
auto ch = channel_.load();
1441+
if (ch) {
1442+
return ch->waitForPendingWrites().ensure([ch] {});
14181443
}
14191444
return folly::unit;
14201445
});
14211446
}
14221447

14231448
folly::coro::now_task<folly::Unit> EdenMount::co_waitForPendingWrites() const {
1424-
// TODO: This is a race condition since channel_ can be destroyed
1425-
// concurrently. We need to change EdenMount to never unset channel_.
1426-
if (channel_) {
1427-
co_await channel_->waitForPendingWrites().semi();
1449+
auto ch = channel_.load();
1450+
if (ch) {
1451+
co_await ch->waitForPendingWrites().semi();
14281452
}
14291453
co_return folly::unit;
14301454
}
@@ -1872,12 +1896,9 @@ void EdenMount::forgetStaleInodes() {
18721896

18731897
ImmediateFuture<folly::Unit> EdenMount::flushInvalidations() {
18741898
XLOG(DBG4, "waiting for inode invalidations to complete");
1875-
// TODO: If it's possible for flushInvalidations() and unmount() to run
1876-
// concurrently, accessing the channel_ pointer here is racy. It's deallocated
1877-
// by unmount(). We need to either guarantee these functions can never run
1878-
// concurrently or use some sort of lock or atomic pointer.
1879-
if (auto* fsChannel = getFsChannel()) {
1880-
return fsChannel->completeInvalidations().thenValue([](folly::Unit) {
1899+
auto ch = channel_.load();
1900+
if (ch) {
1901+
return ch->completeInvalidations().thenValue([ch](folly::Unit) {
18811902
XLOG(DBG4, "finished processing inode invalidations");
18821903
});
18831904
} else {
@@ -2470,13 +2491,13 @@ folly::Future<folly::Unit> EdenMount::fsChannelMount(bool readOnly) {
24702491
}
24712492

24722493
mountPromise->setValue();
2473-
channel_ = std::move(channel_2);
2494+
setChannel(std::move(channel_2));
24742495
return makeFuture(folly::unit);
24752496
});
24762497
#else
24772498
(void)options;
24782499
mountPromise->setValue();
2479-
channel_ = std::move(channel);
2500+
setChannel(std::move(channel));
24802501
return folly::makeFutureWith([]() { NOT_IMPLEMENTED(); });
24812502
#endif
24822503
});
@@ -2511,7 +2532,7 @@ folly::Future<folly::Unit> EdenMount::fsChannelMount(bool readOnly) {
25112532
// need to handle the case where mount was cancelled.
25122533

25132534
mountPromise->setValue();
2514-
channel_ = std::move(channel).value();
2535+
setChannel(std::move(channel).value());
25152536
return makeFuture(folly::unit);
25162537
});
25172538
#else
@@ -2556,8 +2577,8 @@ folly::Future<folly::Unit> EdenMount::fsChannelMount(bool readOnly) {
25562577
}
25572578

25582579
mountPromise->setValue();
2559-
channel_ =
2560-
makeFuseChannel(this, std::move(fuseDevice).value());
2580+
setChannel(
2581+
makeFuseChannel(this, std::move(fuseDevice).value()));
25612582
return folly::makeFuture(folly::unit);
25622583
});
25632584
#endif
@@ -2596,11 +2617,12 @@ folly::Future<folly::Unit> EdenMount::startFsChannel(bool readOnly) {
25962617
return fsChannelMount(readOnly);
25972618
})
25982619
.thenValue([this](auto&&) -> folly::Future<folly::Unit> {
2599-
if (!channel_) {
2620+
auto ch = channel_.load();
2621+
if (!ch) {
26002622
return EDEN_BUG_FUTURE(folly::Unit)
26012623
<< "EdenMount::channel_ is not constructed";
26022624
}
2603-
return channel_->initialize().thenValue(
2625+
return ch->initialize().thenValue(
26042626
[this](FsChannel::StopFuture mountCompleteFuture) {
26052627
fsChannelInitSuccessful(std::move(mountCompleteFuture));
26062628
});
@@ -2682,7 +2704,7 @@ void EdenMount::takeoverFuse(FuseChannelData takeoverData) {
26822704
auto channel = makeFuseChannel(this, std::move(takeoverData.fd));
26832705
auto fuseCompleteFuture =
26842706
channel->initializeFromTakeover(takeoverData.connInfo);
2685-
channel_ = std::move(channel);
2707+
setChannel(std::move(channel));
26862708
fsChannelInitSuccessful(std::move(fuseCompleteFuture));
26872709
} catch (const std::exception&) {
26882710
transitionToFsChannelInitializationErrorState();
@@ -2705,7 +2727,7 @@ folly::Future<folly::Unit> EdenMount::takeoverNfs(NfsChannelData takeoverData) {
27052727
auto& channel = mountInfo.nfsd;
27062728

27072729
auto stopFuture = channel->getStopFuture();
2708-
this->channel_ = std::move(channel);
2730+
this->setChannel(std::move(channel));
27092731
this->fsChannelInitSuccessful(std::move(stopFuture));
27102732
})
27112733
.thenError([this](auto&& err) {

eden/fs/inodes/EdenMount.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <folly/SharedMutex.h>
1313
#include <folly/Synchronized.h>
1414
#include <folly/ThreadLocal.h>
15+
#include <folly/concurrency/memory/AtomicReadMostlyMainPtr.h>
1516
#include <folly/concurrency/memory/ReadMostlySharedPtr.h>
1617
#include <folly/coro/safe/NowTask.h>
1718
#include <folly/futures/Future.h>
@@ -1171,6 +1172,12 @@ class EdenMount : public std::enable_shared_from_this<EdenMount> {
11711172
friend class SharedRenameLock;
11721173
class JournalDiffCallback;
11731174

1175+
/**
1176+
* Convert a FsChannelPtr (unique_ptr with FsChannelDeleter) to a
1177+
* shared_ptr and store it in channel_.
1178+
*/
1179+
void setChannel(FsChannelPtr channel);
1180+
11741181
/**
11751182
* Attempt to transition from expected -> newState.
11761183
* If the current state is expected then the state is set to newState
@@ -1484,7 +1491,7 @@ class EdenMount : public std::enable_shared_from_this<EdenMount> {
14841491
std::shared_ptr<TraceBus<InodeTraceEvent>> inodeTraceBus_;
14851492
TraceSubscriptionHandle<InodeTraceEvent> inodeTraceHandle_;
14861493

1487-
FsChannelPtr channel_;
1494+
folly::AtomicReadMostlyMainPtr<FsChannel> channel_;
14881495

14891496
/**
14901497
* The clock. This is also available as serverState_->getClock().

eden/fs/inodes/test/EdenMountTest.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,10 +1202,6 @@ TEST(EdenMount, waitForPendingWritesDuringDestroy) {
12021202
rootInode.reset();
12031203
mount.reset();
12041204

1205-
// After destroy, bgThread is still paused at the fault before channel_
1206-
// access. Check whether destroy() nulled channel_.
1207-
// Pre-fix: channel_ (unique_ptr) is NOT nulled by destroy().
1208-
// Post-fix (D95870053): destroy() stores nullptr in channel_.
12091205
bool channelNullAfterDestroy = (rawMount->getFsChannel() == nullptr);
12101206

12111207
faultInjector.removeFault("waitForPendingWrites", ".*");
@@ -1214,8 +1210,7 @@ TEST(EdenMount, waitForPendingWritesDuringDestroy) {
12141210
bgThread.join();
12151211
shutdownBlocker.allowShutdownToComplete();
12161212

1217-
// FIXME(D95870053): After fix, change to EXPECT_TRUE.
1218-
EXPECT_FALSE(channelNullAfterDestroy);
1213+
EXPECT_TRUE(channelNullAfterDestroy);
12191214
}
12201215

12211216
namespace {

0 commit comments

Comments
 (0)