Skip to content

Commit 4f88b35

Browse files
committed
[CAS/LazyMappedFileRegion] Fix racing issue during the initial creation of a database file
There's a racing issue when multiple processes try to create a new database file on the same directory. Due to insufficient synchronization one process may try to use a newly created database file that another process haven't finished initializing yet. This can manifest with errors like "database: bad magic" or crashes. rdar://104829275 (cherry picked from commit 4792257)
1 parent 863d61d commit 4f88b35

File tree

6 files changed

+91
-10
lines changed

6 files changed

+91
-10
lines changed

llvm/include/llvm/Support/FileSystem.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1200,7 +1200,10 @@ std::error_code getRealPathFromHandle(file_t Handle,
12001200
/// Lock the file.
12011201
///
12021202
/// This function acts as @ref tryLockFile but it waits infinitely.
1203-
std::error_code lockFile(int FD);
1203+
/// \param FD file descriptor to use for locking.
1204+
/// \param Exclusive if \p true use exclusive/writer lock, otherwise use
1205+
/// shared/reader lock.
1206+
std::error_code lockFile(int FD, bool Exclusive = true);
12041207

12051208
/// Unlock the file.
12061209
///

llvm/lib/CAS/LazyMappedFileRegion.cpp

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,59 @@ Expected<LazyMappedFileRegion> LazyMappedFileRegion::create(
242242

243243
sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
244244

245+
struct FileLockRAII {
246+
LazyMappedFileRegion &LMFR;
247+
bool IsLocked = false;
248+
249+
enum LockKind { Shared, Exclusive };
250+
251+
FileLockRAII(LazyMappedFileRegion &LMFR) : LMFR(LMFR) {}
252+
~FileLockRAII() { consumeError(unlock()); }
253+
254+
Error lock(LockKind LK) {
255+
if (IsLocked)
256+
return createStringError(inconvertibleErrorCode(),
257+
LMFR.Path + " already locked");
258+
if (std::error_code EC = sys::fs::lockFile(*LMFR.FD, LK == Exclusive))
259+
return createFileError(LMFR.Path, EC);
260+
IsLocked = true;
261+
return Error::success();
262+
}
263+
264+
Error unlock() {
265+
if (IsLocked) {
266+
IsLocked = false;
267+
if (std::error_code EC = sys::fs::unlockFile(*LMFR.FD))
268+
return createFileError(LMFR.Path, EC);
269+
}
270+
return Error::success();
271+
}
272+
273+
} FileLock(LMFR);
274+
275+
// Use shared/reader locking in case another process is in the process of
276+
// initializing the file.
277+
if (Error E = FileLock.lock(FileLockRAII::Shared))
278+
return std::move(E);
279+
245280
sys::fs::file_status Status;
246281
if (std::error_code EC = sys::fs::status(File, Status))
247282
return errorCodeToError(EC);
283+
284+
if (Status.getSize() == 0) {
285+
// Lock the file exclusively so only one process will do the initialization.
286+
if (Error E = FileLock.unlock())
287+
return std::move(E);
288+
if (Error E = FileLock.lock(FileLockRAII::Exclusive))
289+
return std::move(E);
290+
if (std::error_code EC = sys::fs::status(File, Status))
291+
return errorCodeToError(EC);
292+
}
293+
294+
// At this point either the file is still empty (this process won the race to
295+
// do the initialization) or we have the size for the completely initialized
296+
// file.
297+
248298
if (Status.getSize() > 0)
249299
// The file was already constructed.
250300
LMFR.CachedSize = Status.getSize();
@@ -263,11 +313,6 @@ Expected<LazyMappedFileRegion> LazyMappedFileRegion::create(
263313
if (!LMFR.IsConstructingNewFile)
264314
return std::move(LMFR);
265315

266-
// Lock the file so we can initialize it.
267-
if (std::error_code EC = sys::fs::lockFile(*LMFR.FD))
268-
return createFileError(Path, EC);
269-
auto Unlock = make_scope_exit([FD = *LMFR.FD]() { sys::fs::unlockFile(FD); });
270-
271316
// This is a new file. Resize to NewFileSize and run the constructor.
272317
if (Error E = NewFileConstructor(LMFR))
273318
return std::move(E);

llvm/lib/Support/Unix/Path.inc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,10 +1265,10 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
12651265
return make_error_code(errc::no_lock_available);
12661266
}
12671267

1268-
std::error_code lockFile(int FD) {
1268+
std::error_code lockFile(int FD, bool Exclusive) {
12691269
struct flock Lock;
12701270
memset(&Lock, 0, sizeof(Lock));
1271-
Lock.l_type = F_WRLCK;
1271+
Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK;
12721272
Lock.l_whence = SEEK_SET;
12731273
Lock.l_start = 0;
12741274
Lock.l_len = 0;

llvm/lib/Support/Windows/Path.inc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1312,8 +1312,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
13121312
return mapWindowsError(ERROR_LOCK_VIOLATION);
13131313
}
13141314

1315-
std::error_code lockFile(int FD) {
1316-
DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK;
1315+
std::error_code lockFile(int FD, bool Exclusive) {
1316+
DWORD Flags = Exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0;
13171317
OVERLAPPED OV = {};
13181318
file_t File = convertFDToNativeFile(FD);
13191319
if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV))
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/sh
2+
3+
# parameters are: <llvm-cas path> <CAS path> <file to ingest> <number of repeats>
4+
LLVM_CAS_TOOL=$1
5+
CAS_PATH=$2
6+
INGEST_FILE=$3
7+
NUM_REPEAT=$4
8+
9+
set -e
10+
11+
for c in $(seq 1 $NUM_REPEAT); do
12+
rm -rf $CAS_PATH
13+
14+
pids=""
15+
for x in $(seq 1 10); do
16+
$LLVM_CAS_TOOL --ingest --data $INGEST_FILE --cas $CAS_PATH &
17+
pids="$pids $!"
18+
done
19+
20+
for pid in $pids; do
21+
wait "$pid"
22+
done
23+
done
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// REQUIRES: ondisk_cas, shell
2+
3+
// RUN: mkdir -p %t
4+
5+
// This uses a script that triggers parallel `llvm-cas` invocations on an empty directory.
6+
// It's intended to ensure there are no racing issues at creation time that will create failures.
7+
// The last parameter controls how many times it will try.
8+
// To stress-test this extensively change it locally and pass a large number.
9+
10+
// RUN: %S/Inputs/cas-creation-stress-test.sh llvm-cas %t/cas %s 5

0 commit comments

Comments
 (0)