Skip to content

Commit e04d9a9

Browse files
[CAS] Add a validation method for the entire CAS
Add a validation method for the OnDiskCAS that checks the integrity of the persistent CAS. It currently supports two modes, with and without `CheckHash`. Without `CheckHash`, it is a shallow validation which includes: * Validate that the trie data structure is sound, i.e. it is possible to walk the entire trie without hitting an error. * Validate that each record is valid and the stored data is accessible. With `CheckHash`, it also rehashes all the stored data and makes sure the hashes match. The check is done in parallel and should be relatively fast. (cherry picked from commit fac26c1)
1 parent 1afd314 commit e04d9a9

17 files changed

+385
-30
lines changed

llvm/include/llvm/CAS/ObjectStore.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,10 @@ class ObjectStore {
156156
virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0;
157157

158158
/// Validate the underlying object referred by CASID.
159-
virtual Error validate(const CASID &ID) = 0;
159+
virtual Error validateObject(const CASID &ID) = 0;
160+
161+
/// Validate the entire ObjectStore.
162+
virtual Error validate(bool CheckHash) const = 0;
160163

161164
protected:
162165
/// Load the object referenced by \p Ref.

llvm/include/llvm/CAS/OnDiskGraphDB.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,11 @@ class OnDiskGraphDB {
309309

310310
void print(raw_ostream &OS) const;
311311

312+
/// Hashing function type for validation.
313+
using HashingFuncT = function_ref<void(
314+
ArrayRef<ArrayRef<uint8_t>>, ArrayRef<char>, SmallVectorImpl<uint8_t> &)>;
315+
Error validate(bool Deep, HashingFuncT Hasher) const;
316+
312317
/// How to fault-in nodes if an upstream database is used.
313318
enum class FaultInPolicy {
314319
/// Copy only the requested node.

llvm/include/llvm/CAS/OnDiskHashMappedTrie.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ class OnDiskHashMappedTrie {
9797
MutableArrayRef<char> Data;
9898
};
9999

100+
/// Validate the trie data structure.
101+
///
102+
/// Callback receives the file offset to the data entry and the data stored.
103+
Error validate(
104+
function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const;
105+
100106
public:
101107
template <class ProxyT> class PointerImpl {
102108
public:

llvm/lib/CAS/BuiltinCAS.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs,
7171
Refs, Data);
7272
}
7373

74-
Error BuiltinCAS::validate(const CASID &ID) {
74+
Error BuiltinCAS::validateObject(const CASID &ID) {
7575
auto Ref = getReference(ID);
7676
if (!Ref)
7777
return createUnknownObjectError(ID);

llvm/lib/CAS/BuiltinCAS.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class BuiltinCAS : public ObjectStore {
7070
"corrupt storage");
7171
}
7272

73-
Error validate(const CASID &ID) final;
73+
Error validateObject(const CASID &ID) final;
7474
};
7575

7676
/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing.

llvm/lib/CAS/InMemoryCAS.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,10 @@ class InMemoryCAS : public BuiltinCAS {
228228

229229
void print(raw_ostream &OS) const final;
230230

231+
/// Whole-store validation is not implemented for the in-memory CAS: this
/// override ignores \p CheckHash and always returns an error.
Error validate(bool CheckHash) const final {
  constexpr const char *Unsupported = "InMemoryCAS doesn't support validate()";
  return createStringError(Unsupported);
}
234+
231235
InMemoryCAS() = default;
232236

233237
private:

llvm/lib/CAS/ObjectStore.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ Error ObjectStore::validateTree(ObjectRef Root) {
203203
auto [I, Inserted] = ValidatedRefs.insert(Ref);
204204
if (!Inserted)
205205
continue; // already validated.
206-
if (Error E = validate(getID(Ref)))
206+
if (Error E = validateObject(getID(Ref)))
207207
return E;
208208
Expected<ObjectHandle> Obj = load(Ref);
209209
if (!Obj)

llvm/lib/CAS/OnDiskCAS.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "BuiltinCAS.h"
10+
#include "llvm/CAS/BuiltinCASContext.h"
11+
#include "llvm/CAS/BuiltinObjectHasher.h"
1012
#include "llvm/CAS/OnDiskGraphDB.h"
1113
#include "llvm/CAS/UnifiedOnDiskCache.h"
1214
#include "llvm/Support/Compiler.h"
@@ -35,6 +37,7 @@ class OnDiskCAS : public BuiltinCAS {
3537
ArrayRef<char> getDataConst(ObjectHandle Node) const final;
3638

3739
void print(raw_ostream &OS) const final;
40+
Error validate(bool CheckHash) const final;
3841

3942
static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path);
4043

@@ -84,6 +87,15 @@ class OnDiskCAS : public BuiltinCAS {
8487
} // end anonymous namespace
8588

8689
void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); }
90+
/// Validate the on-disk store by delegating to the underlying graph database.
///
/// \p CheckHash is forwarded to the database together with a callback that
/// hashes an object (its reference digests plus its data) using the builtin
/// object hasher and writes the digest into the output buffer.
Error OnDiskCAS::validate(bool CheckHash) const {
  using ObjectHasher = BuiltinObjectHasher<llvm::cas::builtin::HasherT>;

  auto HashObject = [](ArrayRef<ArrayRef<uint8_t>> ObjectRefs,
                       ArrayRef<char> ObjectData,
                       SmallVectorImpl<uint8_t> &Digest) {
    auto Computed = ObjectHasher::hashObject(ObjectRefs, ObjectData);
    Digest.assign(Computed.begin(), Computed.end());
  };

  return DB->validate(CheckHash, HashObject);
}
8799

88100
CASID OnDiskCAS::getID(ObjectRef Ref) const {
89101
ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref));

llvm/lib/CAS/OnDiskGraphDB.cpp

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,14 @@
5151
#include "OnDiskCommon.h"
5252
#include "llvm/ADT/DenseMap.h"
5353
#include "llvm/ADT/StringExtras.h"
54+
#include "llvm/CAS/OnDiskHashMappedTrie.h"
5455
#include "llvm/Support/Alignment.h"
5556
#include "llvm/Support/Compiler.h"
5657
#include "llvm/Support/Errc.h"
5758
#include "llvm/Support/Error.h"
59+
#include "llvm/Support/ErrorHandling.h"
60+
#include "llvm/Support/FileSystem.h"
61+
#include "llvm/Support/Format.h"
5862
#include "llvm/Support/MemoryBuffer.h"
5963
#include "llvm/Support/Path.h"
6064
#include "llvm/Support/Process.h"
@@ -869,6 +873,129 @@ int64_t DataRecordHandle::getDataRelOffset() const {
869873
return RelOffset;
870874
}
871875

876+
/// Validate the records stored in the on-disk index.
///
/// The index trie is walked through \c Index.validate and the callback below
/// is run for each stored record. For every record it checks that:
///  - the record has the size of a \c TrieRecord and its storage is aligned
///    for one;
///  - the storage kind is one of the known values;
///  - a data pool record has room for a record header inside the data pool,
///    and a standalone record's backing file exists.
///
/// \param Deep when true, additionally load the stored object (referenced
/// digests and data), recompute its hash with \p Hasher and compare it with
/// the digest stored in the index.
/// \param Hasher called with the digests of the referenced objects, the
/// object data, and the buffer that receives the computed hash.
/// \returns the result of the trie walk; errors created by this callback use
/// \c errc::illegal_byte_sequence and name the offending record or digest.
Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const {
  return Index.validate([&](FileOffset Offset,
                            OnDiskHashMappedTrie::ConstValueProxy Record)
                            -> Error {
    // Report a problem with the index record itself, identified by its file
    // offset. The offset is 64-bit, so do not narrow it before formatting.
    auto formatError = [&](Twine Msg) {
      return createStringError(
          llvm::errc::illegal_byte_sequence,
          "bad record at 0x" +
              utohexstr(static_cast<uint64_t>(Offset.get()),
                        /*LowerCase=*/true) +
              ": " + Msg.str());
    };

    if (Record.Data.size() != sizeof(TrieRecord))
      return formatError("wrong data record size");
    // The record is reinterpreted as a TrieRecord below, so it is the address
    // of the storage that has to be aligned (the size was checked above).
    if (!isAddrAligned(Align::Of<TrieRecord>(), Record.Data.data()))
      return formatError("wrong data record alignment");

    auto *R = reinterpret_cast<const TrieRecord *>(Record.Data.data());
    TrieRecord::Data D = R->load();
    std::unique_ptr<MemoryBuffer> FileBuffer;
    // Reject any storage kind value that is not one of the known enumerators
    // before switching on it.
    if ((uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Unknown &&
        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::DataPool &&
        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Standalone &&
        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf &&
        (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf0)
      return formatError("invalid record kind value");

    auto Ref = InternalRef::getFromOffset(Offset);
    auto I = getIndexProxyFromRef(Ref);

    // Shallow checks: make sure the stored data is reachable.
    switch (D.SK) {
    case TrieRecord::StorageKind::Unknown:
      // This could be an abandoned entry due to a termination before updating
      // the record. It can be reused by later insertion so just skip this entry
      // for now.
      return Error::success();
    case TrieRecord::StorageKind::DataPool:
      // Check offset is a positive value, and large enough to hold the
      // header for the data record.
      if (D.Offset.get() <= 0 ||
          (uint64_t)D.Offset.get() + sizeof(DataRecordHandle::Header) >=
              DataPool.size())
        return formatError("datapool record out of bound");
      break;
    case TrieRecord::StorageKind::Standalone:
    case TrieRecord::StorageKind::StandaloneLeaf:
    case TrieRecord::StorageKind::StandaloneLeaf0: {
      SmallString<256> Path;
      getStandalonePath(TrieRecord::getStandaloneFileSuffix(D.SK), I, Path);
      // If need to validate the content of the file later, just load the
      // buffer here. Otherwise, just check the existence of the file.
      if (Deep) {
        auto File = MemoryBuffer::getFile(Path, /*IsText=*/false,
                                          /*RequiresNullTerminator=*/false);
        if (!File || !*File)
          return formatError("record file \'" + Path + "\' does not exist");

        FileBuffer = std::move(*File);
      } else if (!llvm::sys::fs::exists(Path))
        return formatError("record file \'" + Path + "\' does not exist");
      break;
    }
    }

    if (!Deep)
      return Error::success();

    // Report a problem with the stored object, identified by its digest.
    auto dataError = [&](Twine Msg) {
      return createStringError(llvm::errc::illegal_byte_sequence,
                               "bad data for digest \'" + toHex(I.Hash) +
                                   "\': " + Msg.str());
    };
    SmallVector<ArrayRef<uint8_t>> Refs;
    ArrayRef<char> StoredData;

    // Deep checks: collect the referenced digests and the data of the object
    // so that its hash can be recomputed.
    switch (D.SK) {
    case TrieRecord::StorageKind::Unknown:
      llvm_unreachable("already handled");
    case TrieRecord::StorageKind::DataPool: {
      auto DataRecord = DataRecordHandle::get(DataPool.beginData(D.Offset));
      if (DataRecord.getTotalSize() + D.Offset.get() >= DataPool.size())
        return dataError("data record span passed the end of the data pool");
      for (auto InternRef : DataRecord.getRefs()) {
        auto Index = getIndexProxyFromRef(InternRef);
        Refs.push_back(Index.Hash);
      }
      StoredData = DataRecord.getData();
      break;
    }
    case TrieRecord::StorageKind::Standalone: {
      if (FileBuffer->getBufferSize() < sizeof(DataRecordHandle::Header))
        return dataError("data record is not big enough to read the header");
      auto DataRecord = DataRecordHandle::get(FileBuffer->getBufferStart());
      // The record must fit inside the file: a record claiming to be larger
      // than the buffer would make the reads below run past its end.
      if (DataRecord.getTotalSize() > FileBuffer->getBufferSize())
        return dataError(
            "data record span passed the end of the standalone file");
      for (auto InternRef : DataRecord.getRefs()) {
        auto Index = getIndexProxyFromRef(InternRef);
        Refs.push_back(Index.Hash);
      }
      StoredData = DataRecord.getData();
      break;
    }
    case TrieRecord::StorageKind::StandaloneLeaf:
    case TrieRecord::StorageKind::StandaloneLeaf0: {
      // Leaf files hold only the object data (no references).
      StoredData = arrayRefFromStringRef<char>(FileBuffer->getBuffer());
      if (D.SK == TrieRecord::StorageKind::StandaloneLeaf0) {
        // The trailing zero terminator is not part of the object data.
        if (!FileBuffer->getBuffer().ends_with('\0'))
          return dataError("standalone file is not zero terminated");
        StoredData = StoredData.drop_back(1);
      }
      break;
    }
    }

    SmallVector<uint8_t> ComputedHash;
    Hasher(Refs, StoredData, ComputedHash);
    if (I.Hash != ArrayRef(ComputedHash))
      return dataError("hash mismatch, got \'" + toHex(ComputedHash) +
                       "\' instead");

    return Error::success();
  });
}
998+
872999
void OnDiskGraphDB::print(raw_ostream &OS) const {
8731000
OS << "on-disk-root-path: " << RootPath << "\n";
8741001

0 commit comments

Comments
 (0)