-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[CAS] Add UnifiedOnDiskCache and OnDiskCAS #114103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
cachemeifyoucan
merged 5 commits into
main
from
users/cachemeifyoucan/spr/cas-add-ondiskcas
Nov 3, 2025
Merged
Changes from 2 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H | ||
| #define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H | ||
|
|
||
| #include "llvm/Support/Error.h" | ||
|
|
||
| namespace llvm::cas { | ||
|
|
||
| class ActionCache; | ||
| class ObjectStore; | ||
|
|
||
| /// Create on-disk \c ObjectStore and \c ActionCache instances based on | ||
| /// \c ondisk::UnifiedOnDiskCache, with built-in hashing. | ||
| Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> | ||
| createOnDiskUnifiedCASDatabases(StringRef Path); | ||
|
|
||
| /// Represents the result of validating the contents using | ||
| /// \c validateOnDiskUnifiedCASDatabasesIfNeeded. | ||
| /// | ||
| /// Note: invalid results are handled as an \c Error. | ||
| enum class ValidationResult { | ||
| /// The data is already valid. | ||
| Valid, | ||
| /// The data was invalid, but was recovered. | ||
| Recovered, | ||
| /// Validation was skipped, as it was not needed. | ||
| Skipped, | ||
| }; | ||
|
|
||
| /// Validate the data in \p Path, if needed to ensure correctness. | ||
| /// | ||
| /// \param Path directory for the on-disk database. | ||
| /// \param CheckHash Whether to validate hashes match the data. | ||
| /// \param AllowRecovery Whether to automatically recover from invalid data by | ||
| /// marking the files for garbage collection. | ||
| /// \param ForceValidation Whether to force validation to occur even if it | ||
| /// should not be necessary. | ||
| /// \param LLVMCasBinary If provided, validation is performed out-of-process | ||
| /// using the given \c llvm-cas executable which protects against crashes | ||
| /// during validation. Otherwise validation is performed in-process. | ||
| /// | ||
| /// \returns \c Valid if the data is already valid, \c Recovered if data | ||
| /// was invalid but has been cleared, \c Skipped if validation is not needed, | ||
| /// or an \c Error if validation cannot be performed or if the data is left | ||
| /// in an invalid state because \p AllowRecovery is false. | ||
| Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded( | ||
| StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, | ||
| std::optional<StringRef> LLVMCasBinary); | ||
|
|
||
| } // namespace llvm::cas | ||
|
|
||
| #endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,191 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H | ||
| #define LLVM_CAS_UNIFIEDONDISKCACHE_H | ||
|
|
||
| #include "llvm/CAS/BuiltinUnifiedCASDatabases.h" | ||
| #include "llvm/CAS/OnDiskGraphDB.h" | ||
| #include <atomic> | ||
|
|
||
| namespace llvm::cas::ondisk { | ||
|
|
||
| class OnDiskKeyValueDB; | ||
|
|
||
| /// A unified CAS node and key-value database, using on-disk storage for both. | ||
| /// It manages storage growth and provides APIs for garbage collection. | ||
| /// | ||
| /// High-level properties: | ||
| /// * While \p UnifiedOnDiskCache is open on a directory, by any process, the | ||
| /// storage size in that directory will keep growing unrestricted. For data to | ||
| /// become eligible for garbage-collection there should be no open instances | ||
| /// of \p UnifiedOnDiskCache for that directory, by any process. | ||
| /// * Garbage-collection needs to be triggered explicitly by the client. It can | ||
| /// be triggered on a directory concurrently, at any time and by any process, | ||
| /// without affecting any active readers/writers, in the same process or other | ||
| /// processes. | ||
| /// | ||
| /// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open | ||
| /// for a limited period of time, e.g. for the duration of a build operation. | ||
| /// For long-living processes that need periodic access to a | ||
| /// \p UnifiedOnDiskCache, the client should devise a scheme where access is | ||
cachemeifyoucan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| /// performed within some defined period. For example, if a service is designed | ||
| /// to continuously wait for requests that access a \p UnifiedOnDiskCache, it | ||
| /// could keep the instance alive while new requests are coming in but close it | ||
| /// after a time period in which there are no new requests. | ||
| class UnifiedOnDiskCache { | ||
| public: | ||
| /// The \p OnDiskGraphDB instance for the open directory. | ||
| OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; } | ||
|
|
||
| /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. | ||
| /// | ||
| /// \param Key the hash bytes for the key. | ||
| /// \param Value the \p ObjectID value. | ||
| /// | ||
| /// \returns the \p ObjectID associated with the \p Key. It may be different | ||
| /// than \p Value if another value was already associated with this key. | ||
| Expected<ObjectID> KVPut(ArrayRef<uint8_t> Key, ObjectID Value); | ||
|
|
||
| /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. | ||
| /// An \p ObjectID as a key is equivalent to its digest bytes. | ||
| /// | ||
| /// \param Key the \p ObjectID for the key. | ||
| /// \param Value the \p ObjectID value. | ||
| /// | ||
| /// \returns the \p ObjectID associated with the \p Key. It may be different | ||
| /// than \p Value if another value was already associated with this key. | ||
| Expected<ObjectID> KVPut(ObjectID Key, ObjectID Value); | ||
|
|
||
| /// \returns the \p ObjectID, of the \p OnDiskGraphDB instance, associated | ||
| /// with the \p Key, or \p std::nullopt if the key does not exist. | ||
| Expected<std::optional<ObjectID>> KVGet(ArrayRef<uint8_t> Key); | ||
|
|
||
| /// Open a \p UnifiedOnDiskCache instance for a directory. | ||
| /// | ||
| /// \param Path directory for the on-disk database. The directory will be | ||
| /// created if it doesn't exist. | ||
| /// \param SizeLimit Optional size for limiting growth. This takes effect | ||
| /// when the instance is closed. | ||
| /// \param HashName Identifier name for the hashing algorithm that is going to | ||
| /// be used. | ||
| /// \param HashByteSize Size for the object digest hash bytes. | ||
| /// \param FaultInPolicy Controls how nodes are copied to primary store. This | ||
| /// is recorded at creation time and subsequent opens need to pass the same | ||
| /// policy, otherwise \p open will fail. | ||
| static Expected<std::unique_ptr<UnifiedOnDiskCache>> | ||
| open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName, | ||
| unsigned HashByteSize, | ||
| OnDiskGraphDB::FaultInPolicy FaultInPolicy = | ||
| OnDiskGraphDB::FaultInPolicy::FullTree); | ||
|
|
||
| /// Validate the data in \p Path, if needed to ensure correctness. | ||
| /// | ||
| /// Note: if invalid data is detected and \p AllowRecovery is true, then | ||
| /// recovery requires exclusive access to the CAS and it is an error to | ||
| /// attempt recovery if there is concurrent use of the CAS. | ||
| /// | ||
| /// \param Path directory for the on-disk database. | ||
| /// \param HashName Identifier name for the hashing algorithm that is going to | ||
| /// be used. | ||
| /// \param HashByteSize Size for the object digest hash bytes. | ||
| /// \param CheckHash Whether to validate hashes match the data. | ||
| /// \param AllowRecovery Whether to automatically recover from invalid data by | ||
| /// marking the files for garbage collection. | ||
| /// \param ForceValidation Whether to force validation to occur even if it | ||
| /// should not be necessary. | ||
| /// \param LLVMCasBinary If provided, validation is performed out-of-process | ||
| /// using the given \c llvm-cas executable which protects against crashes | ||
| /// during validation. Otherwise validation is performed in-process. | ||
| /// | ||
| /// \returns \c Valid if the data is already valid, \c Recovered if data | ||
| /// was invalid but has been cleared, \c Skipped if validation is not needed, | ||
| /// or an \c Error if validation cannot be performed or if the data is left | ||
| /// in an invalid state because \p AllowRecovery is false. | ||
| static Expected<ValidationResult> | ||
| validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize, | ||
| bool CheckHash, bool AllowRecovery, bool ForceValidation, | ||
| std::optional<StringRef> LLVMCasBinary); | ||
|
|
||
| /// This is called implicitly at destruction time, so it is not required for a | ||
| /// client to call this. After calling \p close the only method that is valid | ||
| /// to call is \p needsGarbageCollection. | ||
| /// | ||
| /// \param CheckSizeLimit if true it will check whether the primary store has | ||
| /// exceeded its intended size limit. If false the check is skipped even if a | ||
| /// \p SizeLimit was passed to the \p open call. | ||
| Error close(bool CheckSizeLimit = true); | ||
|
|
||
| /// Set the size for limiting growth. This takes effect when the instance | ||
| /// is closed. | ||
| void setSizeLimit(std::optional<uint64_t> SizeLimit); | ||
|
|
||
| /// \returns the storage size of the cache data. | ||
| uint64_t getStorageSize() const; | ||
|
|
||
| /// \returns whether the primary store has exceeded the intended size limit. | ||
| /// This can return false even if the overall size of the opened directory is | ||
| /// over the \p SizeLimit passed to \p open. To know whether garbage | ||
| /// collection needs to be triggered or not, call \p needsGarbageCollection. | ||
| bool hasExceededSizeLimit() const; | ||
|
|
||
| /// \returns whether there are unused data that can be deleted using a | ||
| /// \p collectGarbage call. | ||
| bool needsGarbageCollection() const { return NeedsGarbageCollection; } | ||
|
|
||
| /// Remove any unused data from the directory at \p Path. If there are no such | ||
| /// data the operation is a no-op. | ||
| /// | ||
| /// This can be called concurrently, regardless of whether there is an open | ||
| /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers | ||
| /// in the same process or other processes. | ||
| /// | ||
| /// It is recommended that garbage-collection is triggered concurrently in the | ||
| /// background, so that it has minimal effect on the workload of the process. | ||
| static Error collectGarbage(StringRef Path); | ||
|
|
||
| /// Remove unused data from the current UnifiedOnDiskCache. | ||
| Error collectGarbage(); | ||
|
|
||
| /// Validate the key value databases. | ||
| Error validateActionCache(); | ||
|
|
||
| /// Get the upstream OnDiskGraphDB if it exists. | ||
| /// | ||
| /// \returns upstream database or nullptr if upstream database doesn't exist. | ||
| OnDiskGraphDB *getUpstreamGraphDB() const { return UpstreamGraphDB; } | ||
|
|
||
| ~UnifiedOnDiskCache(); | ||
|
|
||
| private: | ||
| UnifiedOnDiskCache(); | ||
|
|
||
| Expected<std::optional<ObjectID>> | ||
| faultInFromUpstreamKV(ArrayRef<uint8_t> Key); | ||
|
|
||
| /// \returns the storage size of the primary directory. | ||
| uint64_t getPrimaryStorageSize() const; | ||
|
|
||
| std::string RootPath; | ||
| std::atomic<uint64_t> SizeLimit; | ||
|
|
||
| int LockFD = -1; | ||
|
|
||
| std::atomic<bool> NeedsGarbageCollection; | ||
| std::string PrimaryDBDir; | ||
|
|
||
| OnDiskGraphDB *UpstreamGraphDB = nullptr; | ||
| std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; | ||
|
|
||
| std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; | ||
| std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; | ||
| }; | ||
|
|
||
| } // namespace llvm::cas::ondisk | ||
|
|
||
| #endif // LLVM_CAS_UNIFIEDONDISKCACHE_H | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.