diff --git a/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java b/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java index fbb942f4727662..2fc08a35ed53c2 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java +++ b/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java @@ -83,7 +83,7 @@ public abstract class AbstractActionInputPrefetcher implements ActionInputPrefet protected final Path execRoot; protected final RemoteOutputChecker remoteOutputChecker; - private final ActionOutputDirectoryHelper outputDirectoryHelper; + protected final ActionOutputDirectoryHelper outputDirectoryHelper; /** The state of a directory tracked by {@link DirectoryTracker}, as explained below. */ enum DirectoryState { @@ -129,8 +129,9 @@ private void setWritable(Path dir, DirectoryState newState) throws IOException { directoryStateMap.compute( dir, (unusedKey, oldState) -> { - if (oldState == DirectoryState.TEMPORARILY_WRITABLE - || oldState == DirectoryState.PERMANENTLY_WRITABLE) { + if (!forceRefetch(dir) + && (oldState == DirectoryState.TEMPORARILY_WRITABLE + || oldState == DirectoryState.PERMANENTLY_WRITABLE)) { // Already writable, but must potentially upgrade from temporary to permanent. return newState == DirectoryState.PERMANENTLY_WRITABLE ? newState : oldState; } @@ -162,8 +163,9 @@ void setOutputPermissions(Path dir) throws IOException { directoryStateMap.compute( dir, (unusedKey, oldState) -> { - if (oldState == DirectoryState.OUTPUT_PERMISSIONS - || oldState == DirectoryState.PERMANENTLY_WRITABLE) { + if (!forceRefetch(dir) + && (oldState == DirectoryState.OUTPUT_PERMISSIONS + || oldState == DirectoryState.PERMANENTLY_WRITABLE)) { // Either the output permissions have already been set, or we're not changing the // permissions ever again. return oldState; @@ -240,6 +242,12 @@ private boolean shouldDownloadFile(Path path, FileArtifactValue metadata) protected abstract boolean canDownloadFile(Path path, FileArtifactValue metadata); + /** + * If true, then all previously acquired knowledge of the file system state of this path (e.g. the + * existence of tree artifact directories or previously downloaded files) must be discarded. + */ + protected abstract boolean forceRefetch(Path path); + /** * Downloads file to the given path via its metadata. * @@ -565,7 +573,7 @@ private Completable downloadFileNoCheckRx( return Completable.error(new CacheNotFoundException(digest, execPath)); })); - return downloadCache.executeIfNot( + return downloadCache.execute( finalPath, Completable.defer( () -> { @@ -573,7 +581,8 @@ private Completable downloadFileNoCheckRx( return download; } return Completable.complete(); - })); + }), + forceRefetch(finalPath)); } private void finalizeDownload( @@ -646,7 +655,7 @@ private void deletePartialDownload(Path path) { } private Completable plantSymlink(Symlink symlink) { - return downloadCache.executeIfNot( + return downloadCache.execute( execRoot.getRelative(symlink.linkExecPath()), Completable.defer( () -> { @@ -657,7 +666,8 @@ private Completable plantSymlink(Symlink symlink) { link.delete(); link.createSymbolicLink(target); return Completable.complete(); - })); + }), + forceRefetch(execRoot.getRelative(symlink.linkExecPath()))); } public ImmutableSet downloadedFiles() { @@ -694,7 +704,7 @@ public void finalizeAction(Action action, OutputMetadataStore outputMetadataStor } var metadata = outputMetadataStore.getOutputMetadata(output); - if (!metadata.isRemote()) { + if (!canDownloadFile(output.getPath(), metadata)) { continue; } diff --git a/src/main/java/com/google/devtools/build/lib/remote/BUILD b/src/main/java/com/google/devtools/build/lib/remote/BUILD index 3d9f9fab41049c..80905ff488e2d2 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/BUILD +++ b/src/main/java/com/google/devtools/build/lib/remote/BUILD @@ -28,6 +28,7 @@ java_library( srcs = glob( ["*.java"], exclude = [ + "ConcurrentArtifactPathTrie.java", "ExecutionStatusException.java", "ReferenceCountedChannel.java", "ChannelConnectionWithServerCapabilitiesFactory.java", @@ -51,6 +52,7 @@ java_library( ":ReferenceCountedChannel", ":Retrier", ":abstract_action_input_prefetcher", + ":concurrent_artifact_path_trie", ":lease_service", ":remote_output_checker", ":scrubber", @@ -208,6 +210,7 @@ java_library( name = "remote_output_checker", srcs = ["RemoteOutputChecker.java"], deps = [ + ":concurrent_artifact_path_trie", "//src/main/java/com/google/devtools/build/lib/actions:artifacts", "//src/main/java/com/google/devtools/build/lib/actions:file_metadata", "//src/main/java/com/google/devtools/build/lib/analysis:analysis_cluster", @@ -281,6 +284,16 @@ java_library( ], ) +java_library( + name = "concurrent_artifact_path_trie", + srcs = ["ConcurrentArtifactPathTrie.java"], + deps = [ + "//src/main/java/com/google/devtools/build/lib/actions:artifacts", + "//src/main/java/com/google/devtools/build/lib/vfs:pathfragment", + "//third_party:guava", + ], +) + java_library( name = "store", srcs = ["Store.java"], diff --git a/src/main/java/com/google/devtools/build/lib/remote/ConcurrentArtifactPathTrie.java b/src/main/java/com/google/devtools/build/lib/remote/ConcurrentArtifactPathTrie.java new file mode 100644 index 00000000000000..87bc5c695b536d --- /dev/null +++ b/src/main/java/com/google/devtools/build/lib/remote/ConcurrentArtifactPathTrie.java @@ -0,0 +1,57 @@ +// Copyright 2026 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package com.google.devtools.build.lib.remote; + +import com.google.common.base.Preconditions; +import com.google.devtools.build.lib.actions.ActionInput; +import com.google.devtools.build.lib.actions.Artifact; +import com.google.devtools.build.lib.vfs.PathFragment; +import java.util.concurrent.ConcurrentSkipListSet; + +/** + * A specialized concurrent trie that stores paths of artifacts and allows checking whether a given + * path is contained in (in the case of a tree artifact) or exactly matches (in any other case) an + * artifact in the trie. + */ +final class ConcurrentArtifactPathTrie { + // Invariant: no path in this set is a prefix of another path. + private final ConcurrentSkipListSet paths = + new ConcurrentSkipListSet<>(PathFragment.HIERARCHICAL_COMPARATOR); + + /** + * Adds the given {@link ActionInput} to the trie. + * + *

The caller must ensure that no object's path passed to this method is a prefix of any + * previously added object's path. Bazel enforces this for non-aggregate artifacts. Callers must + * not pass in {@link Artifact.TreeFileArtifact}s (which have exec paths that have their parent + * tree artifact's exec path as a prefix) or non-Artifact {@link ActionInput}s that violate this + * invariant. + */ + void add(ActionInput input) { + Preconditions.checkArgument( + !(input instanceof Artifact.TreeFileArtifact), + "TreeFileArtifacts should not be added to the trie: %s", + input); + paths.add(input.getExecPath()); + } + + /** Checks whether the given {@link PathFragment} is contained in an artifact in the trie. */ + boolean contains(PathFragment execPath) { + // By the invariant of this set, there is at most one prefix of execPath in the set. Since the + // comparator sorts all children of a path right after the path itself, if such a prefix + // exists, it must thus sort right before execPath (or be equal to it). + var floorPath = paths.floor(execPath); + return floorPath != null && execPath.startsWith(floorPath); + } +} diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java index f9559ac8a4ffcd..45af579e91378a 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java @@ -13,7 +13,6 @@ // limitations under the License. package com.google.devtools.build.lib.remote; -import static com.google.common.base.Preconditions.checkArgument; import build.bazel.remote.execution.v2.Digest; import build.bazel.remote.execution.v2.RequestMetadata; @@ -21,6 +20,7 @@ import com.google.common.util.concurrent.ListenableFuture; import com.google.devtools.build.lib.actions.ActionExecutionMetadata; import com.google.devtools.build.lib.actions.ActionOutputDirectoryHelper; +import com.google.devtools.build.lib.actions.Artifact; import com.google.devtools.build.lib.actions.FileArtifactValue; import com.google.devtools.build.lib.actions.cache.VirtualActionInput; import com.google.devtools.build.lib.events.Reporter; @@ -31,7 +31,10 @@ import com.google.devtools.build.lib.vfs.OutputPermissions; import com.google.devtools.build.lib.vfs.Path; import com.google.devtools.build.lib.vfs.PathFragment; +import com.google.devtools.build.lib.vfs.Symlinks; import java.io.IOException; +import java.util.Collection; +import javax.annotation.Nullable; /** * Stages output files that are stored remotely to the local filesystem. @@ -44,6 +47,7 @@ public class RemoteActionInputFetcher extends AbstractActionInputPrefetcher { private final String buildRequestId; private final String commandId; private final CombinedCache combinedCache; + private final ConcurrentArtifactPathTrie rewoundActionOutputs = new ConcurrentArtifactPathTrie(); RemoteActionInputFetcher( Reporter reporter, @@ -79,7 +83,18 @@ protected void prefetchVirtualActionInput(VirtualActionInput input) throws IOExc @Override protected boolean canDownloadFile(Path path, FileArtifactValue metadata) { - return metadata.isRemote(); + // When action rewinding is enabled, an action that had remote metadata at some point during the + // build may have been re-executed locally to regenerate lost inputs, but may then be rewound + // again and thus have its (now local) outputs deleted. In this case, we need to download the + // outputs again, even if they are now considered local. + return metadata.isRemote() || (forceRefetch(path) && !path.exists(Symlinks.NOFOLLOW)); + } + + @Override + protected boolean forceRefetch(Path path) { + // Caches for download operations and output directory creation need to be disregarded for the + // outputs of rewound actions as they may have been deleted after they were first created. + return path.startsWith(execRoot) && rewoundActionOutputs.contains(path.relativeTo(execRoot)); } @Override @@ -92,7 +107,6 @@ protected ListenableFuture doDownloadFile( Priority priority, Reason reason) throws IOException { - checkArgument(metadata.isRemote(), "Cannot download file that is not a remote file."); RequestMetadata requestMetadata = TracingMetadataUtils.buildMetadata( buildRequestId, @@ -117,4 +131,22 @@ protected ListenableFuture doDownloadFile( execPath.toString(), digest.getSizeBytes())); } + + public void handleRewoundActionOutputs(Collection outputs) { + // SkyframeActionExecutor#prepareForRewinding does *not* call this method because the + // RemoteActionFileSystem corresponds to an ActionFileSystemType with inMemoryFileSystem() == + // true. While it is true that resetting outputDirectoryHelper isn't necessary to undo the + // caching of output directory creation during action preparation, we still need to reset here + // since outputDirectoryHelper is also used by AbstractActionInputPrefetcher. + outputDirectoryHelper.invalidateTreeArtifactDirectoryCreation(outputs); + for (Artifact output : outputs) { + // Action templates have TreeFileArtifacts as outputs, which isn't supported by the trie. We + // only need to track the tree artifacts themselves. + if (output instanceof Artifact.TreeFileArtifact) { + rewoundActionOutputs.add(output.getParent()); + } else { + rewoundActionOutputs.add(output); + } + } + } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteExecutionService.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteExecutionService.java index 24627b4c05978d..ec6b9b0054e872 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteExecutionService.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteExecutionService.java @@ -62,6 +62,8 @@ import com.google.common.eventbus.Subscribe; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; +import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.SettableFuture; import com.google.devtools.build.lib.actions.ActionInput; import com.google.devtools.build.lib.actions.Artifact; @@ -156,8 +158,8 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Phaser; import java.util.concurrent.Semaphore; @@ -194,9 +196,10 @@ public class RemoteExecutionService { private final Set reportedErrors = new HashSet<>(); @SuppressWarnings("AllowVirtualThreads") - private final ExecutorService backgroundTaskExecutor = - Executors.newThreadPerTaskExecutor( - Thread.ofVirtual().name("remote-execution-bg-", 0).factory()); + private final ListeningExecutorService backgroundTaskExecutor = + MoreExecutors.listeningDecorator( + Executors.newThreadPerTaskExecutor( + Thread.ofVirtual().name("remote-execution-bg-", 0).factory())); private final AtomicBoolean shutdown = new AtomicBoolean(false); private final AtomicBoolean buildInterrupted = new AtomicBoolean(false); @@ -1873,16 +1876,31 @@ public void uploadOutputs( if (remoteOptions.remoteCacheAsync && !action.getSpawn().getResourceOwner().mayModifySpawnOutputsAfterExecution()) { - backgroundTaskExecutor.execute( - () -> { - try { - doUploadOutputs(action, spawnResult, onUploadComplete); - } catch (ExecException e) { - reportUploadError(e); - } catch (InterruptedException ignored) { - // ThreadPerTaskExecutor does not care about interrupt status. - } - }); + var uploadDone = new CountDownLatch(1); + var future = + backgroundTaskExecutor.submit( + () -> { + try { + doUploadOutputs(action, spawnResult, onUploadComplete); + } catch (ExecException e) { + reportUploadError(e); + } catch (InterruptedException ignored) { + // ThreadPerTaskExecutor does not care about interrupt status. + } finally { + uploadDone.countDown(); + } + }); + + if (outputService instanceof RemoteOutputService remoteOutputService + && remoteOutputService.getRewoundActionSynchronizer() + instanceof RemoteRewoundActionSynchronizer remoteRewoundActionSynchronizer) { + remoteRewoundActionSynchronizer.registerOutputUploadTask( + action.getRemoteActionExecutionContext().getSpawnOwner(), + () -> { + future.cancel(true); + uploadDone.await(); + }); + } } else { doUploadOutputs(action, spawnResult, onUploadComplete); } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java index dd7b41779f46a7..b6390f2c02e683 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java @@ -504,7 +504,10 @@ public void beforeCommand(CommandEnvironment env) throws AbruptExitException { bazelOutputServiceChannel, lastBuildId); } else { - outputService = new RemoteOutputService(env.getDirectories()); + outputService = + new RemoteOutputService( + env.getDirectories(), + buildRequestOptions != null && buildRequestOptions.rewindLostInputs); } if ((enableHttpCache || enableDiskCache) && !enableGrpcCache) { diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputChecker.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputChecker.java index 80283d4280f09f..b295e050a86253 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputChecker.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputChecker.java @@ -13,7 +13,6 @@ // limitations under the License. package com.google.devtools.build.lib.remote; -import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import static com.google.devtools.build.lib.packages.TargetUtils.isTestRuleName; @@ -44,7 +43,6 @@ import com.google.devtools.build.skyframe.MemoizingEvaluator; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentSkipListSet; import java.util.function.Predicate; import java.util.function.Supplier; import javax.annotation.Nullable; @@ -351,41 +349,4 @@ public void maybeInvalidateSkyframeValues(MemoizingEvaluator memoizingEvaluator) }); } } - - /** - * A specialized concurrent trie that stores paths of artifacts and allows checking whether a - * given path is contained in (in the case of a tree artifact) or exactly matches (in any other - * case) an artifact in the trie. - */ - private static final class ConcurrentArtifactPathTrie { - // Invariant: no path in this set is a prefix of another path. - private final ConcurrentSkipListSet paths = - new ConcurrentSkipListSet<>(PathFragment.HIERARCHICAL_COMPARATOR); - - /** - * Adds the given {@link ActionInput} to the trie. - * - *

The caller must ensure that no object's path passed to this method is a prefix of any - * previously added object's path. Bazel enforces this for non-aggregate artifacts. Callers must - * not pass in {@link TreeFileArtifact}s (which have exec paths that have their parent tree - * artifact's exec path as a prefix) or non-Artifact {@link ActionInput}s that violate this - * invariant. - */ - void add(ActionInput input) { - checkArgument( - !(input instanceof TreeFileArtifact), - "TreeFileArtifacts should not be added to the trie: %s", - input); - paths.add(input.getExecPath()); - } - - /** Checks whether the given {@link PathFragment} is contained in an artifact in the trie. */ - boolean contains(PathFragment execPath) { - // By the invariant of this set, there is at most one prefix of execPath in the set. Since the - // comparator sorts all children of a path right after the path itself, if such a prefix - // exists, it must thus sort right before execPath (or be equal to it). - var floorPath = paths.floor(execPath); - return floorPath != null && execPath.startsWith(floorPath); - } - } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java index b9a59aed85a052..c69d8def00af28 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java @@ -58,14 +58,18 @@ public class RemoteOutputService implements OutputService { private final BlazeDirectories directories; + private final boolean rewindLostInputs; + + private RewoundActionSynchronizer rewoundActionSynchronizer = RewoundActionSynchronizer.NOOP; @Nullable private RemoteOutputChecker remoteOutputChecker; @Nullable private RemoteActionInputFetcher actionInputFetcher; @Nullable private LeaseService leaseService; @Nullable private Supplier fileCacheSupplier; - RemoteOutputService(BlazeDirectories directories) { + RemoteOutputService(BlazeDirectories directories, boolean rewindLostInputs) { this.directories = checkNotNull(directories); + this.rewindLostInputs = rewindLostInputs; } void setRemoteOutputChecker(RemoteOutputChecker remoteOutputChecker) { @@ -74,6 +78,9 @@ void setRemoteOutputChecker(RemoteOutputChecker remoteOutputChecker) { void setActionInputFetcher(RemoteActionInputFetcher actionInputFetcher) { this.actionInputFetcher = checkNotNull(actionInputFetcher, "actionInputFetcher"); + if (rewindLostInputs) { + this.rewoundActionSynchronizer = new RemoteRewoundActionSynchronizer(actionInputFetcher); + } } void setLeaseService(LeaseService leaseService) { @@ -250,4 +257,9 @@ public void checkActionFileSystemForLostInputs(FileSystem actionFileSystem, Acti remoteFileSystem.checkForLostInputs(action); } } + + @Override + public RewoundActionSynchronizer getRewoundActionSynchronizer() { + return rewoundActionSynchronizer; + } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteRewoundActionSynchronizer.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteRewoundActionSynchronizer.java new file mode 100644 index 00000000000000..b14263d6ddb3c7 --- /dev/null +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteRewoundActionSynchronizer.java @@ -0,0 +1,295 @@ +// Copyright 2026 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.google.devtools.build.lib.remote; + +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.devtools.build.lib.actions.Action; +import com.google.devtools.build.lib.actions.ActionExecutionMetadata; +import com.google.devtools.build.lib.actions.ActionLookupData; +import com.google.devtools.build.lib.actions.Artifact; +import com.google.devtools.build.lib.actions.Artifact.DerivedArtifact; +import com.google.devtools.build.lib.actions.InputMetadataProvider; +import com.google.devtools.build.lib.profiler.Profiler; +import com.google.devtools.build.lib.profiler.ProfilerTask; +import com.google.devtools.build.lib.profiler.SilentCloseable; +import com.google.devtools.build.lib.vfs.OutputService.RewoundActionSynchronizer; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.concurrent.locks.StampedLock; +import javax.annotation.Nullable; + +/** + * A {@link RewoundActionSynchronizer} implementation for Bazel's remote filesystem, which is backed + * by actual files on disk and requires synchronization to ensure that action outputs aren't deleted + * while they are being read. + */ +final class RemoteRewoundActionSynchronizer implements RewoundActionSynchronizer { + /** A task with a cancellation callback. */ + public interface Cancellable { + void cancel() throws InterruptedException; + } + + private final RemoteActionInputFetcher actionInputFetcher; + private final ConcurrentHashMap outputUploadTasks = + new ConcurrentHashMap<>(); + + // A single coarse lock is used to synchronize rewound actions (writers) and both rewound and + // non-rewound actions (readers) as long as no rewound action has attempted to prepare for its + // execution. + // This ensures high throughput and low memory footprint for the common case of no rewound + // actions. In this case, there won't be any writers and the performance characteristics of a + // ReentrantReadWriteLock are comparable to that of an atomic counter. A StampedLock would not be + // a good fit as its performance regresses with 127 or more concurrent readers. + // Note that it wouldn't be correct to only start using this lock once an action is rewound, + // because a non-rewound action consuming its non-lost outputs could have already started + // executing. + @Nullable private volatile ReadWriteLock coarseLock = new ReentrantReadWriteLock(); + + // A fine-grained lock structure that is switched to when the first rewound action attempts to + // prepare for its execution. This structure is used to ensure that rewound actions do not + // delete their outputs while they are being read by other actions, while still allowing + // rewound actions and non-rewound actions to run concurrently (i.e., not force the equivalent + // of --jobs=1 for as long as a rewound action is running, as the coarse lock would). + // A rewound action will acquire a write lock on its lookup data before it prepares for + // execution, while any action will acquire a read lock on the lookup data of any generating + // action of its inputs before it starts executing. + // The values of this cache are weakly referenced to ensure that locks are cleaned up when they + // are no longer needed. + @Nullable private volatile LoadingCache fineLocks; + + public RemoteRewoundActionSynchronizer(RemoteActionInputFetcher actionInputFetcher) { + this.actionInputFetcher = actionInputFetcher; + } + + /* + Proof of deadlock freedom: + + As long as the coarse lock is used, there can't be any deadlock because there is only a single + read-write lock. + + Now assume that there is a deadlock while the fine locks are used. First, note that the logic in + ImportantOutputHandler that is guarded by enterProcessOutputsAndGetLostArtifacts does not block + on any (rewound or non-rewound) action executions while it holds read locks and can thus be + ignored in the following. Consider the directed labeled "wait-for" graph defined as follows: + + * Nodes are given by the currently active Skyframe action execution threads, each of which is + identified with the action it is (or will be) executing. Actions are in one-to-one + correspondence with the ActionLookupData that is used as the key in the fine locks map. + * For each pair of actions A_1 and A_2, there is an edge from A_1 to A_2 labeled with XY(A_3) + if A_1 is waiting for the X lock of A_3 and A_2 currently holds the Y lock of A_3, where X and + Y are either R (for read) or W (for write). The resulting graph may have parallel edges with + distinct labels. + + Let C be any directed cycle in the graph representing a deadlock, let A_1 -[XY(A_3)]-> A_2 be an + edge in C and consider the following cases for the pair XY: + + * RR: Since a read-write lock whose read lock is held by at least one thread doesn't + block any other thread from acquiring its read lock, this case doesn't occur. + * WW: The write lock of A_3 is only ever (attempted to be) acquired by A_3 itself when it is + rewound, which means that the edge would necessarily be of the shape A_3 -[WW(A_3)]-> A_3. + But this isn't possible since the write lock for an action is only acquired in one place ( + enterActionPreparationForRewinding) and not recursively. + * WR: In this case, A_1 attempts to acquire a write lock, which only happens when A_1 is a + rewound action about to prepare for its (re-)execution. This means that the edge is + necessarily of the shape A_1 -[WR(A_1)]-> A_2. While a rewound action is waiting for its + own write lock in enterActionPreparation, it doesn't hold any locks since + enterActionExecution hasn't been called yet in SkyframeActionExecutor and all past + executions of the action have released all their locks due to use of try-with-resources. + This means that A_1 can't have any incoming edges in the wait-for graph, which is a + contradiction to the assumption that it is contained in the directed cycle C. + + We conclude that XY = RW. Since the write lock of A_3 is only ever acquired by A_3 itself, all + edges in C are of the form A_1 -[RW(A_2)]-> A_2. But by construction of inputKeysFor, the + action A_1 is attempting to acquire the read locks of all its inputs' generating actions, and + thus the action A_1 depends on one of the outputs of A_2 (*). + + Applied to all edges of C, we conclude that there is a corresponding directed cycle in the + action graph, which is a contradiction since Bazel disallows dependency cycles. + + Notes: + * The proof would not go through at (*) if fineLocks were replaced by a Striped lock structure + with a fixed number of locks. In fact, this gives rise to a deadlock if the number of stripes + is at least 2, but low enough that distinct generating actions hash to the same stripe. + */ + + @Override + public SilentCloseable enterActionPreparation(Action action, boolean wasRewound) + throws InterruptedException { + // Skyframe schedules non-rewound actions such that they never run concurrently with actions + // that consume their outputs. + if (!wasRewound) { + return () -> {}; + } + try (SilentCloseable c = + Profiler.instance().profile(ProfilerTask.ACTION_LOCK, "action.enterActionPreparation")) { + return enterActionPreparationForRewinding(action); + } + } + + private SilentCloseable enterActionPreparationForRewinding(Action action) + throws InterruptedException { + var localCoarseLock = coarseLock; + if (localCoarseLock != null) { + // This is the first time a rewound action has attempted to prepare for its execution. + // Switch to using the fine locks under the protection of the coarse write lock. + localCoarseLock.writeLock().lockInterruptibly(); + try { + // Check again under the lock to avoid a race between multiple rewound actions attempting + // to prepare for execution at the same time. + if (fineLocks == null) { + fineLocks = + Caffeine.newBuilder() + .weakValues() + // ReentrantReadWriteLock would not work here as its individual read and write + // locks do not strongly reference the parent lock, which would lead to locks + // being cleaned up while they are still held + // (https://bugs.openjdk.org/browse/JDK-8189598). This can be worked around by + // using a construction similar to Guava's Striped helpers. StampedLock is both + // more memory-efficient and its views do strongly reference the parent lock + // (https://github.com/openjdk/jdk/blob/b349f661ea5f14b258191134714a7e712c90ef3e/src/java.base/share/classes/java/util/concurrent/locks/StampedLock.java#L1039), + // TODO: Investigate the effect of fair locks on build wall time. + .build((ActionLookupData unused) -> new StampedLock().asReadWriteLock()); + coarseLock = null; + } + } finally { + localCoarseLock.writeLock().unlock(); + } + } + + var writeLock = fineLocks.get(outputKeyFor(action)).writeLock(); + writeLock.lockInterruptibly(); + prepareOutputsForRewinding(action); + return writeLock::unlock; + } + + /** + * Cancels all async tasks that operate on the action's outputs and resets any cached data about + * their prefetching state. + */ + private void prepareOutputsForRewinding(Action action) throws InterruptedException { + Cancellable task = outputUploadTasks.remove(action); + if (task != null) { + task.cancel(); + } + actionInputFetcher.handleRewoundActionOutputs(action.getOutputs()); + } + + @Override + public SilentCloseable enterActionExecution(Action action, InputMetadataProvider metadataProvider) + throws InterruptedException { + try (SilentCloseable c = + Profiler.instance().profile(ProfilerTask.ACTION_LOCK, "action.enterActionExecution")) { + return lockArtifactsForConsumption( + () -> action.getInputs().toList().iterator(), metadataProvider); + } + } + + /** + * Guards a call to {@link + * com.google.devtools.build.lib.remote.RemoteImportantOutputHandler#processOutputsAndGetLostArtifacts}. + */ + public SilentCloseable enterProcessOutputsAndGetLostArtifacts( + Iterable importantOutputs, InputMetadataProvider fullMetadataProvider) + throws InterruptedException { + try (SilentCloseable c = + Profiler.instance() + .profile(ProfilerTask.ACTION_LOCK, "action.enterProcessOutputsAndGetLostArtifacts")) { + return lockArtifactsForConsumption(importantOutputs, fullMetadataProvider); + } + } + + /** + * Registers a cancellation callback for an upload of action outputs that may still be running + * after the action has completed. + */ + public void registerOutputUploadTask(ActionExecutionMetadata action, Cancellable task) { + // We don't expect to have multiple output upload tasks for the same action registered at the + // same time. + outputUploadTasks.merge( + action, + task, + (oldTask, newTask) -> { + throw new IllegalStateException( + "Attempted to register multiple output upload tasks for %s: %s and %s" + .formatted(action, oldTask, newTask)); + }); + } + + private SilentCloseable lockArtifactsForConsumption( + Iterable artifacts, InputMetadataProvider metadataProvider) + throws InterruptedException { + var localCoarseLock = coarseLock; + if (localCoarseLock != null) { + // Common case for builds without any rewound actions: acquire the single lock that is never + // acquired by a writer. + localCoarseLock.readLock().lockInterruptibly(); + } + // Read the fine locks after acquiring the coarse lock to allow the fine locks to be inflated + // lazily. + var localFineLocks = fineLocks; + if (localFineLocks == null) { + // Continuation of the common case for builds without any rewound actions: the fine locks + // have not been inflated. + return localCoarseLock.readLock()::unlock; + } + + // At this point, there has been at least one rewound action that has inflated the fine locks. + // We need to switch to it. + if (localCoarseLock != null) { + localCoarseLock.readLock().unlock(); + } + var allReadWriteLocks = + localFineLocks.getAll(inputKeysFor(artifacts, metadataProvider)).values(); + var locksToUnlockBuilder = + ImmutableList.builderWithExpectedSize(allReadWriteLocks.size()); + try { + for (var readWriteLock : allReadWriteLocks) { + var readLock = readWriteLock.readLock(); + readLock.lockInterruptibly(); + locksToUnlockBuilder.add(readLock); + } + } catch (InterruptedException e) { + for (var readLock : locksToUnlockBuilder.build()) { + readLock.unlock(); + } + throw e; + } + var locksToUnlock = locksToUnlockBuilder.build(); + return () -> locksToUnlock.forEach(Lock::unlock); + } + + private static Iterable inputKeysFor( + Iterable artifacts, InputMetadataProvider metadataProvider) { + var allArtifacts = + Iterables.concat( + artifacts, + Iterables.concat( + Iterables.transform( + metadataProvider.getRunfilesTrees(), + runfilesTree -> runfilesTree.getArtifacts().toList()))); + return Iterables.transform( + Iterables.filter(allArtifacts, artifact -> artifact instanceof DerivedArtifact), + artifact -> ((DerivedArtifact) artifact).getGeneratingActionKey()); + } + + private static ActionLookupData outputKeyFor(Action action) { + return ((DerivedArtifact) action.getPrimaryOutput()).getGeneratingActionKey(); + } +} diff --git a/src/main/java/com/google/devtools/build/lib/skyframe/SkyframeActionExecutor.java b/src/main/java/com/google/devtools/build/lib/skyframe/SkyframeActionExecutor.java index 8c1f9484507f6b..f54dbfb5b3b4b6 100644 --- a/src/main/java/com/google/devtools/build/lib/skyframe/SkyframeActionExecutor.java +++ b/src/main/java/com/google/devtools/build/lib/skyframe/SkyframeActionExecutor.java @@ -1048,38 +1048,47 @@ public ActionStepOrResult run(Environment env) statusReporter.updateStatus(event); } env.getListener().post(event); - if (actionFileSystemType().supportsLocalActions()) { - try (SilentCloseable d = profiler.profile(ProfilerTask.INFO, "action.prepare")) { - // This call generally deletes any files at locations that are declared outputs of the - // action, although some actions perform additional work, while others intentionally - // keep previous outputs in place. - action.prepare( - actionExecutionContext.getExecRoot(), - actionExecutionContext.getPathResolver(), - outputService.bulkDeleter(), - useArchivedTreeArtifacts(action)); - } catch (IOException e) { - logger.atWarning().withCause(e).log( - "failed to delete output files before executing action: '%s'", action); - throw toActionExecutionException( - "failed to delete output files before executing action", - e, - action, - null, - Code.ACTION_OUTPUTS_DELETION_FAILURE); + var rewoundActionSynchronizer = outputService.getRewoundActionSynchronizer(); + try (SilentCloseable outerLock = + rewoundActionSynchronizer.enterActionPreparation(action, wasRewound(action))) { + if (actionFileSystemType().supportsLocalActions()) { + try (SilentCloseable d = + profiler.profile(ProfilerTask.INFO, "action.prepare")) { + // This call generally deletes any files at locations that are declared outputs of + // the action, although some actions perform additional work, while others + // intentionally keep previous outputs in place. + action.prepare( + actionExecutionContext.getExecRoot(), + actionExecutionContext.getPathResolver(), + outputService.bulkDeleter(), + useArchivedTreeArtifacts(action)); + } catch (IOException e) { + logger.atWarning().withCause(e).log( + "failed to delete output files before executing action: '%s'", action); + throw toActionExecutionException( + "failed to delete output files before executing action", + e, + action, + null, + Code.ACTION_OUTPUTS_DELETION_FAILURE); + } } - } - if (actionFileSystemType().inMemoryFileSystem()) { - // There's nothing to delete when the action file system is used, but we must ensure - // that the output directories for stdout and stderr exist. - setupActionFsFileOutErr(actionExecutionContext.getFileOutErr(), action); - createActionFsOutputDirectories(action, actionExecutionContext.getPathResolver()); - } else { - createOutputDirectories(action); - } + if (actionFileSystemType().inMemoryFileSystem()) { + // There's nothing to delete when the action file system is used, but we must ensure + // that the output directories for stdout and stderr exist. + setupActionFsFileOutErr(actionExecutionContext.getFileOutErr(), action); + createActionFsOutputDirectories(action, actionExecutionContext.getPathResolver()); + } else { + createOutputDirectories(action); + } - return executeAction(env.getListener(), action); + try (SilentCloseable innerLock = + rewoundActionSynchronizer.enterActionExecution( + action, actionExecutionContext.getInputMetadataProvider())) { + return executeAction(env.getListener(), action); + } + } } catch (LostInputsActionExecutionException e) { lostInputs = true; throw e; diff --git a/src/main/java/com/google/devtools/build/lib/util/PersistentMap.java b/src/main/java/com/google/devtools/build/lib/util/PersistentMap.java index e8035e98c709b1..082a23ad1e5a42 100644 --- a/src/main/java/com/google/devtools/build/lib/util/PersistentMap.java +++ b/src/main/java/com/google/devtools/build/lib/util/PersistentMap.java @@ -83,7 +83,6 @@ public abstract class PersistentMap extends ForwardingConcurrentMap private final Path journalFile; private final LinkedBlockingQueue journal; - private DataOutputStream journalOut; /** * 'dirty' is true when the in-memory representation of the map is more recent than the on-disk @@ -202,22 +201,24 @@ public V remove(Object object) { */ private synchronized void writeJournal() { try { - if (journalOut == null) { - if (journalFile.exists()) { - // The journal file was left around after the last save() because - // keepJournal() was true. Append to it. - journalOut = - new DataOutputStream(new BufferedOutputStream(journalFile.getOutputStream(true))); - } else { - // Create new journal. - journalOut = createMapFile(journalFile); - } + DataOutputStream journalOut; + if (journalFile.exists()) { + // The journal file was left around after the last save() because + // keepJournal() was true. Append to it. + journalOut = + new DataOutputStream(new BufferedOutputStream(journalFile.getOutputStream(true))); + } else { + // Create new journal. + journalOut = createMapFile(journalFile); + } + try { + // Journal may have duplicates, we can ignore them. + LinkedHashSet items = Sets.newLinkedHashSetWithExpectedSize(journal.size()); + journal.drainTo(items); + writeEntries(journalOut, items, delegate()); + } finally { + journalOut.close(); } - // Journal may have duplicates, we can ignore them. - LinkedHashSet items = Sets.newLinkedHashSetWithExpectedSize(journal.size()); - journal.drainTo(items); - writeEntries(journalOut, items, delegate()); - journalOut.flush(); } catch (IOException e) { this.deferredIOFailure = e.getMessage() + " during journal append"; } @@ -303,8 +304,6 @@ private synchronized long save(boolean fullSave) throws IOException { if (dirty) { if (!fullSave && keepJournal()) { forceFlush(); - journalOut.close(); - journalOut = null; return journalSize() + cacheSize(); } else { dirty = false; @@ -343,12 +342,8 @@ protected boolean keepJournal() { return false; } - private synchronized void clearJournal() throws IOException { + private synchronized void clearJournal() { journal.clear(); - if (journalOut != null) { - journalOut.close(); - journalOut = null; - } } private synchronized void loadEntries(Path mapFile, boolean failFast) throws IOException { diff --git a/src/main/java/com/google/devtools/build/lib/vfs/BUILD b/src/main/java/com/google/devtools/build/lib/vfs/BUILD index 6ad2a2dcb66fec..d6a0dbbfada769 100644 --- a/src/main/java/com/google/devtools/build/lib/vfs/BUILD +++ b/src/main/java/com/google/devtools/build/lib/vfs/BUILD @@ -97,6 +97,7 @@ java_library( "//src/main/java/com/google/devtools/build/lib/actions:fileset_output_tree", "//src/main/java/com/google/devtools/build/lib/analysis:blaze_directories", "//src/main/java/com/google/devtools/build/lib/events", + "//src/main/java/com/google/devtools/build/lib/profiler", "//src/main/java/com/google/devtools/build/lib/util:abrupt_exit_exception", "//src/main/java/com/google/devtools/build/lib/util:detailed_exit_code", "//src/main/java/com/google/devtools/build/skyframe:skyframe-objects", diff --git a/src/main/java/com/google/devtools/build/lib/vfs/OutputService.java b/src/main/java/com/google/devtools/build/lib/vfs/OutputService.java index 1102cf34ea1343..82546f6cbd01e1 100644 --- a/src/main/java/com/google/devtools/build/lib/vfs/OutputService.java +++ b/src/main/java/com/google/devtools/build/lib/vfs/OutputService.java @@ -20,6 +20,7 @@ import com.google.devtools.build.lib.actions.Action; import com.google.devtools.build.lib.actions.ActionExecutionMetadata; import com.google.devtools.build.lib.actions.ActionInputMap; +import com.google.devtools.build.lib.actions.InputMetadataProvider; import com.google.devtools.build.lib.actions.Artifact; import com.google.devtools.build.lib.actions.Artifact.TreeFileArtifact; import com.google.devtools.build.lib.actions.ArtifactPathResolver; @@ -32,6 +33,7 @@ import com.google.devtools.build.lib.actions.cache.MetadataInjector; import com.google.devtools.build.lib.actions.cache.OutputMetadataStore; import com.google.devtools.build.lib.events.EventHandler; +import com.google.devtools.build.lib.profiler.SilentCloseable; import com.google.devtools.build.lib.util.AbruptExitException; import com.google.devtools.build.skyframe.SkyFunction.Environment; import java.io.IOException; @@ -247,4 +249,48 @@ default BulkDeleter bulkDeleter() { default XattrProvider getXattrProvider(XattrProvider delegate) { return delegate; } + + default RewoundActionSynchronizer getRewoundActionSynchronizer() { + return RewoundActionSynchronizer.NOOP; + } + + /** + * Provides synchronization for actions in the presence of action rewinding. + * + *

If an action discovers that some of its inputs have been lost, action rewinding will select + * actions that need to be re-executed to recover the lost inputs. Without synchronization, such + * actions may run concurrently with actions that consume their non-lost outputs. Depending on the + * particular output service and action filesystem implementation, this may lead to races, which + * this interface aims to prevent. + */ + interface RewoundActionSynchronizer { + /** + * Guards an action from the beginning of its {@link Action#prepare preparation} until the end + * of its {@link Action#execute execution}. + */ + SilentCloseable enterActionPreparation(Action action, boolean wasRewound) + throws InterruptedException; + + /** Guards an action from the beginning to the end of its {@link Action#execute execution}. */ + SilentCloseable enterActionExecution(Action action, InputMetadataProvider metadataProvider) + throws InterruptedException; + + /** + * A no-op implementation of {@link RewoundActionSynchronizer}, suitable for action filesystems + * that support racy access to action outputs. + */ + RewoundActionSynchronizer NOOP = + new RewoundActionSynchronizer() { + @Override + public SilentCloseable enterActionPreparation(Action action, boolean wasRewound) { + return () -> {}; + } + + @Override + public SilentCloseable enterActionExecution( + Action action, InputMetadataProvider metadataProvider) { + return () -> {}; + } + }; + } } diff --git a/src/test/java/com/google/devtools/build/lib/skyframe/OutputsInvalidationIntegrationTest.java b/src/test/java/com/google/devtools/build/lib/skyframe/OutputsInvalidationIntegrationTest.java index 6fd87ed8c6287b..44dc3fe2e955f5 100644 --- a/src/test/java/com/google/devtools/build/lib/skyframe/OutputsInvalidationIntegrationTest.java +++ b/src/test/java/com/google/devtools/build/lib/skyframe/OutputsInvalidationIntegrationTest.java @@ -57,6 +57,8 @@ public void prepareOutputServiceMock() when(outputService.startBuild(any(), any(), any(), anyBoolean())) .thenReturn(ModifiedFileSet.EVERYTHING_MODIFIED); when(outputService.getXattrProvider(any())).thenAnswer(i -> i.getArgument(0)); + when(outputService.getRewoundActionSynchronizer()) + .thenReturn(OutputService.RewoundActionSynchronizer.NOOP); } @Override diff --git a/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/BUILD b/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/BUILD index b2023054279d01..ca28c0da953a90 100644 --- a/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/BUILD +++ b/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/BUILD @@ -71,20 +71,24 @@ java_library( java_test( name = "RewindingTest", + timeout = "long", srcs = ["RewindingTest.java"], + jvm_flags = ["-Djava.lang.Thread.allowVirtualThreads=true"], shard_count = 12, - tags = ["no_windows"], # BuildIntegrationTestCase isn't fully compatible with Windows. deps = [ ":rewinding_tests_helper", "//src/main/java/com/google/devtools/build/lib:runtime", "//src/main/java/com/google/devtools/build/lib/actions:artifacts", "//src/main/java/com/google/devtools/build/lib/analysis:blaze_directories", "//src/main/java/com/google/devtools/build/lib/analysis:target_configured_event", + "//src/main/java/com/google/devtools/build/lib/authandtls/credentialhelper:credential_module", "//src/main/java/com/google/devtools/build/lib/includescanning", + "//src/main/java/com/google/devtools/build/lib/remote", "//src/main/java/com/google/devtools/build/lib/util:os", "//src/main/java/com/google/devtools/build/lib/vfs", "//src/test/java/com/google/devtools/build/lib/analysis/util", "//src/test/java/com/google/devtools/build/lib/buildtool/util", + "//src/test/java/com/google/devtools/build/lib/remote/util:integration_test_utils", "//src/test/java/com/google/devtools/build/lib/testutil:TestConstants", "//src/test/java/com/google/devtools/build/lib/testutil:action_event_recorder", "//third_party:guava", diff --git a/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTest.java b/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTest.java index 0b28d732c9db91..7dd0c94b16d01e 100644 --- a/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTest.java +++ b/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTest.java @@ -16,16 +16,22 @@ import static com.google.common.truth.Truth.assertThat; import static com.google.common.truth.TruthJUnit.assume; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.eventbus.Subscribe; import com.google.devtools.build.lib.actions.Artifact; import com.google.devtools.build.lib.analysis.BlazeDirectories; import com.google.devtools.build.lib.analysis.TargetConfiguredEvent; import com.google.devtools.build.lib.analysis.util.AnalysisMock; +import com.google.devtools.build.lib.authandtls.credentialhelper.CredentialModule; import com.google.devtools.build.lib.buildtool.util.BuildIntegrationTestCase; import com.google.devtools.build.lib.includescanning.IncludeScanningModule; +import com.google.devtools.build.lib.remote.RemoteModule; +import com.google.devtools.build.lib.remote.util.IntegrationTestUtils; +import com.google.devtools.build.lib.remote.util.IntegrationTestUtils.WorkerInstance; import com.google.devtools.build.lib.runtime.BlazeModule; import com.google.devtools.build.lib.runtime.BlazeRuntime; +import com.google.devtools.build.lib.runtime.BlockWaitingModule; import com.google.devtools.build.lib.runtime.WorkspaceBuilder; import com.google.devtools.build.lib.testutil.ActionEventRecorder; import com.google.devtools.build.lib.testutil.TestConstants; @@ -34,6 +40,8 @@ import com.google.testing.junit.testparameterinjector.TestParameter; import com.google.testing.junit.testparameterinjector.TestParameterInjector; import java.io.IOException; +import org.junit.ClassRule; +import org.junit.Rule; import org.junit.Test; import org.junit.runner.RunWith; @@ -53,14 +61,18 @@ public final class RewindingTest extends BuildIntegrationTestCase { @TestParameter private boolean keepGoing; @TestParameter private boolean skymeld; + @ClassRule @Rule public static final WorkerInstance worker = IntegrationTestUtils.createWorker(); + private final ActionEventRecorder actionEventRecorder = new ActionEventRecorder(); private final RewindingTestsHelper helper = new RewindingTestsHelper(this, actionEventRecorder); @Override protected BlazeRuntime.Builder getRuntimeBuilder() throws Exception { return super.getRuntimeBuilder() + .addBlazeModule(new RemoteModule()) + .addBlazeModule(new BlockWaitingModule()) .addBlazeModule(new IncludeScanningModule()) - .addBlazeModule(helper.makeControllableActionStrategyModule("standalone")) + .addBlazeModule(helper.makeControllableActionStrategyModule("remote", "standalone")) .addBlazeModule(helper.getLostOutputsModule()) .addBlazeModule( new BlazeModule() { @@ -79,15 +91,27 @@ public void workspaceInit( }); } + @Override + protected ImmutableList getSpawnModules() { + return ImmutableList.builder() + .addAll(super.getSpawnModules()) + .add(new CredentialModule()) + .build(); + } + @Override protected void setupOptions() throws Exception { super.setupOptions(); addOptions( - "--spawn_strategy=standalone", + "--enable_runfiles", + "--spawn_strategy=remote", + "--remote_executor=grpc://localhost:" + worker.getPort(), + "--remote_download_regex=.*\\.inlined$", "--noexperimental_merged_skyframe_analysis_execution", "--rewind_lost_inputs", "--features=cc_include_scanning", "--experimental_remote_include_extraction_size_threshold=0", + "--experimental_inmemory_dotincludes_files", "--experimental_remote_cache_eviction_retries=0", "--track_incremental_state=" + trackIncrementalState, "--keep_going=" + keepGoing, @@ -150,7 +174,17 @@ public void dependentActionsReevaluated() throws Exception { } @Test - public void multipleLostInputsForRewindPlan() throws Exception { + public void multipleLostInputsForRewindPlan( + @TestParameter({"standalone", "remote"}) String producerStrategy, + @TestParameter({"standalone", "remote"}) String consumerStrategy) + throws Exception { + if (!AnalysisMock.get().isThisBazel()) { + // TODO: without this, test running internally hangs forever. Need to investigate why. + addOptions("--remote_cache_async=false"); + } + addOptions( + "--strategy_regexp=.*//test:rule.*=" + producerStrategy, + "--strategy_regexp=.*//test:consume.*=" + consumerStrategy); helper.runMultipleLostInputsForRewindPlan(); } @@ -281,12 +315,6 @@ public void flakyActionFailsAfterRewind_raceWithIndirectConsumer_undoneDuringInp helper.runFlakyActionFailsAfterRewind_raceWithIndirectConsumer_undoneDuringInputChecking(); } - @Test - public void flakyActionFailsAfterRewind_raceWithIndirectConsumer_undoneDuringLostInputHandling() - throws Exception { - helper.runFlakyActionFailsAfterRewind_raceWithIndirectConsumer_undoneDuringLostInputHandling(); - } - @Test public void discoveredCppModuleLost() throws Exception { skipIfBazel(); diff --git a/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTestsHelper.java b/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTestsHelper.java index d259e798c724e6..580bf1693ea8e0 100644 --- a/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTestsHelper.java +++ b/src/test/java/com/google/devtools/build/lib/skyframe/rewinding/RewindingTestsHelper.java @@ -39,7 +39,6 @@ import com.google.common.eventbus.Subscribe; import com.google.common.flogger.GoogleLogger; import com.google.common.util.concurrent.Uninterruptibles; -import com.google.devtools.build.lib.actions.Action; import com.google.devtools.build.lib.actions.ActionExecutionContext; import com.google.devtools.build.lib.actions.ActionInput; import com.google.devtools.build.lib.actions.ActionInputDepOwnerMap; @@ -106,6 +105,7 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.IntStream; /** * Implements rewinding-specific infrastructure and test logic used for rewinding tests. Search for @@ -167,21 +167,6 @@ public final BlazeModule getLostOutputsModule() { return lostOutputsModule; } - /** - * Returns whether the execution strategy can handle rewinding happening concurrently with another - * action consuming the rewound action's outputs. - * - *

When an action is rewound, it executes a second time, including the {@link Action#prepare} - * step which deletes previous outputs on disk. Another action which observed its dependency to be - * done (before rewinding was initiated) may simultaneously attempt to consume these deleted - * outputs, leading to a flaky build failure. If this method returns {@code false}, test cases - * which exercise the described scenario will set {@code --jobs=1} to avoid the race condition. - */ - @ForOverride - boolean supportsConcurrentRewinding() { - return false; - } - /** * Converts a file digest to a hex string compatible with the test's active {@link * com.google.devtools.build.lib.vfs.DigestHashFunction}. @@ -219,8 +204,8 @@ private Object[] filterExecutedSpawnDescriptions(String... expectedDescriptions) } public final ControllableActionStrategyModule makeControllableActionStrategyModule( - String identifier) { - return new ControllableActionStrategyModule(spawnController, identifier); + String... identifiers) { + return new ControllableActionStrategyModule(spawnController, identifiers); } public final ImmutableList getExecutedSpawnDescriptions() { @@ -330,7 +315,7 @@ static String latin1StringFromActionInput(ActionExecutionContext context, Action input.getExecPathString().endsWith(".inlined"), "Only inputs ending in .inlined are guaranteed readable. Tried to read: %s", input); - return new String(readContentAsLatin1(context.getInputPath(input))); + return new String(readContentAsLatin1(context.getInputPath(input))).replace("\r\n", "\n"); } /** @@ -352,7 +337,9 @@ final String buildAndGetOutput(String pkg, BuildIntegrationTestCase testCase) th return ExecResult.delegate(); }); testCase.buildTarget(String.format("//%s:consume_output", pkg)); - return invocationOutput.get(); + // Genrule output may have mixed line endings: source files written by testCase.write() use + // System.lineSeparator(), but echo in the genrule command always produces \n. + return invocationOutput.get().replace("\r\n", "\n").replace("\r", "\n"); } public final void runNoLossSmokeTest() throws Exception { @@ -765,9 +752,6 @@ private void writeNGenrulePackages(int n) throws IOException { * not contain the genrule action with one input. */ public final void runMultipleLostInputsForRewindPlan() throws Exception { - if (!supportsConcurrentRewinding()) { - testCase.addOptions("--jobs=1"); - } writeNGenrulePackages(ActionRewindStrategy.MAX_ACTION_REWIND_EVENTS + 1); for (int i = 1; i <= ActionRewindStrategy.MAX_ACTION_REWIND_EVENTS + 1; i++) { final int target = i; @@ -787,15 +771,16 @@ public final void runMultipleLostInputsForRewindPlan() throws Exception { } List rewoundKeys = collectOrderedRewoundKeys(); testCase.buildTarget( - "//test:consume_1", - "//test:consume_2", - "//test:consume_3", - "//test:consume_4", - "//test:consume_5", - "//test:consume_6"); + IntStream.rangeClosed(1, ActionRewindStrategy.MAX_ACTION_REWIND_EVENTS + 1) + .mapToObj(i -> "//test:consume_" + i) + .toArray(String[]::new)); assertOnlyActionsRewound(rewoundKeys); verifyAllSpawnShimsConsumed(); - recorder.assertTotalLostInputCountsFromStats(ImmutableList.of(21)); + recorder.assertTotalLostInputCountsFromStats( + ImmutableList.of( + (ActionRewindStrategy.MAX_ACTION_REWIND_EVENTS + 1) + * (ActionRewindStrategy.MAX_ACTION_REWIND_EVENTS + 2) + / 2)); } public final void runInterruptedDuringRewindStopsNormally() throws Exception { @@ -804,14 +789,13 @@ public final void runInterruptedDuringRewindStopsNormally() throws Exception { // build. The build should stop with an interrupt normally (and not crash). writeTwoGenrulePackage(testCase); - Thread mainThread = Thread.currentThread(); addSpawnShim( "Executing genrule //test:rule2", (spawn, context) -> { addSpawnShim( "Executing genrule //test:rule1", (ignoredSpawn, ignoredContext) -> { - mainThread.interrupt(); + Thread.currentThread().interrupt(); return ExecResult.delegate(); }); @@ -1563,10 +1547,6 @@ final void runTreeFileArtifactRewound(SpawnShim shim) throws Exception { addSpawnShim("Compiling tree/make_cc_dir.cc/file1.cc", shim); - if (!supportsConcurrentRewinding()) { - testCase.addOptions("--jobs=1"); - } - List rewoundKeys = collectOrderedRewoundKeys(); testCase.buildTarget("//tree:consumes_tree"); @@ -1677,10 +1657,6 @@ final void runTreeArtifactRewoundWhenTreeFilesLost( addSpawnShim("Linking tree/libconsumes_tree.so", shim); - if (!supportsConcurrentRewinding()) { - testCase.addOptions("--jobs=1"); - } - List rewoundKeys = collectOrderedRewoundKeys(); testCase.buildTarget("//tree:consumes_tree"); verifyAllSpawnShimsConsumed(); @@ -1859,6 +1835,7 @@ final void runGeneratedRunfilesRewound(ImmutableList lostRunfiles, Spawn HashSet expectedRewoundGenrules = new HashSet<>(ImmutableList.of("//middle:gen1", "//middle:gen2")); int i = 0; + boolean sourceManifestActionSeen = false; while (i < 5) { assertThat(rewoundKeys.get(i)).isInstanceOf(ActionLookupData.class); ActionLookupData actionKey = (ActionLookupData) rewoundKeys.get(i); @@ -1866,14 +1843,16 @@ final void runGeneratedRunfilesRewound(ImmutableList lostRunfiles, Spawn i++; if (actionLabel.equals("//middle:tool")) { switch (actionKey.getActionIndex()) { - case 0: // SymlinkAction - break; - case 1: // SourceManifestAction - assertActionKey(rewoundKeys.get(i), "//middle:tool", 2); - i++; - break; - default: - fail(String.format("Unexpected action index. actionKey: %s, i: %s", actionKey, i)); + // SymlinkAction + case 0 -> {} + case 1 -> sourceManifestActionSeen = true; + // SymlinkTreeAction + case 2 -> assertThat(sourceManifestActionSeen).isTrue(); + default -> + fail( + String.format( + "Unexpected action index. actionKey: %s, rewoundKeys: %s", + actionKey, rewoundKeys)); } } else { assertThat(expectedRewoundGenrules.remove(actionLabel)).isTrue(); @@ -2036,33 +2015,31 @@ final void runDupeDirectAndRunfilesDependencyRewound( if (buildRunfileManifests()) { assertThat(rewoundKeys).hasSize(6); - int i = 0; - while (i < 4) { + boolean sourceManifestActionSeen = false; + for (int i = 0; i < 4; i++) { assertThat(rewoundKeys.get(i)).isInstanceOf(ActionLookupData.class); ActionLookupData actionKey = (ActionLookupData) rewoundKeys.get(i); String actionLabel = actionKey.getLabel().getCanonicalForm(); - i++; if (actionLabel.equals("//test:tool")) { switch (actionKey.getActionIndex()) { - case 0: // SymlinkAction - break; - case 1: // SourceManifestAction - assertActionKey(rewoundKeys.get(i), "//test:tool", /* index= */ 2); - i++; - break; - default: - fail( - String.format( - "Unexpected action index. actionKey: %s, rewoundKeys: %s", - actionKey, rewoundKeys)); + // SymlinkAction + case 0 -> {} + case 1 -> sourceManifestActionSeen = true; + // SymlinkTreeAction + case 2 -> assertThat(sourceManifestActionSeen).isTrue(); + default -> + fail( + String.format( + "Unexpected action index. actionKey: %s, rewoundKeys: %s", + actionKey, rewoundKeys)); } } else { assertThat(actionLabel).isEqualTo("//test:rule1"); } } - assertActionKey(rewoundKeys.get(i++), "//test:tool", /* index= */ 3); - assertArtifactKey(rewoundKeys.get(i), "_middlemen/test_Stool-runfiles"); + assertActionKey(rewoundKeys.get(4), "//test:tool", /* index= */ 3); + assertArtifactKey(rewoundKeys.get(5), "_middlemen/test_Stool-runfiles"); } else { assertThat(rewoundKeys).hasSize(4); int i = 0; @@ -2192,6 +2169,7 @@ def _tree_impl(ctx): if (buildRunfileManifests()) { assertThat(rewoundKeys).hasSize(7); int i = 0; + boolean sourceManifestActionSeen = false; while (i < 5) { assertThat(rewoundKeys.get(i)).isInstanceOf(ActionLookupData.class); ActionLookupData actionKey = (ActionLookupData) rewoundKeys.get(i); @@ -2199,14 +2177,16 @@ def _tree_impl(ctx): i++; if (actionLabel.equals("//middle:tool")) { switch (actionKey.getActionIndex()) { - case 0: // SymlinkAction - break; - case 1: // SourceManifestAction - assertActionKey(rewoundKeys.get(i), "//middle:tool", 2); - i++; - break; - default: - fail(String.format("Unexpected action index. actionKey: %s", actionKey)); + // SymlinkAction + case 0 -> {} + case 1 -> sourceManifestActionSeen = true; + // SymlinkTreeAction + case 2 -> assertThat(sourceManifestActionSeen).isTrue(); + default -> + fail( + String.format( + "Unexpected action index. actionKey: %s, rewoundKeys: %s", + actionKey, rewoundKeys)); } } else { assertThat(actionLabel).isEqualTo("//middle:gen_tree");