
Commit c6a2d79

berengdriftx authored and committed
HCD-130 incremental repair failure during compaction (#1728)
### What is the issue

Concurrent and incremental repairs would spin-fail or deadlock.

### What does this PR fix and why was it fixed

Concurrent and incremental repairs would spin-fail. This patch:

- Removes an optimization that failed to observe max parallelism
- Provides an improved algorithm to enforce max parallelism
- Closes transactions on exceptions that previously went uncaught
- Removes a deadlock between the cfs and the compaction strategy during long-running sequential operations
1 parent 37372a7 commit c6a2d79
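
The deadlock in the last bullet is presumably a lock-order inversion: one thread holds the store's own monitor while waiting on compaction-strategy state, while a strategy thread holds its own monitor and calls back into the store. The following is a minimal, deliberately deadlocking illustration of that shape — the class and method names are invented for the example and this is an assumed mechanism, not the exact call paths from the patch:

```java
// Hypothetical sketch of a lock-order inversion between two monitors.
// Run it and both threads block forever.
class DeadlockShape
{
    static final Object storeMonitor = new Object();
    static final Object strategyMonitor = new Object();

    public static void main(String[] args)
    {
        Thread longRunningOp = new Thread(() -> {
            synchronized (storeMonitor)            // e.g. a long-running serialized operation holding the cfs monitor
            {
                pause();
                synchronized (strategyMonitor) { } // ...then touching strategy state
            }
        });
        Thread strategyWork = new Thread(() -> {
            synchronized (strategyMonitor)         // e.g. a synchronized strategy method
            {
                pause();
                synchronized (storeMonitor) { }    // ...calling back into the store
            }
        });
        longRunningOp.start();
        strategyWork.start();
    }

    static void pause()
    {
        try { Thread.sleep(200); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
    }
}
```

The ColumnFamilyStore diff below breaks one edge of this cycle by moving the long-running path off the store's own monitor.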

File tree

6 files changed: +293 −62 lines changed

src/java/org/apache/cassandra/db/ColumnFamilyStore.java

Lines changed: 28 additions & 11 deletions

@@ -42,6 +42,7 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.locks.ReentrantLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.function.Consumer;
 import java.util.function.Supplier;
@@ -435,6 +436,8 @@ TablePaxosRepairHistory get()
 
     private final RequestTracker requestTracker = RequestTracker.instance;
 
+    private final ReentrantLock longRunningSerializedOperationsLock = new ReentrantLock();
+
     public static void shutdownPostFlushExecutor() throws InterruptedException
     {
         postFlushExecutor.shutdown();
@@ -3126,7 +3129,8 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable,
     {
         // synchronize so that concurrent invocations don't re-enable compactions partway through unexpectedly,
         // and so we only run one major compaction at a time
-        synchronized (this)
+        longRunningSerializedOperationsLock.lock();
+        try
         {
             logger.debug("Cancelling in-progress compactions for {}", metadata.name);
             Iterable<ColumnFamilyStore> toInterruptFor = concatWith(interruptIndexes, interruptViews);
@@ -3154,16 +3158,9 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable,
             CompactionManager.instance.waitForCessation(toInterruptFor, sstablesPredicate);
 
             // doublecheck that we finished, instead of timing out
-            for (ColumnFamilyStore cfs : toInterruptFor)
-            {
-                if (cfs.getTracker().getCompacting().stream().anyMatch(sstablesPredicate))
-                {
-                    logger.warn("Unable to cancel in-progress compactions for {}. " +
-                                "Perhaps there is an unusually large row in progress somewhere, or the system is simply overloaded.",
-                                metadata.name);
-                    return null;
-                }
-            }
+            if (!allCompactionsFinished(toInterruptFor, sstablesPredicate))
+                return null;
+
             logger.trace("Compactions successfully cancelled");
 
             // run our task
@@ -3177,6 +3174,26 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable,
                 }
             }
         }
+        finally
+        {
+            longRunningSerializedOperationsLock.unlock();
+        }
+    }
+
+    private boolean allCompactionsFinished(Iterable<ColumnFamilyStore> cfss, Predicate<SSTableReader> sstablesPredicate)
+    {
+        for (ColumnFamilyStore cfs : cfss)
+        {
+            if (cfs.getTracker().getCompacting().stream().anyMatch(sstablesPredicate))
+            {
+                logger.warn("Unable to cancel in-progress compactions for {}.{}. Perhaps there is an unusually " +
+                            "large row in progress somewhere, or the system is simply overloaded.", metadata.keyspace, metadata.name);
+                logger.debug("In-flight compactions: {}", Arrays.toString(cfs.getTracker().getCompacting().toArray()));
+                return false;
+            }
+        }
+
+        return true;
     }
 
     private static CompactionManager.CompactionPauser pauseCompactionStrategies(Iterable<ColumnFamilyStore> toPause)
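
The core change here swaps `synchronized (this)` for a dedicated `ReentrantLock`, so long-running serialized operations no longer hold the monitor that other code paths (including the compaction strategy) may also need. A minimal sketch of the pattern under assumed names — `StoreLike` and `runSerialized` are illustrative, not from the patch:

```java
import java.util.concurrent.Callable;
import java.util.concurrent.locks.ReentrantLock;

// Illustrative stand-in for the ColumnFamilyStore change: serialize long
// operations on a private lock instead of the instance's own monitor.
class StoreLike
{
    private final ReentrantLock longRunningSerializedOperationsLock = new ReentrantLock();

    <V> V runSerialized(Callable<V> task) throws Exception
    {
        longRunningSerializedOperationsLock.lock();
        try
        {
            return task.call();  // at most one long-running operation at a time
        }
        finally
        {
            longRunningSerializedOperationsLock.unlock();  // released even if task throws
        }
    }
}
```

Unlike `synchronized (this)`, nothing else can accidentally contend on this private lock, and the explicit `try`/`finally` mirrors the shape of the patched `runWithCompactionsDisabled`.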

src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java

Lines changed: 1 addition & 1 deletion

@@ -171,7 +171,7 @@ void setSubmitted(CompactionStrategy strategy, TimeUUID id, CompactionAggregate
         if (id == null || aggregate == null)
             throw new IllegalArgumentException("arguments cannot be null");
 
-        logger.debug("Submitting background compaction {}", id);
+        logger.debug("Submitting background compaction {} for {}.{}", id, metadata.keyspace, metadata.name);
         CompactionPick compaction = aggregate.getSelected();
 
         CompactionPick prev = compactions.put(id, compaction);

src/java/org/apache/cassandra/db/compaction/ShardManager.java

Lines changed: 65 additions & 44 deletions

@@ -21,6 +21,7 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.PriorityQueue;
 import java.util.Set;
@@ -267,68 +268,88 @@ default <T, R extends CompactionSSTable> List<T> splitSSTablesInShardsLimited(Co
     {
         if (coveredShards <= maxParallelism)
             return splitSSTablesInShards(sstables, operationRange, numShardsForDensity, maker);
-        // We may be in a simple case where we can reduce the number of shards by some power of 2.
-        int multiple = Integer.highestOneBit(coveredShards / maxParallelism);
-        if (maxParallelism * multiple == coveredShards)
-            return splitSSTablesInShards(sstables, operationRange, numShardsForDensity / multiple, maker);
 
         var shards = splitSSTablesInShards(sstables,
                                            operationRange,
                                            numShardsForDensity,
                                            (rangeSSTables, range) -> Pair.create(Set.copyOf(rangeSSTables), range));
+
         return applyMaxParallelism(maxParallelism, maker, shards);
     }
 
-    private static <T, R extends CompactionSSTable> List<T> applyMaxParallelism(int maxParallelism, BiFunction<Collection<R>, Range<Token>, T> maker, List<Pair<Set<R>, Range<Token>>> shards)
+    private static <T, R extends CompactionSSTable> List<T> applyMaxParallelism(int maxParallelism,
+                                                                                BiFunction<Collection<R>, Range<Token>, T> maker,
+                                                                                List<Pair<Set<R>, Range<Token>>> shards)
     {
-        int actualParallelism = shards.size();
-        if (maxParallelism >= actualParallelism)
-        {
-            // We can fit within the parallelism limit without grouping, because some ranges are empty.
-            // This is not expected to happen often, but if it does, take advantage.
-            List<T> tasks = new ArrayList<>();
-            for (Pair<Set<R>, Range<Token>> pair : shards)
-                tasks.add(maker.apply(pair.left, pair.right));
-            return tasks;
-        }
-
-        // Otherwise we have to group shards together. Define a target token span per task and greedily group
-        // to be as close to it as possible.
-        double spanPerTask = shards.stream().map(Pair::right).mapToDouble(t -> t.left.size(t.right)).sum() / maxParallelism;
-        double currentSpan = 0;
-        Set<R> currentSSTables = new HashSet<>();
-        Token rangeStart = null;
-        Token prevEnd = null;
+        Iterator<Pair<Set<R>, Range<Token>>> iter = shards.iterator();
         List<T> tasks = new ArrayList<>(maxParallelism);
-        for (var pair : shards)
+        int shardsRemaining = shards.size();
+        int tasksRemaining = maxParallelism;
+
+        if (shardsRemaining > tasksRemaining)
         {
-            final Token currentEnd = pair.right.right;
-            final Token currentStart = pair.right.left;
-            double span = currentStart.size(currentEnd);
-            if (rangeStart == null)
-                rangeStart = currentStart;
-            if (currentSpan + span >= spanPerTask - 0.001) // rounding error safety
+            double totalSpan = shards.stream().map(Pair::right).mapToDouble(r -> r.left.size(r.right)).sum();
+            double spanPerTask = totalSpan / maxParallelism;
+
+            Set<R> currentSSTables = new HashSet<>();
+            Token rangeStart = null;
+            double currentSpan = 0;
+
+            // While we have more shards to process than there are tasks, we need to bunch shards up into tasks.
+            while (shardsRemaining > tasksRemaining)
             {
-                boolean includeCurrent = currentSpan + span - spanPerTask <= spanPerTask - currentSpan;
-                if (includeCurrent)
-                    currentSSTables.addAll(pair.left);
-                tasks.add(maker.apply(currentSSTables, new Range<>(rangeStart, includeCurrent ? currentEnd : prevEnd)));
-                currentSpan -= spanPerTask;
-                rangeStart = null;
-                currentSSTables.clear();
-                if (!includeCurrent)
-                {
-                    currentSSTables.addAll(pair.left);
+                Pair<Set<R>, Range<Token>> pair = iter.next(); // shardsRemaining counts the shards so iter can't be exhausted at this point
+                Token currentStart = pair.right.left;
+                Token currentEnd = pair.right.right;
+                double span = currentStart.size(currentEnd);
+
+                if (rangeStart == null)
                     rangeStart = currentStart;
+
+                currentSSTables.addAll(pair.left);
+                currentSpan += span;
+
+                // If there is only one task remaining, we should not issue it until we are processing the last shard.
+                // The latter condition is normally guaranteed, but floating-point rounding has a very small chance of making the calculations wrong.
+                if (currentSpan >= spanPerTask && tasksRemaining > 1)
+                {
+                    tasks.add(maker.apply(currentSSTables, new Range<>(rangeStart, currentEnd)));
+                    --tasksRemaining;
+                    currentSSTables = new HashSet<>();
+                    rangeStart = null;
+                    currentSpan = 0;
                 }
+                --shardsRemaining;
             }
-            else
+
+            // At this point there are as many tasks remaining as there are shards
+            // (this includes the case of issuing a task for the last shard when only one task remains).
+
+            // Add any already collected sstables to the next task.
+            if (!currentSSTables.isEmpty())
+            {
+                assert shardsRemaining > 0;
+                Pair<Set<R>, Range<Token>> pair = iter.next(); // shardsRemaining counts the shards so iter can't be exhausted at this point
                 currentSSTables.addAll(pair.left);
+                Token currentEnd = pair.right.right;
+                tasks.add(maker.apply(currentSSTables, new Range<>(rangeStart, currentEnd)));
+                --tasksRemaining;
+                --shardsRemaining;
+            }
+            assert shardsRemaining == tasksRemaining : shardsRemaining + " != " + tasksRemaining;
+        }
 
-            currentSpan += span;
-            prevEnd = currentEnd;
+        // If we still have tasks and shards to process, produce one task for each shard.
+        while (iter.hasNext())
+        {
+            Pair<Set<R>, Range<Token>> pair = iter.next();
+            tasks.add(maker.apply(pair.left, pair.right));
+            --tasksRemaining;
+            --shardsRemaining;
         }
-        assert currentSSTables.isEmpty();
+
+        assert tasks.size() == Math.min(maxParallelism, shards.size()) : tasks.size() + " != " + maxParallelism;
+        assert shardsRemaining == 0 : shardsRemaining + " != 0";
        return tasks;
     }
 
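
The removed shortcut ("reduce the number of shards by some power of 2") is the optimization the commit message says failed to observe max parallelism; the rewrite enforces the limit structurally. Its invariant: bunch shards into tasks while shards outnumber the remaining tasks, then emit exactly one task per remaining shard. Below is a simplified, self-contained sketch of that scheme operating on plain span weights instead of sstables and token ranges — all names here are illustrative, not from the patch:

```java
import java.util.ArrayList;
import java.util.List;

class ShardGrouping
{
    // Group shard spans into at most maxParallelism tasks (maxParallelism >= 1 assumed).
    static List<List<Double>> group(double[] shardSpans, int maxParallelism)
    {
        List<List<Double>> tasks = new ArrayList<>();
        int shardsRemaining = shardSpans.length;
        int tasksRemaining = maxParallelism;

        double totalSpan = 0;
        for (double s : shardSpans)
            totalSpan += s;
        double spanPerTask = totalSpan / maxParallelism;

        List<Double> current = new ArrayList<>();
        double currentSpan = 0;
        int i = 0;

        // Phase 1: while shards outnumber the remaining tasks, bunch them up.
        while (shardsRemaining > tasksRemaining)
        {
            current.add(shardSpans[i]);
            currentSpan += shardSpans[i];
            i++;
            --shardsRemaining;
            // Never spend the last task before reaching the last shard.
            if (currentSpan >= spanPerTask && tasksRemaining > 1)
            {
                tasks.add(current);
                --tasksRemaining;
                current = new ArrayList<>();
                currentSpan = 0;
            }
        }

        // Attach any partially filled group to the next shard and emit it.
        if (!current.isEmpty())
        {
            current.add(shardSpans[i++]);
            tasks.add(current);
            --tasksRemaining;
            --shardsRemaining;
        }

        // Phase 2: exactly one task per remaining shard.
        while (i < shardSpans.length)
        {
            List<Double> single = new ArrayList<>();
            single.add(shardSpans[i++]);
            tasks.add(single);
            --tasksRemaining;
            --shardsRemaining;
        }

        assert tasks.size() == Math.min(maxParallelism, shardSpans.length);
        return tasks;
    }

    public static void main(String[] args)
    {
        // Seven equal shards, at most three tasks: three groups covering all seven shards.
        System.out.println(group(new double[]{ 1, 1, 1, 1, 1, 1, 1 }, 3));
    }
}
```

Each phase-1 iteration consumes exactly one shard and the loop exits as soon as the two counts are equal, so the result always has exactly min(maxParallelism, shardCount) groups — the property the patched code asserts at the end.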

src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java

Lines changed: 17 additions & 6 deletions

@@ -271,12 +271,13 @@ public synchronized CompactionTasks getMaximalTasks(long gcBefore, boolean split
             permittedParallelism = Integer.MAX_VALUE;
 
         List<AbstractCompactionTask> tasks = new ArrayList<>();
+        LifecycleTransaction txn = null;
         try
         {
             // Split the space into independently compactable groups.
             for (var aggregate : getMaximalAggregates())
             {
-                LifecycleTransaction txn = realm.tryModify(aggregate.getSelected().sstables(),
+                txn = realm.tryModify(aggregate.getSelected().sstables(),
                                                            OperationType.COMPACTION,
                                                            aggregate.getSelected().id());
 
@@ -296,6 +297,8 @@ public synchronized CompactionTasks getMaximalTasks(long gcBefore, boolean split
         }
         catch (Throwable t)
         {
+            if (txn != null)
+                txn.close();
             throw rejectTasks(tasks, t);
         }
     }
@@ -442,9 +445,17 @@ public void createAndAddTasks(long gcBefore, CompactionAggregate.UnifiedAggregat
                                                              selected.id());
         if (transaction != null)
         {
-            // This will ignore the range of the operation, which is fine.
-            backgroundCompactions.setSubmitted(this, transaction.opId(), aggregate);
-            createAndAddTasks(gcBefore, transaction, aggregate.operationRange(), aggregate.keepOriginals(), getShardingStats(aggregate), parallelism, tasks, additionalObserver);
+            try
+            {
+                // This will ignore the range of the operation, which is fine.
+                backgroundCompactions.setSubmitted(this, transaction.opId(), aggregate);
+                createAndAddTasks(gcBefore, transaction, aggregate.operationRange(), aggregate.keepOriginals(), getShardingStats(aggregate), parallelism, tasks, additionalObserver);
+            }
+            catch (Throwable e)
+            {
+                transaction.close();
+                throw e;
+            }
         }
         else
         {
@@ -744,8 +755,8 @@ private Collection<UnifiedCompactionTask> createParallelCompactionTasks(Lifecycl
                                                sharedOperation)
         );
         compositeTransaction.completeInitialization();
-        assert tasks.size() <= parallelism;
-        assert tasks.size() <= coveredShardCount;
+        assert tasks.size() <= parallelism : "Task size: " + tasks.size() + " vs parallelism of: " + parallelism;
+        assert tasks.size() <= coveredShardCount : "Task size: " + tasks.size() + " vs covered shard count: " + coveredShardCount;
 
         if (tasks.isEmpty())
             transaction.close(); // this should not be reachable normally, close the transaction for safety
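
Both hunks apply one rule: once a `LifecycleTransaction` is open, any exception raised before a task takes ownership must close it before propagating — per the commit message, these were the "transactions on some exceptions failing to be caught", whose leaked sstable reservations left later operations spinning. A minimal sketch of the guard, with a hypothetical `Transaction` type standing in for `LifecycleTransaction`:

```java
// Hypothetical Transaction type; LifecycleTransaction itself has a richer API.
class TransactionGuard
{
    interface Transaction
    {
        void close(); // releases the sstables the transaction reserved
    }

    // Run task creation; on any failure, release the transaction before rethrowing.
    static void createTasksSafely(Transaction transaction, Runnable taskCreation)
    {
        try
        {
            taskCreation.run();
        }
        catch (Throwable t)
        {
            transaction.close(); // without this, the reservation leaks
            throw t;             // precise rethrow: only unchecked throwables can reach here
        }
    }
}
```

In `getMaximalTasks` the same guard required hoisting `txn` out of the loop so the `catch (Throwable t)` block can close whichever transaction was in flight when the failure occurred.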

test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java

Lines changed: 54 additions & 0 deletions

@@ -31,6 +31,7 @@
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicReference;
@@ -53,6 +54,7 @@
 import org.apache.cassandra.db.ColumnFamilyStore.FlushReason;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.TableOperation;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
@@ -137,6 +139,58 @@ public void truncateCFS()
         Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1).truncateBlocking();
     }
 
+    @Test
+    public void testRWCDLocking() throws InterruptedException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+        CountDownLatch task1StartedLatch = new CountDownLatch(1);
+        CountDownLatch task1FinishLatch = new CountDownLatch(1);
+        CountDownLatch task2StartedLatch = new CountDownLatch(1);
+
+        Thread task1 = new Thread(() -> {
+            cfs.runWithCompactionsDisabled(() -> {
+                task1StartedLatch.countDown();
+                try
+                {
+                    task1FinishLatch.await();
+                }
+                catch (InterruptedException e)
+                {
+                    throw new RuntimeException(e);
+                }
+                return null;
+            },
+            OperationType.P0,
+            true,
+            true,
+            TableOperation.StopTrigger.UNIT_TESTS);
+        });
+        task1.start();
+
+        Thread task2 = new Thread(() -> {
+            cfs.runWithCompactionsDisabled(() -> {
+                task2StartedLatch.countDown();
+                return null;
+            },
+            OperationType.P0,
+            true,
+            true,
+            TableOperation.StopTrigger.UNIT_TESTS);
+        });
+        task2.start();
+
+        // Check that task1 started but task2 is waiting
+        assertTrue(task1StartedLatch.await(30, TimeUnit.SECONDS));
+        assertEquals(1, task2StartedLatch.getCount());
+
+        // Allow task1 to complete and check that task2 proceeds next
+        task1FinishLatch.countDown();
+        assertTrue(task2StartedLatch.await(30, TimeUnit.SECONDS));
+
+        task1.join();
+        task2.join();
+    }
+
     @Test
     public void testMemtableTimestamp() throws Throwable
     {
