fix review findings, asked Claude to rewrite docs of KeyedPartitioning

peter-toth · peter-toth · commit fef4c229cea8 · 2026-02-19T17:02:51.000+01:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -349,26 +349,54 @@ case class CoalescedHashPartitioning(from: HashPartitioning, partitions: Seq[Coa
 
 /**
  * Represents a partitioning where rows are split across partitions based on transforms defined by
- * `expressions`. `partitionKeys`, should contain value of partition key(s) in ascending order,
- * after evaluated by the transforms in `expressions`, for each input partition.
- * `partitionKeys` might not be unique when this partitioning is returned from a data source, but
- * the `GroupPartitionsExec` operator can group partitions with the same key and so make
- * `partitionKeys` unique.
+ * `expressions`.
  *
- * The `originalPartitionKeys`, on the other hand, are partition values from the original input
- * splits returned by data sources. It may contain duplicated values.
+ * == Partition Keys ==
+ * This partitioning has two sets of partition keys:
  *
- * For example, if a data source reports partition transform expressions `[years(ts_col)]` with 4
- * input splits whose corresponding partition values are `[0, 1, 2, 2]`, then the `expressions` in
- * this case is `[years(ts_col)]`, while both `partitionKeys` and `originalPartitionKeys` are
- * `[0, 1, 2, 2]`.
- * After placing a `GroupPartitionsExec` operator on top of the data source, `partitionKeys` becomes
- * `[0, 1, 2]` but `originalPartitionKeys` remains `[0, 1, 2, 2]`.
+ * - `partitionKeys`: The current partition key for each partition, in ascending order. May contain
+ *   duplicates when first created from a data source, but becomes unique after grouping.
  *
- * @param expressions Partition expressions for the partitioning.
- * @param partitionKeys The keys for the partitions, must be in ascending order.
- * @param originalPartitionKeys The original partition keys before any grouping has been applied by
- *                              a `GroupPartitionsExec` operator, must be in ascending order.
+ * - `originalPartitionKeys`: The original partition keys from the data source, in ascending order.
+ *   Always preserves the original values, even after grouping. Used to track the original
+ *   distribution for optimization purposes.
+ *
+ * == Grouping State ==
+ * A KeyedPartitioning can be in two states:
+ *
+ * - '''Ungrouped''' (when `isGrouped == false`): `partitionKeys` contains duplicates, i.e.
+ *   multiple input partitions share a key. A data source may report its partitions in this state.
+ *
+ * - '''Grouped''' (when `isGrouped == true`): `partitionKeys` contains only unique values. Each
+ *   partition has a distinct key. This state is achieved by applying `GroupPartitionsExec`, which
+ *   coalesces partitions with the same key.
+ *
+ * == Example ==
+ * Consider a data source with partition transform `[years(ts_col)]` and 4 input splits:
+ *
+ * '''Before GroupPartitionsExec''' (ungrouped):
+ * {{{
+ *   expressions:           [years(ts_col)]
+ *   partitionKeys:         [0, 1, 2, 2]    // the last two partitions share the same key
+ *   originalPartitionKeys: [0, 1, 2, 2]
+ *   numPartitions:         4
+ *   isGrouped:             false
+ * }}}
+ *
+ * '''After GroupPartitionsExec''' (grouped):
+ * {{{
+ *   expressions:           [years(ts_col)]
+ *   partitionKeys:         [0, 1, 2]       // duplicates removed, partitions coalesced
+ *   originalPartitionKeys: [0, 1, 2, 2]    // unchanged, preserves original distribution
+ *   numPartitions:         3
+ *   isGrouped:             true
+ * }}}
+ *
+ * @param expressions Partition transform expressions (e.g., `years(col)`, `bucket(10, col)`).
+ * @param partitionKeys Current partition keys, one per partition, in ascending order.
+ *                      May contain duplicates before grouping.
+ * @param originalPartitionKeys Original partition keys from the data source, in ascending order.
+ *                              Preserves the initial distribution even after grouping.
  */
 case class KeyedPartitioning(
     expressions: Seq[Expression],
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/GroupPartitionsExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/GroupPartitionsExec.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources.v2
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.Partition
+import org.apache.spark.{Partition, SparkException}
 import org.apache.spark.rdd.{CoalescedRDD, PartitionCoalescer, PartitionGroup, RDD}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
@@ -94,7 +94,8 @@ case class GroupPartitionsExec(
   lazy val firstKeyedPartitioning = {
     child.outputPartitioning.asInstanceOf[Partitioning with Expression].collectFirst {
       case k: KeyedPartitioning => k
-    }.get
+    }.getOrElse(
+      throw new SparkException("GroupPartitionsExec requires a child with KeyedPartitioning"))
   }
 
   /**