
Commit 135f149

committed
remove inadvertent changes, make comments more succinct
1 parent b30195c commit 135f149

File tree: 3 files changed (+27, -123 lines)


crates/iceberg/src/arrow/reader.rs

Lines changed: 7 additions & 14 deletions
@@ -235,8 +235,6 @@ impl ArrowReader {
         // RecordBatchTransformer performs any transformations required on the RecordBatches
         // that come back from the file, such as type promotion, default column insertion
         // and column re-ordering.
-        // Always use build_with_partition_data to ensure name_mapping is passed through,
-        // even when partition spec/data aren't available.
         let mut record_batch_transformer = RecordBatchTransformer::build_with_partition_data(
             task.schema_ref(),
             task.project_field_ids(),
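The comment retained above summarizes what RecordBatchTransformer does: type promotion, default column insertion, and column re-ordering. The following is a minimal, dependency-free sketch of the re-ordering and default-insertion idea, using plain vectors instead of Arrow RecordBatches; the `project_columns` function and its types are hypothetical illustrations, not the crate's actual API.

```rust
// Conceptual sketch only: plain vectors stand in for Arrow columns.
fn project_columns(
    file_columns: &[(&str, Vec<i64>)],  // columns as they appear in the data file
    projected: &[(&str, Option<i64>)],  // desired output order, with optional defaults
    num_rows: usize,
) -> Vec<(String, Vec<i64>)> {
    projected
        .iter()
        .map(|(name, default)| {
            match file_columns.iter().find(|(col, _)| col == name) {
                // Column exists in the file: reuse it, re-ordered to the projected position.
                Some((_, values)) => (name.to_string(), values.clone()),
                // Column missing from the file: fill with its default (0 here for brevity).
                None => (name.to_string(), vec![default.unwrap_or(0); num_rows]),
            }
        })
        .collect()
}

fn main() {
    let file_columns = [("b", vec![10, 20]), ("a", vec![1, 2])];
    let projected = [("a", None), ("b", None), ("c", Some(99))];
    let out = project_columns(&file_columns, &projected, 2);
    assert_eq!(out[0], ("a".to_string(), vec![1, 2]));   // re-ordered
    assert_eq!(out[2], ("c".to_string(), vec![99, 99])); // default-inserted
}
```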
@@ -1949,7 +1947,7 @@ message schema {
             start: 0,
             length: 0,
             record_count: None,
-            data_file_path: format!("{}/1.parquet", table_location),
+            data_file_path: format!("{table_location}/1.parquet"),
             data_file_format: DataFileFormat::Parquet,
             schema: schema.clone(),
             project_field_ids: vec![1],
@@ -2323,21 +2321,16 @@ message schema {
             .unwrap();

         let total_rows_task2: usize = result2.iter().map(|b| b.num_rows()).sum();
-        println!(
-            "Task 2 (bytes {}-{}) returned {} rows",
-            rg1_start, file_end, total_rows_task2
-        );
+        println!("Task 2 (bytes {rg1_start}-{file_end}) returned {total_rows_task2} rows");

         assert_eq!(
             total_rows_task1, 100,
-            "Task 1 should read only the first row group (100 rows), but got {} rows",
-            total_rows_task1
+            "Task 1 should read only the first row group (100 rows), but got {total_rows_task1} rows"
         );

         assert_eq!(
             total_rows_task2, 200,
-            "Task 2 should read only the second+third row groups (200 rows), but got {} rows",
-            total_rows_task2
+            "Task 2 should read only the second+third row groups (200 rows), but got {total_rows_task2} rows"
         );

         // Verify the actual data values are correct (not just the row count)
@@ -2348,7 +2341,7 @@ message schema {
             .as_primitive::<arrow_array::types::Int32Type>();
         let first_val = id_col.value(0);
         let last_val = id_col.value(id_col.len() - 1);
-        println!("Task 1 data range: {} to {}", first_val, last_val);
+        println!("Task 1 data range: {first_val} to {last_val}");

         assert_eq!(first_val, 0, "Task 1 should start with id=0");
         assert_eq!(last_val, 99, "Task 1 should end with id=99");
@@ -2360,7 +2353,7 @@ message schema {
             .column(0)
             .as_primitive::<arrow_array::types::Int32Type>();
         let first_val = id_col.value(0);
-        println!("Task 2 first value: {}", first_val);
+        println!("Task 2 first value: {first_val}");

         assert_eq!(first_val, 100, "Task 2 should start with id=100, not id=0");
     }
@@ -2418,7 +2411,7 @@ message schema {
             start: 0,
             length: 0,
             record_count: None,
-            data_file_path: format!("{}/old_file.parquet", table_location),
+            data_file_path: format!("{table_location}/old_file.parquet"),
             data_file_format: DataFileFormat::Parquet,
             schema: new_schema.clone(),
             project_field_ids: vec![1, 2], // Request both columns 'a' and 'b'

crates/iceberg/src/arrow/record_batch_transformer.rs

Lines changed: 20 additions & 102 deletions
@@ -37,56 +37,18 @@ use crate::{Error, ErrorKind, Result};

 /// Build a map of field ID to constant value for identity-partitioned fields.
 ///
-/// This implements the Iceberg spec's "Column Projection" rule #1
-/// (https://iceberg.apache.org/spec/#column-projection):
-/// > "Return the value from partition metadata if an Identity Transform exists for the field
-/// > and the partition value is present in the `partition` struct on `data_file` object
-/// > in the manifest."
+/// Implements Iceberg spec "Column Projection" rule #1: use partition metadata constants
+/// only for identity-transformed fields. Non-identity transforms (bucket, truncate, year, etc.)
+/// store derived values in partition metadata, so source columns must be read from data files.
 ///
-/// This matches Java's `PartitionUtil.constantsMap()` which only adds fields where:
-/// ```java
-/// if (field.transform().isIdentity()) {
-///     idToConstant.put(field.sourceId(), converted);
-/// }
-/// ```
+/// Example: For `bucket(4, id)`, partition metadata has `id_bucket = 2` (bucket number),
+/// but the actual `id` values (100, 200, 300) are only in the data file.
 ///
-/// # Why only identity transforms?
-///
-/// Non-identity transforms (bucket, truncate, year, month, day, hour) produce DERIVED values
-/// that differ from the source column values. For example:
-/// - `bucket(4, id)` produces hash values 0-3, not the actual `id` values
-/// - `day(timestamp)` produces day-since-epoch integers, not the timestamp values
-///
-/// These source columns MUST be read from the data file because partition metadata only
-/// stores the transformed values (e.g., bucket number), not the original column values.
-///
-/// # Java Implementation Reference
-///
-/// This matches Java's `PartitionUtil.constantsMap()` (util/PartitionUtil.java):
-/// ```java
-/// public static Map<Integer, Object> constantsMap(PartitionData data, PartitionSpec spec) {
-///     Map<Integer, Object> idToConstant = Maps.newHashMap();
-///     for (int pos = 0; pos < spec.fields().size(); pos += 1) {
-///         PartitionField field = spec.fields().get(pos);
-///         if (field.transform().isIdentity()) { // <-- ONLY identity transforms
-///             Object converted = convertConstant(field.sourceId(), data.get(pos, javaClass));
-///             idToConstant.put(field.sourceId(), converted);
-///         }
-///     }
-///     return idToConstant;
-/// }
-/// ```
-///
-/// # Example: Bucket Partitioning
-///
-/// For a table partitioned by `bucket(4, id)`:
-/// - Partition metadata stores: `id_bucket = 2` (the bucket number)
-/// - Data file contains: `id = 100, 200, 300` (the actual values)
-/// - Reading must use data from the file, not the constant `2` from partition metadata
+/// Matches Java's `PartitionUtil.constantsMap()` which filters `if (field.transform().isIdentity())`.
 ///
 /// # References
-/// - Iceberg spec: format/spec.md "Column Projection" section
-/// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java:constantsMap()
+/// - Spec: https://iceberg.apache.org/spec/#column-projection
+/// - Java: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java:constantsMap()
 fn constants_map(
     partition_spec: &PartitionSpec,
     partition_data: &Struct,
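The shortened doc comment above keeps the key rule: only identity-transformed partition fields become constants. Below is a minimal, self-contained sketch of that rule; the simplified `Transform`, `PartitionField`, and `constants_map` types and signatures are hypothetical stand-ins for illustration, not the crate's actual API.

```rust
// Illustrative sketch: identity-only constants map.
use std::collections::HashMap;

#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq)]
enum Transform {
    Identity,
    Bucket(u32),
    Truncate(u32),
    Year,
}

struct PartitionField {
    source_id: i32, // field ID of the source column in the table schema
    transform: Transform,
}

/// Map source field ID -> constant partition value, but only for identity transforms.
/// Non-identity transforms store derived values (e.g. a bucket number), so their
/// source columns must still be read from the data file.
fn constants_map(
    fields: &[PartitionField],
    partition_values: &[Option<String>],
) -> HashMap<i32, String> {
    fields
        .iter()
        .zip(partition_values)
        .filter(|(field, _)| field.transform == Transform::Identity)
        .filter_map(|(field, value)| value.clone().map(|v| (field.source_id, v)))
        .collect()
}

fn main() {
    // Table partitioned by bucket(4, id) and identity(region):
    let fields = vec![
        PartitionField { source_id: 1, transform: Transform::Bucket(4) },
        PartitionField { source_id: 2, transform: Transform::Identity },
    ];
    let partition_values = vec![Some("2".to_string()), Some("eu".to_string())];

    let constants = constants_map(&fields, &partition_values);
    // Only region (field 2) becomes a constant; id (field 1) is read from the file.
    assert_eq!(constants.get(&2), Some(&"eu".to_string()));
    assert!(!constants.contains_key(&1));
    println!("{constants:?}");
}
```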
@@ -191,8 +153,6 @@ pub(crate) struct RecordBatchTransformer {
     partition_data: Option<Struct>,

     // Optional name mapping for resolving field IDs from column names
-    // Per Iceberg spec rule #2: "Use schema.name-mapping.default metadata
-    // to map field id to columns without field id"
     name_mapping: Option<Arc<NameMapping>>,

     // BatchTransform gets lazily constructed based on the schema of
@@ -218,70 +178,28 @@ impl RecordBatchTransformer {

     /// Build a RecordBatchTransformer with partition spec and data for proper constant identification.
     ///
-    /// # Overview
-    ///
-    /// This method implements the Iceberg spec's "Column Projection" rules
-    /// (https://iceberg.apache.org/spec/#column-projection) for resolving field IDs that are
-    /// "not present" in a data file:
-    ///
+    /// Implements the Iceberg spec's "Column Projection" rules for resolving field IDs "not present" in data files:
     /// 1. Return the value from partition metadata if an Identity Transform exists
     /// 2. Use schema.name-mapping.default metadata to map field id to columns without field id
     /// 3. Return the default value if it has a defined initial-default
     /// 4. Return null in all other cases
     ///
-    /// # Why this method was added
-    ///
-    /// The gap in iceberg-rust was that `FileScanTask` had no way to pass partition information
-    /// to `RecordBatchTransformer`. This caused two problems:
-    ///
-    /// 1. **Incorrect handling of bucket partitioning**: Without partition spec information,
-    ///    iceberg-rust couldn't distinguish between:
-    ///    - Identity transforms (use constants from partition metadata)
-    ///    - Non-identity transforms like bucket (read from data file)
+    /// # Why this method exists
     ///
-    /// This caused bucket-partitioned source columns to be incorrectly treated as constants,
-    /// breaking runtime filtering and returning incorrect query results.
+    /// 1. **Bucket partitioning**: Distinguish identity transforms (use partition metadata constants)
+    ///    from non-identity transforms like bucket (read from data file) to enable runtime filtering on
+    ///    bucket-partitioned columns.
     ///
-    /// 2. **Add_files field ID conflicts**: When importing Hive tables via add_files,
-    ///    partition columns with `initial_default` values could have field IDs that conflicted
-    ///    with data column field IDs in the Parquet file.
+    /// 2. **Add_files field ID conflicts**: When importing Hive tables, partition columns can have field IDs
+    ///    conflicting with Parquet data columns (e.g., Parquet has field_id=1→"name", but Iceberg expects
+    ///    field_id=1→"id"). Per spec, such fields are "not present" and should use name mapping (rule #2).
     ///
-    /// Example:
-    /// - Parquet file written with: field_id=1→"name", field_id=2→"dept"
-    /// - Imported via add_files: field_id=1→"id" (partition), field_id=2→"name", field_id=3→"dept"
-    ///
-    /// When looking for field_id=1 ("id"), we find field_id=1 in the Parquet file, but it's
-    /// the WRONG field (it's "name"). Per the spec, the correct field (id=1, name="id") is
-    /// "not present" in the file and should be resolved via name mapping (rule #2) or
-    /// initial-default (rule #3).
-    ///
-    /// # The fix
-    ///
-    /// This method accepts `partition_spec`, `partition_data`, and `name_mapping`, which are used to:
-    /// - Build a `constants_map` that ONLY includes identity-transformed partition fields
-    ///   (matching Java's `PartitionUtil.constantsMap()` behavior)
-    /// - Detect field ID conflicts by verifying both field ID AND name match (when name mapping present)
-    /// - Apply name mapping when field IDs are missing or conflicting (spec rule #2)
-    ///
-    /// This matches Java's approach (ParquetSchemaUtil.applyNameMapping, ReadConf.java lines 83-85)
-    /// which rewrites Parquet schema field IDs based on names before projection. Our implementation
-    /// detects conflicts during projection but achieves the same result.
-    ///
-    /// # What was changed
-    ///
-    /// To enable this fix, the following fields were added to `FileScanTask`:
-    /// - `partition: Option<Struct>` - The partition data for this file
-    /// - `partition_spec: Option<Arc<PartitionSpec>>` - The actual partition spec
-    /// - `name_mapping: Option<Arc<NameMapping>>` - The name mapping from table metadata
-    ///
-    /// These fields should be populated by any system that reads Iceberg tables and provides
-    /// FileScanTasks to the ArrowReader.
+    /// This matches Java's ParquetSchemaUtil.applyNameMapping approach but detects conflicts during projection.
     ///
     /// # References
-    /// - Iceberg spec: https://iceberg.apache.org/spec/#column-projection
-    /// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java
-    /// - Java impl: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java
-    /// - Java test: spark/src/test/java/.../TestRuntimeFiltering.java
+    /// - Spec: https://iceberg.apache.org/spec/#column-projection
+    /// - Java: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java
+    /// - Java: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java
     pub(crate) fn build_with_partition_data(
         snapshot_schema: Arc<IcebergSchema>,
         projected_iceberg_field_ids: &[i32],
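The doc comment above lists the four "Column Projection" resolution rules in order. Below is a minimal, self-contained sketch of that resolution order; `ResolvedColumn`, `resolve_missing_field`, and the map inputs are hypothetical simplifications for illustration, not the crate's actual API.

```rust
// Illustrative sketch of the four column-projection rules.
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum ResolvedColumn {
    /// Rule 1: identity-partition constant from partition metadata.
    Constant(String),
    /// Rule 2: read from the file via a column name resolved through name mapping.
    FromFileByName(String),
    /// Rule 3: the field's `initial-default` value.
    Default(String),
    /// Rule 4: null.
    Null,
}

fn resolve_missing_field(
    field_id: i32,
    field_name: &str,
    identity_constants: &HashMap<i32, String>, // rule 1 input (constants_map)
    name_mapping: &HashMap<String, i32>,       // rule 2 input (name -> field ID)
    initial_defaults: &HashMap<i32, String>,   // rule 3 input
) -> ResolvedColumn {
    if let Some(constant) = identity_constants.get(&field_id) {
        return ResolvedColumn::Constant(constant.clone()); // rule 1
    }
    if name_mapping.get(field_name) == Some(&field_id) {
        return ResolvedColumn::FromFileByName(field_name.to_string()); // rule 2
    }
    if let Some(default) = initial_defaults.get(&field_id) {
        return ResolvedColumn::Default(default.clone()); // rule 3
    }
    ResolvedColumn::Null // rule 4
}

fn main() {
    let identity_constants = HashMap::from([(10, "eu".to_string())]);
    let name_mapping = HashMap::from([("name".to_string(), 2)]);
    let initial_defaults = HashMap::from([(3, "unknown".to_string())]);

    // Field 2 ("name") lacks a usable field ID in the file: resolved by name mapping.
    assert_eq!(
        resolve_missing_field(2, "name", &identity_constants, &name_mapping, &initial_defaults),
        ResolvedColumn::FromFileByName("name".to_string())
    );
    // Field 4 matches nothing: null.
    assert_eq!(
        resolve_missing_field(4, "dept", &identity_constants, &name_mapping, &initial_defaults),
        ResolvedColumn::Null
    );
}
```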

crates/iceberg/src/scan/task.rs

Lines changed: 0 additions & 7 deletions
@@ -99,13 +99,6 @@ pub struct FileScanTask {
     /// Name mapping from table metadata (property: schema.name-mapping.default),
     /// used to resolve field IDs from column names when Parquet files lack field IDs
     /// or have field ID conflicts.
-    ///
-    /// Per Iceberg spec rule #2: "Use schema.name-mapping.default metadata to map
-    /// field id to columns without field id".
-    ///
-    /// This is essential for scenarios like:
-    /// - Hive table migrations via add_files where Parquet has no field IDs
-    /// - Field ID conflicts where partition columns conflict with data column IDs
     #[serde(default)]
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(serialize_with = "serialize_not_implemented")]
