[fix](parquet) Fix struct column reading error when all queried fields are missing after schema evolution (#59586)

suxiaogang223 · web-flow · commit 99b971cd27e9 · 2026-01-13T19:32:51.000+08:00
### What problem does this PR solve? - Related PR: #57204 **Problem Summary:** When querying struct fields in Iceberg tables after schema evolution, if all queried struct fields are missing in old Parquet files, the code fails with the error: ``` File column name 'removed' not found in struct children ``` **Root Cause:** When all queried struct sub-fields are missing in the old Parquet file (e.g., newly added fields after schema evolution), the code needs to find a reference column from the file schema to get repetition level (RL) and definition level (DL) information. However, if the reference column (e.g., `removed`) was dropped from the table schema, calling `root_node->get_children_node_by_file_column_name()` will fail because the column doesn't exist in `root_node`. **Scenario:** 1. Create table with struct containing: `removed`, `rename`, `keep`, `drop_and_add` 2. Insert data (creates Parquet file with these fields) 3. Perform schema evolution: DROP `a_struct.removed`, RENAME `a_struct.rename` TO `renamed` (field ID stays the same), DROP then ADD `a_struct.drop_and_add` (gets new field ID), ADD `a_struct.added` 4. Query `struct_element(a_struct, 'drop_and_add')` or `struct_element(a_struct, 'added')` on the old file 5. The query fails because: - All queried fields (`drop_and_add`, `added`) are missing in the old file - Code tries to use `removed` as the reference column (it exists in the file but was dropped from the table schema) - Accessing `removed` via `root_node` fails because it doesn't exist in the table schema ### Solution: Use `TableSchemaChangeHelper::ConstNode::get_instance()` instead of looking up from `root_node` for the reference column. Since the reference column is only used to get RL/DL information (not for schema mapping), using `ConstNode` is safe and avoids the issue where the reference column doesn't exist in `root_node`. ### Release note None ### Check List (For Author) - Test  - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. 
Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason  - Behavior changed: - [ ] No. - [ ] Yes.  - Does this need documentation? - [ ] No. - [ ] Yes.  ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -932,10 +932,14 @@ Status StructColumnReader::read_column_data(
             size_t field_rows = 0;
             bool field_eof = false;
 
-            // Use root_node to get the correct child node for the reference column
-            // reference_file_column_name is the file column name, use get_children_node_by_file_column_name
-            auto ref_child_node =
-                    root_node->get_children_node_by_file_column_name(reference_file_column_name);
+            // Use ConstNode for the reference column instead of looking up from root_node.
+            // The reference column is only used to get RL/DL information for determining the number
+            // of elements in the struct. It may be a column that has been dropped from the table
+            // schema (e.g., 'removed' field), but still exists in older parquet files.
+            // Since we don't need schema mapping for this column (we just need its RL/DL levels),
+            // using ConstNode is safe and avoids the issue where the reference column doesn't exist
+            // in root_node (because it was dropped from table schema).
+            auto ref_child_node = TableSchemaChangeHelper::ConstNode::get_instance();
             not_missing_orig_column_size = temp_column->size();
 
             RETURN_IF_ERROR((*reference_reader)
diff --git a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run24.sql b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run24.sql
@@ -0,0 +1,151 @@
+use demo.test_db;
+
+DROP TABLE IF EXISTS test_struct_evolution;
+
+-- Fixture for the struct schema evolution bug (Parquet data files).
+-- Bug scenario: when querying a struct field after schema evolution, if all queried fields are missing
+-- in old Parquet files, the reader picks a reference column from the file schema. If that reference
+-- column (e.g., 'removed') was dropped from the table schema, looking it up via root_node failed.
+--
+-- Steps to reproduce (numbering matches the "Step N" statements below):
+-- 1. Create table with struct containing: removed, rename, keep, drop_and_add
+-- 2. Insert data (creates Parquet file with these fields)
+-- 3. DROP a_struct.removed - removes field from table schema
+-- 4. RENAME a_struct.rename TO renamed - field ID stays the same
+-- 5. DROP a_struct.drop_and_add then ADD a_struct.drop_and_add - gets new field ID
+-- 6. ADD a_struct.added - adds new field
+-- 7. Insert a second row, then query struct_element(a_struct, 'drop_and_add' or 'added'): before
+--    the fix this failed on the old file because the reference column 'removed' is not in root_node
+
+-- Step 1: Create table with the original struct layout
+CREATE TABLE test_struct_evolution (
+    id BIGINT,
+    a_struct STRUCT<removed: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT>
+) USING ICEBERG
+TBLPROPERTIES ('write.format.default' = 'parquet', 'format-version' = 2);
+
+-- Step 2: Insert data (creates Parquet file with original schema)
+INSERT INTO test_struct_evolution 
+SELECT 1, named_struct('removed', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 13);
+
+-- Step 3: Schema evolution - drop the 'removed' field (it remains present in the old file)
+ALTER TABLE test_struct_evolution DROP COLUMN a_struct.removed;
+
+-- Step 4: Rename field (field ID stays the same)
+ALTER TABLE test_struct_evolution RENAME COLUMN a_struct.rename TO renamed;
+
+-- Step 5: Drop and re-add drop_and_add (same name, new field ID)
+ALTER TABLE test_struct_evolution DROP COLUMN a_struct.drop_and_add;
+ALTER TABLE test_struct_evolution ADD COLUMN a_struct.drop_and_add BIGINT;
+
+-- Step 6: Add a brand-new field that the old file does not contain
+ALTER TABLE test_struct_evolution ADD COLUMN a_struct.added BIGINT;
+
+-- Step 7: Insert new data after schema evolution (creates new Parquet file)
+INSERT INTO test_struct_evolution 
+SELECT 2, named_struct('renamed', 21, 'keep', 22, 'drop_and_add', 23, 'added', 24);
+
+-- Now the table contains two Parquet files:
+-- - Old file: contains removed, rename, keep, drop_and_add (old field ID)
+-- - New file: contains renamed, keep, drop_and_add (new field ID), added
+--
+-- Querying struct_element(a_struct, 'drop_and_add') or struct_element(a_struct, 'added')
+-- on the old file exercises the previously failing path (every requested sub-field is missing)
+
+-- ============================================================
+-- ORC format test table (for completeness, though ORC doesn't have the same bug)
+-- ============================================================
+DROP TABLE IF EXISTS test_struct_evolution_orc;
+
+-- Create ORC format table with the same original struct layout as test_struct_evolution
+CREATE TABLE test_struct_evolution_orc (
+    id BIGINT,
+    a_struct STRUCT<removed: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT>
+) USING ICEBERG
+TBLPROPERTIES ('write.format.default' = 'orc', 'format-version' = 2);
+
+-- Insert initial data (creates ORC file with original schema)
+INSERT INTO test_struct_evolution_orc 
+SELECT 1, named_struct('removed', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 13);
+
+-- Schema evolution, same order as the Parquet table: drop removed, rename, drop + re-add, add
+ALTER TABLE test_struct_evolution_orc DROP COLUMN a_struct.removed;
+ALTER TABLE test_struct_evolution_orc RENAME COLUMN a_struct.rename TO renamed;
+ALTER TABLE test_struct_evolution_orc DROP COLUMN a_struct.drop_and_add;
+ALTER TABLE test_struct_evolution_orc ADD COLUMN a_struct.drop_and_add BIGINT;
+ALTER TABLE test_struct_evolution_orc ADD COLUMN a_struct.added BIGINT;
+
+-- Insert new data after schema evolution (creates new ORC file with the evolved schema)
+INSERT INTO test_struct_evolution_orc 
+SELECT 2, named_struct('renamed', 21, 'keep', 22, 'drop_and_add', 23, 'added', 24);
+
+-- ============================================================
+-- Case sensitivity test table (mixed case field names)
+-- ============================================================
+DROP TABLE IF EXISTS test_struct_evolution_case;
+
+-- Same scenario as test_struct_evolution, but with mixed-case struct field names.
+-- This checks that field-name case is handled correctly when:
+-- - the dropped field is upper case in the old file (REMOVED)
+-- - drop_and_add is dropped in lower case and re-added in upper case (DROP_AND_ADD)
+-- - struct fields are queried with different case patterns
+
+-- Step 1: Create table with mixed case field names
+CREATE TABLE test_struct_evolution_case (
+    id BIGINT,
+    a_struct STRUCT<REMOVED: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT>
+) USING ICEBERG
+TBLPROPERTIES ('write.format.default' = 'parquet', 'format-version' = 2);
+
+-- Step 2: Insert data (creates Parquet file with original schema)
+INSERT INTO test_struct_evolution_case 
+SELECT 1, named_struct('REMOVED', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 13);
+
+-- Step 3: Schema evolution - drop REMOVED field (uppercase)
+ALTER TABLE test_struct_evolution_case DROP COLUMN a_struct.REMOVED;
+
+-- Step 4: Rename field (field ID stays the same)
+ALTER TABLE test_struct_evolution_case RENAME COLUMN a_struct.rename TO renamed;
+
+-- Step 5: Drop and add drop_and_add with case change (new field ID)
+-- Initial: drop_and_add (lowercase), after re-add: DROP_AND_ADD (uppercase)
+ALTER TABLE test_struct_evolution_case DROP COLUMN a_struct.drop_and_add;
+ALTER TABLE test_struct_evolution_case ADD COLUMN a_struct.DROP_AND_ADD BIGINT;
+
+-- Step 6: Add a brand-new field that the old file does not contain
+ALTER TABLE test_struct_evolution_case ADD COLUMN a_struct.added BIGINT;
+
+-- Step 7: Insert new data after schema evolution (creates new Parquet file)
+-- Note: the struct literal uses DROP_AND_ADD (uppercase), matching the re-added column
+INSERT INTO test_struct_evolution_case 
+SELECT 2, named_struct('renamed', 21, 'keep', 22, 'DROP_AND_ADD', 23, 'added', 24);
+
+-- ============================================================
+-- ORC format test table with mixed case (for completeness)
+-- ============================================================
+DROP TABLE IF EXISTS test_struct_evolution_case_orc;
+
+-- Create ORC format table with the same mixed-case struct layout as test_struct_evolution_case
+CREATE TABLE test_struct_evolution_case_orc (
+    id BIGINT,
+    a_struct STRUCT<REMOVED: BIGINT, rename: BIGINT, keep: BIGINT, drop_and_add: BIGINT>
+) USING ICEBERG
+TBLPROPERTIES ('write.format.default' = 'orc', 'format-version' = 2);
+
+-- Insert initial data (creates ORC file with original schema)
+INSERT INTO test_struct_evolution_case_orc 
+SELECT 1, named_struct('REMOVED', 10, 'rename', 11, 'keep', 12, 'drop_and_add', 13);
+
+-- Schema evolution - same operations, in the same order, as the mixed-case Parquet table
+ALTER TABLE test_struct_evolution_case_orc DROP COLUMN a_struct.REMOVED;
+ALTER TABLE test_struct_evolution_case_orc RENAME COLUMN a_struct.rename TO renamed;
+-- Drop and add with case change: drop_and_add (lowercase) -> DROP_AND_ADD (uppercase)
+ALTER TABLE test_struct_evolution_case_orc DROP COLUMN a_struct.drop_and_add;
+ALTER TABLE test_struct_evolution_case_orc ADD COLUMN a_struct.DROP_AND_ADD BIGINT;
+ALTER TABLE test_struct_evolution_case_orc ADD COLUMN a_struct.added BIGINT;
+
+-- Insert new data after schema evolution (creates new ORC file)
+-- Note: the struct literal uses DROP_AND_ADD (uppercase), matching the re-added column
+INSERT INTO test_struct_evolution_case_orc 
+SELECT 2, named_struct('renamed', 21, 'keep', 22, 'DROP_AND_ADD', 23, 'added', 24);
+
diff --git a/regression-test/data/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.out b/regression-test/data/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.out
@@ -0,0 +1,161 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !desc --
+id	bigint	Yes	true	\N	
+a_struct	struct<renamed:bigint,keep:bigint,drop_and_add:bigint,added:bigint>	Yes	true	\N	
+
+-- !select_all --
+1	{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+2	{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !struct_keep --
+12
+22
+
+-- !struct_renamed --
+11
+21
+
+-- !struct_drop_and_add --
+\N
+23
+
+-- !struct_added --
+\N
+24
+
+-- !struct_full --
+{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !struct_predicate_1 --
+1
+
+-- !struct_predicate_2 --
+1
+
+-- !struct_predicate_3 --
+1
+
+-- !struct_predicate_4 --
+2
+
+-- !struct_multi --
+11	12	\N	\N
+21	22	23	24
+
+-- !struct_distinct --
+11	\N	12
+21	24	22
+
+-- !orc_desc --
+id	bigint	Yes	true	\N	
+a_struct	struct<renamed:bigint,keep:bigint,drop_and_add:bigint,added:bigint>	Yes	true	\N	
+
+-- !orc_select_all --
+1	{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+2	{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !orc_struct_keep --
+12
+22
+
+-- !orc_struct_renamed --
+11
+21
+
+-- !orc_struct_drop_and_add --
+\N
+23
+
+-- !orc_struct_added --
+\N
+24
+
+-- !orc_struct_full --
+{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !orc_struct_multi --
+11	12	\N	\N
+21	22	23	24
+
+-- !case_desc --
+id	bigint	Yes	true	\N	
+a_struct	struct<renamed:bigint,keep:bigint,drop_and_add:bigint,added:bigint>	Yes	true	\N	
+
+-- !case_select_all --
+1	{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+2	{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !case_struct_keep --
+12
+22
+
+-- !case_struct_renamed --
+11
+21
+
+-- !case_struct_drop_and_add --
+\N
+23
+
+-- !case_struct_added --
+\N
+24
+
+-- !case_struct_full --
+{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !case_struct_predicate_1 --
+1
+
+-- !case_struct_predicate_2 --
+1
+
+-- !case_struct_predicate_3 --
+1
+
+-- !case_struct_predicate_4 --
+2
+
+-- !case_struct_multi --
+11	12	\N	\N
+21	22	23	24
+
+-- !case_struct_distinct --
+11	\N	12
+21	24	22
+
+-- !case_orc_desc --
+id	bigint	Yes	true	\N	
+a_struct	struct<renamed:bigint,keep:bigint,drop_and_add:bigint,added:bigint>	Yes	true	\N	
+
+-- !case_orc_select_all --
+1	{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+2	{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !case_orc_struct_keep --
+12
+22
+
+-- !case_orc_struct_renamed --
+11
+21
+
+-- !case_orc_struct_drop_and_add --
+\N
+23
+
+-- !case_orc_struct_added --
+\N
+24
+
+-- !case_orc_struct_full --
+{"renamed":11, "keep":12, "drop_and_add":null, "added":null}
+{"renamed":21, "keep":22, "drop_and_add":23, "added":24}
+
+-- !case_orc_struct_multi --
+11	12	\N	\N
+21	22	23	24
+
diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_struct_schema_evolution.groovy