Commit dc25c75
chore(query): refactor parquet part to make reader work on parallel (#17987)
* chore(query): refactor parquet part to make reader work on parallel
* chore(query): refactor parquet part to make reader work on parallel
* chore(query): refactor parquet part to make reader work on parallel
* increase timeout
* update
* update
* update
* update
1 parent 5fbd4c0 commit dc25c75
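In outline, the change works like this (the sketch below is illustrative Rust, not code from the patch; `hint` stands in for the new `parquet_rowgroup_hint_bytes` setting): at plan time a large Parquet file is split into `ceil(compressed_size / hint)` buckets without reading its footer, and at read time each bucket keeps only the row groups whose index maps to it modulo the bucket count.

// Illustrative sketch of the bucketing scheme; names are not from the patch.
fn bucket_num(compressed_size: u64, hint: u64) -> usize {
    // One bucket per `hint` bytes of compressed file size, rounded up (default hint: 128 MiB).
    compressed_size.div_ceil(hint) as usize
}

// Read-time filter: row group `idx` is read only by the bucket it maps to.
fn should_read(idx: usize, bucket: usize, buckets: usize) -> bool {
    idx % buckets == bucket
}

fn main() {
    // A hypothetical 300 MiB file with the default 128 MiB hint becomes 3 buckets.
    let buckets = bucket_num(300 * 1024 * 1024, 128 * 1024 * 1024);
    assert_eq!(buckets, 3);
    // Row groups are dealt round-robin: 0 -> bucket 0, 1 -> bucket 1, 2 -> bucket 2, 3 -> bucket 0, ...
    assert!(should_read(3, 0, buckets));
    assert!(!should_read(3, 1, buckets));
}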

11 files changed (+86 −17 lines changed)

src/query/settings/src/settings_default.rs

Lines changed: 7 additions & 1 deletion
@@ -661,7 +661,13 @@ impl DefaultSettings {
             scope: SettingScope::Both,
             range: Some(SettingRange::Numeric(0..=u64::MAX)),
         }),
-
+        ("parquet_rowgroup_hint_bytes", DefaultSettingValue {
+            value: UserSettingValue::UInt64(128 * 1024 * 1024),
+            desc: "Parquet file is very large, we will divide it into multiple rowgroups to read, the config is the hint bytes of each rowgroup, Default value: 128MB",
+            mode: SettingMode::Both,
+            scope: SettingScope::Both,
+            range: Some(SettingRange::Numeric(1024 * 1024..=u64::MAX)),
+        }),
         // enterprise license related settings
         ("enterprise_license", DefaultSettingValue {
             value: UserSettingValue::String("".to_owned()),

src/query/settings/src/settings_getter_setter.rs

Lines changed: 4 additions & 0 deletions
@@ -518,6 +518,10 @@ impl Settings {
         self.try_get_u64("parquet_fast_read_bytes")
     }
 
+    pub fn get_parquet_rowgroup_hint_bytes(&self) -> Result<u64> {
+        self.try_get_u64("parquet_rowgroup_hint_bytes")
+    }
+
     pub fn get_enable_table_lock(&self) -> Result<bool> {
         Ok(self.try_get_u64("enable_table_lock")? != 0)
     }

src/query/storages/delta/src/table.rs

Lines changed: 2 additions & 1 deletion
@@ -344,7 +344,8 @@ impl DeltaTable {
                 file: add.path.clone(),
                 compressed_size: add.size as u64,
                 estimated_uncompressed_size: add.size as u64, // This field is not used here.
-                dedup_key: format!("{}_{}", add.modification_time, add.size)
+                dedup_key: format!("{}_{}", add.modification_time, add.size),
+                bucket_option: None,
             },
         ),
     }) as _))

src/query/storages/iceberg/src/partition.rs

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ pub(crate) fn convert_file_scan_task(task: iceberg::scan::FileScanTask) -> Box<d
         compressed_size: task.length,
         estimated_uncompressed_size: task.length * 5,
         dedup_key: format!("{}_{}", task.data_file_path, task.length),
+        bucket_option: None,
     };
 
     if !task.deletes.is_empty() {

src/query/storages/parquet/src/parquet_part.rs

Lines changed: 30 additions & 12 deletions
@@ -63,15 +63,27 @@ pub struct ParquetFilePart {
     pub file: String,
     pub compressed_size: u64,
     pub estimated_uncompressed_size: u64,
+    // used to cache parquet metadata
     pub dedup_key: String,
+
+    // For large parquet files, we will split the file into multiple parts
+    // But we don't read metadata during plan stage, so we split them by 128MB into buckets
+    // (bucket_idx, bucket_num)
+    pub bucket_option: Option<(usize, usize)>,
 }
 
 impl ParquetFilePart {
     pub fn compressed_size(&self) -> u64 {
-        self.compressed_size
+        match self.bucket_option {
+            Some((_, num)) => self.compressed_size / num as u64,
+            None => self.compressed_size,
+        }
     }
     pub fn uncompressed_size(&self) -> u64 {
-        self.estimated_uncompressed_size
+        match self.bucket_option {
+            Some((_, num)) => self.estimated_uncompressed_size / num as u64,
+            None => self.estimated_uncompressed_size,
+        }
     }
 }
 
@@ -170,6 +182,7 @@ pub(crate) fn collect_small_file_parts(
             compressed_size: size,
             estimated_uncompressed_size: (size as f64 / max_compression_ratio) as u64,
             dedup_key,
+            bucket_option: None,
         })
         .collect::<Vec<_>>();
 
@@ -191,6 +204,7 @@ pub(crate) fn collect_file_parts(
     stats: &mut PartStatistics,
     num_columns_to_read: usize,
     total_columns_to_read: usize,
+    rowgroup_hint_bytes: u64,
 ) {
     for (file, size, dedup_key) in files.into_iter() {
         stats.read_bytes += size as usize;
@@ -199,20 +213,24 @@
             (size as f64) * (num_columns_to_read as f64) / (total_columns_to_read as f64);
 
         let estimated_uncompressed_size = read_bytes * compress_ratio;
+        let bucket_num = size.div_ceil(rowgroup_hint_bytes) as usize;
+        for bucket in 0..bucket_num {
+            partitions
+                .partitions
+                .push(Arc::new(Box::new(ParquetPart::File(ParquetFilePart {
+                    file: file.clone(),
+                    compressed_size: size,
+                    estimated_uncompressed_size: estimated_uncompressed_size as u64,
+                    dedup_key: dedup_key.clone(),
+                    bucket_option: Some((bucket, bucket_num)),
+                })) as Box<dyn PartInfo>));
 
-        partitions
-            .partitions
-            .push(Arc::new(Box::new(ParquetPart::File(ParquetFilePart {
-                file,
-                compressed_size: size,
-                estimated_uncompressed_size: estimated_uncompressed_size as u64,
-                dedup_key,
-            })) as Box<dyn PartInfo>));
+            stats.partitions_scanned += 1;
+            stats.partitions_total += 1;
+        }
 
         stats.read_bytes += read_bytes as usize;
         stats.read_rows += estimated_read_rows as usize;
         stats.is_exact = false;
-        stats.partitions_scanned += 1;
-        stats.partitions_total += 1;
     }
 }
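A consequence of the compressed_size()/uncompressed_size() changes above, shown as a minimal standalone sketch (the Part struct and the sizes are illustrative, not the crate's own types): when a file is split into N buckets, each part reports roughly 1/N of the file's size, so per-part cost estimates stay proportional to what the part will actually read.

// Illustrative mirror of the size apportionment; not the crate's types.
struct Part {
    compressed_size: u64,
    bucket_option: Option<(usize, usize)>, // (bucket_idx, bucket_num)
}

impl Part {
    fn effective_compressed_size(&self) -> u64 {
        // A bucketed part accounts for its share of the file; an unbucketed one for all of it.
        match self.bucket_option {
            Some((_, num)) => self.compressed_size / num as u64,
            None => self.compressed_size,
        }
    }
}

fn main() {
    // A hypothetical 384 MiB file split into 3 buckets: each part accounts for 128 MiB.
    let size: u64 = 384 * 1024 * 1024;
    let parts: Vec<Part> = (0..3)
        .map(|i| Part { compressed_size: size, bucket_option: Some((i, 3)) })
        .collect();
    for p in &parts {
        assert_eq!(p.effective_compressed_size(), 128 * 1024 * 1024);
    }
}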

src/query/storages/parquet/src/parquet_table/partition.rs

Lines changed: 3 additions & 0 deletions
@@ -83,6 +83,8 @@ impl ParquetTable {
         let mut stats = PartStatistics::default();
 
         let fast_read_bytes = ctx.get_settings().get_parquet_fast_read_bytes()?;
+        let rowgroup_hint_bytes = ctx.get_settings().get_parquet_rowgroup_hint_bytes()?;
+
         let mut large_files = vec![];
         let mut small_files = vec![];
         for (location, size, dedup_key) in file_locations.into_iter() {
@@ -100,6 +102,7 @@
             &mut stats,
             num_columns_to_read,
             self.schema().num_fields(),
+            rowgroup_hint_bytes,
         );
 
         if !small_files.is_empty() {

src/query/storages/parquet/src/source.rs

Lines changed: 14 additions & 2 deletions
@@ -407,6 +407,13 @@ impl ParquetSource {
             Cow::Borrowed(&self.row_group_reader)
         };
 
+        let should_read = |rowgroup_idx: usize, bucket_option: Option<(usize, usize)>| -> bool {
+            if let Some((bucket, bucket_num)) = bucket_option {
+                return rowgroup_idx % bucket_num == bucket;
+            }
+            true
+        };
+
         let mut start_row = 0;
         let mut readers = VecDeque::with_capacity(meta.num_row_groups());
         // Deleted files only belong to the same Parquet, so they only need to be loaded once
@@ -415,7 +422,13 @@
             .as_ref()
             .map(|files| (meta.as_ref(), files.as_slice()));
 
-        for rg in meta.row_groups() {
+        for (rowgroup_idx, rg) in meta.row_groups().iter().enumerate() {
+            start_row += rg.num_rows() as u64;
+            // filter by bucket option
+            if !should_read(rowgroup_idx, part.bucket_option) {
+                continue;
+            }
+
             let part = ParquetRowGroupPart {
                 location: part.file.clone(),
                 start_row,
@@ -428,7 +441,6 @@
                 page_locations: None,
                 selectors: None,
             };
-            start_row += rg.num_rows() as u64;
 
             let reader = reader
                 .create_read_policy(
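The modulo filter means that, across the buckets of one file, every row group is claimed by exactly one bucket; a quick standalone check of that property (bucket and row-group counts are arbitrary, the closure mirrors should_read):

fn main() {
    let bucket_num = 3usize; // buckets the file was split into at plan time
    let row_groups = 10usize; // row groups found in the file's metadata at read time
    let should_read = |idx: usize, bucket: usize| idx % bucket_num == bucket;

    // Count how many buckets claim each row group: it must be exactly one.
    for idx in 0..row_groups {
        let claims = (0..bucket_num).filter(|&b| should_read(idx, b)).count();
        assert_eq!(claims, 1);
    }
    println!("every row group is claimed by exactly one bucket");
}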

src/query/storages/result_cache/src/table_function/table.rs

Lines changed: 1 addition & 0 deletions
@@ -126,6 +126,7 @@ impl Table for ResultScan {
             compressed_size: self.file_size,
             estimated_uncompressed_size: self.file_size,
             dedup_key: format!("{}_{}", self.location, self.file_size),
+            bucket_option: None,
         });
 
         let part_info: Box<dyn PartInfo> = Box::new(part);

tests/databend-test

Lines changed: 1 addition & 1 deletion
@@ -728,7 +728,7 @@ if __name__ == "__main__":
         "-t",
         "--timeout",
         type=int,
-        default=600,
+        default=900,
         help="Timeout for each test case in seconds",
     )
     parser.add_argument(

tests/suites/1_stateful/08_select_stage/08_00_parquet/08_00_00_basic.result

Lines changed: 5 additions & 0 deletions
@@ -15,3 +15,8 @@
 2 70
 1 [1,2,3]
 2 {"k":"v"}
+--- large parquet file should be worked on parallel by rowgroups
+5000000 213888890
+├── partitions total: 3
+├── partitions scanned: 3
+5000000 213888890
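Reading the new expected output (a back-of-the-envelope check, assuming the default 128 MiB hint was in effect for this test): "partitions total: 3" implies the staged file was split into three buckets, i.e. its compressed size falls above 256 MiB and at most 384 MiB.

fn main() {
    let hint: u64 = 128 * 1024 * 1024; // assumed default parquet_rowgroup_hint_bytes
    // ceil(size / hint) == 3 exactly when 2 * hint < size <= 3 * hint.
    assert_eq!((2 * hint + 1).div_ceil(hint), 3);
    assert_eq!((3 * hint).div_ceil(hint), 3);
    assert_eq!((2 * hint).div_ceil(hint), 2); // at exactly 2x the hint it would still be two parts
}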

0 commit comments
