Skip to content

Commit 99cb6e1

Browse files
authored
chore: check schema for small parquet file too. (#17175)
1 parent a070678 commit 99cb6e1

File tree

7 files changed

+35
-5
lines changed

7 files changed

+35
-5
lines changed

src/query/storages/parquet/src/parquet_rs/copy_into_table/reader.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,7 @@ impl RowGroupReaderForCopy {
116116
parquet_table_schema,
117117
schema_descr,
118118
Some(arrow_schema),
119+
None,
119120
)
120121
.with_push_downs(Some(&pushdowns));
121122
reader_builder.build_output()?;

src/query/storages/parquet/src/parquet_rs/meta.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ pub async fn read_metas_in_parallel(
101101
Ok(metas)
102102
}
103103

104-
fn check_parquet_schema(
104+
pub(crate) fn check_parquet_schema(
105105
expect: &SchemaDescriptor,
106106
actual: &SchemaDescriptor,
107107
path: &str,

src/query/storages/parquet/src/parquet_rs/parquet_reader/reader/builder.rs

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ pub struct ParquetRSReaderBuilder<'a> {
5151
op: Operator,
5252
table_schema: TableSchemaRef,
5353
schema_desc: SchemaDescPtr,
54+
schema_desc_from: Option<String>,
5455
arrow_schema: Option<arrow_schema::Schema>,
5556

5657
push_downs: Option<&'a PushDownInfo>,
@@ -85,6 +86,7 @@ impl<'a> ParquetRSReaderBuilder<'a> {
8586
table_schema,
8687
schema_desc,
8788
Some(arrow_schema),
89+
None,
8890
))
8991
}
9092

@@ -94,13 +96,15 @@ impl<'a> ParquetRSReaderBuilder<'a> {
9496
table_schema: TableSchemaRef,
9597
schema_desc: SchemaDescPtr,
9698
arrow_schema: Option<arrow_schema::Schema>,
99+
schema_desc_from: Option<String>,
97100
) -> ParquetRSReaderBuilder<'a> {
98101
ParquetRSReaderBuilder {
99102
ctx,
100103
op,
101104
table_schema,
102105
schema_desc,
103106
arrow_schema,
107+
schema_desc_from,
104108
push_downs: None,
105109
options: Default::default(),
106110
pruner: None,
@@ -221,6 +225,10 @@ impl<'a> ParquetRSReaderBuilder<'a> {
221225
let (_, _, output_schema, _) = self.built_output.as_ref().unwrap();
222226
Ok(ParquetRSFullReader {
223227
op: self.op.clone(),
228+
expect_file_schema: self
229+
.schema_desc_from
230+
.as_ref()
231+
.map(|p| (self.schema_desc.clone(), p.clone())),
224232
output_schema: output_schema.clone(),
225233
predicate,
226234
projection,

src/query/storages/parquet/src/parquet_rs/parquet_reader/reader/full_reader.rs

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,9 @@ use parquet::arrow::async_reader::ParquetRecordBatchStream;
4141
use parquet::arrow::ParquetRecordBatchStreamBuilder;
4242
use parquet::arrow::ProjectionMask;
4343
use parquet::file::metadata::ParquetMetaData;
44+
use parquet::schema::types::SchemaDescPtr;
4445

46+
use crate::parquet_rs::meta::check_parquet_schema;
4547
use crate::parquet_rs::parquet_reader::predicate::ParquetPredicate;
4648
use crate::parquet_rs::parquet_reader::utils::transform_record_batch;
4749
use crate::parquet_rs::parquet_reader::utils::transform_record_batch_by_field_paths;
@@ -51,6 +53,7 @@ use crate::ParquetRSPruner;
5153
/// The reader to read a whole parquet file.
5254
pub struct ParquetRSFullReader {
5355
pub(super) op: Operator,
56+
pub(super) expect_file_schema: Option<(SchemaDescPtr, String)>,
5457
pub(super) output_schema: TableSchemaRef,
5558
pub(super) predicate: Option<Arc<ParquetPredicate>>,
5659

@@ -168,7 +171,7 @@ impl ParquetRSFullReader {
168171
}
169172

170173
/// Read a [`DataBlock`] from bytes.
171-
pub fn read_blocks_from_binary(&self, raw: Vec<u8>) -> Result<Vec<DataBlock>> {
174+
pub fn read_blocks_from_binary(&self, raw: Vec<u8>, path: &str) -> Result<Vec<DataBlock>> {
172175
let bytes = Bytes::from(raw);
173176
let mut builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
174177
bytes,
@@ -179,6 +182,14 @@ impl ParquetRSFullReader {
179182

180183
// Prune row groups.
181184
let file_meta = builder.metadata().clone();
185+
if let Some((expect_schema, expect_schema_from)) = &self.expect_file_schema {
186+
check_parquet_schema(
187+
expect_schema,
188+
file_meta.file_metadata().schema_descr(),
189+
path,
190+
expect_schema_from,
191+
)?;
192+
}
182193

183194
let mut full_match = false;
184195
if let Some(pruner) = &self.pruner {

src/query/storages/parquet/src/parquet_rs/parquet_table/read.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ impl ParquetRSTable {
7272
table_schema.clone(),
7373
self.schema_descr.clone(),
7474
Some(self.arrow_schema.clone()),
75+
Some(self.schema_from.clone()),
7576
)
7677
.with_options(self.read_options)
7778
.with_push_downs(plan.push_downs.as_ref())

src/query/storages/parquet/src/parquet_rs/source.rs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,7 @@ impl Processor for ParquetSource {
166166
.full_file_reader
167167
.as_ref()
168168
.unwrap()
169-
.read_blocks_from_binary(buffer)?;
169+
.read_blocks_from_binary(buffer, &path)?;
170170
let num_rows = bs.iter().map(|b| b.num_rows()).sum();
171171
self.copy_status.add_chunk(path.as_str(), FileStatus {
172172
num_rows_loaded: num_rows,
@@ -175,12 +175,12 @@ impl Processor for ParquetSource {
175175
blocks.extend(bs);
176176
}
177177
} else {
178-
for (_, buffer) in buffers {
178+
for (path, buffer) in buffers {
179179
blocks.extend(
180180
self.full_file_reader
181181
.as_ref()
182182
.unwrap()
183-
.read_blocks_from_binary(buffer)?,
183+
.read_blocks_from_binary(buffer, &path)?,
184184
);
185185
}
186186
}
Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,11 @@
11
query error diff schema
22
select $1 from @data/parquet/ (files=>('tuple.parquet', 'complex.parquet'))
3+
4+
statement ok
5+
create or replace table t1 (id int, t TUPLE(A INT32, B STRING));
6+
7+
query error diff schema
8+
copy into t1 from @data/parquet/ files=('tuple.parquet', 'complex.parquet')
9+
10+
query error diff schema
11+
copy /*+ set_var(parquet_fast_read_bytes=0) */ into t1 from @data/parquet/ files=('tuple.parquet', 'complex.parquet')

0 commit comments

Comments (0)