1515#include < Storages/SelectQueryInfo.h>
1616
1717#include < lz4.h>
18+ #include < arrow/util/crc32.h>
1819
1920#if USE_SNAPPY
2021#include < snappy.h>
@@ -28,6 +29,7 @@ namespace DB::ErrorCodes
2829 extern const int INCORRECT_DATA;
2930 extern const int LOGICAL_ERROR;
3031 extern const int NOT_IMPLEMENTED;
32+ extern const int CHECKSUM_DOESNT_MATCH;
3133}
3234
3335namespace DB ::Parquet
@@ -176,7 +178,7 @@ parq::FileMetaData Reader::readFileMetaData(Prefetcher & prefetcher)
176178 prefetcher.readSync (buf.data (), initial_read_size, file_size - initial_read_size);
177179
178180 if (memcmp (buf.data () + initial_read_size - 4 , " PAR1" , 4 ) != 0 )
179- throw Exception (ErrorCodes::INCORRECT_DATA, " Not a parquet file (wrong magic bytes at the end of file)" );
181+ throw Exception (ErrorCodes::INCORRECT_DATA, " Not a Parquet file (wrong magic bytes at the end of file)" );
180182
181183 int32_t metadata_size_i32;
182184 memcpy (&metadata_size_i32, buf.data () + initial_read_size - 8 , 4 );
@@ -216,7 +218,7 @@ parq::FileMetaData Reader::readFileMetaData(Prefetcher & prefetcher)
216218 // / present. Instead, data_page_offset points to the dictionary page.
217219 // / (2) Old DuckDB versions (<= 0.10.2) wrote incorrect data_page_offset when dictionary is
218220 // / present.
219- // / We work around (1) in initializePage by allowing dictionary page in place of data page.
221+ // / We work around (1) in initializeDataPage by allowing dictionary page in place of data page.
220222 // / We work around (2) here by converting it to case (1):
221223 // / data_page_offset = dictionary_page_offset
222224 // / dictionary_page_offset.reset()
@@ -756,8 +758,9 @@ void Reader::processBloomFilterHeader(ColumnChunk & column, const PrimitiveColum
756758bool Reader::decodeDictionaryPage (ColumnChunk & column, const PrimitiveColumnInfo & column_info)
757759{
758760 auto data = prefetcher.getRangeData (column.dictionary_page_prefetch );
759- parq::PageHeader header;
760- size_t header_size = deserializeThriftStruct (header, data.data (), data.size ());
761+ const char * data_ptr = data.data ();
762+ const char * data_end = data.data () + data.size ();
763+ auto [header, page_data] = decodeAndCheckPageHeader (data_ptr, data_end);
761764
762765 if (header.type != parq::PageType::DICTIONARY_PAGE)
763766 {
@@ -768,15 +771,14 @@ bool Reader::decodeDictionaryPage(ColumnChunk & column, const PrimitiveColumnInf
768771 return false ;
769772 }
770773
771- decodeDictionaryPageImpl (header, data. subspan (header_size) , column, column_info);
774+ decodeDictionaryPageImpl (header, page_data , column, column_info);
772775 return true ;
773776}
774777
775778void Reader::decodeDictionaryPageImpl (const parq::PageHeader & header, std::span<const char > data, ColumnChunk & column, const PrimitiveColumnInfo & column_info)
776779{
777780 chassert (header.type == parq::PageType::DICTIONARY_PAGE);
778781
779- // / TODO [parquet]: Check checksum.
780782 size_t compressed_page_size = size_t (header.compressed_page_size );
781783 if (header.compressed_page_size < 0 || compressed_page_size > data.size ())
782784 throw Exception (ErrorCodes::INCORRECT_DATA, " Dictionary page size out of bounds: {} > {}" , header.compressed_page_size , data.size ());
@@ -1381,7 +1383,7 @@ void Reader::skipToRow(size_t row_idx, ColumnChunk & column, const PrimitiveColu
13811383
13821384 auto data = prefetcher.getRangeData (page_info.prefetch );
13831385 const char * ptr = data.data ();
1384- if (!initializePage (ptr, ptr + data.size (), first_row_idx, page_info.end_row_idx , row_idx, column, column_info))
1386+ if (!initializeDataPage (ptr, ptr + data.size (), first_row_idx, page_info.end_row_idx , row_idx, column, column_info))
13851387 throw Exception (ErrorCodes::LOGICAL_ERROR, " Page doesn't contain requested row" );
13861388 found_page = true ;
13871389 }
@@ -1403,12 +1405,33 @@ void Reader::skipToRow(size_t row_idx, ColumnChunk & column, const PrimitiveColu
14031405 chassert (column.next_page_offset <= all_pages.size ());
14041406 const char * ptr = all_pages.data () + column.next_page_offset ;
14051407 const char * end = all_pages.data () + all_pages.size ();
1406- initializePage (ptr, end, page.next_row_idx , /* end_row_idx=*/ std::nullopt , row_idx, column, column_info);
1408+ initializeDataPage (ptr, end, page.next_row_idx , /* end_row_idx=*/ std::nullopt , row_idx, column, column_info);
14071409 column.next_page_offset = ptr - all_pages.data ();
14081410 }
14091411}
14101412
1411- bool Reader::initializePage (const char * & data_ptr, const char * data_end, size_t next_row_idx, std::optional<size_t > end_row_idx, size_t target_row_idx, ColumnChunk & column, const PrimitiveColumnInfo & column_info)
1413+ std::tuple<parq::PageHeader, std::span<const char >> Reader::decodeAndCheckPageHeader (const char * & data_ptr, const char * data_end) const
1414+ {
1415+ parq::PageHeader header;
1416+ data_ptr += deserializeThriftStruct (header, data_ptr, data_end - data_ptr);
1417+ size_t compressed_page_size = size_t (header.compressed_page_size );
1418+ if (header.compressed_page_size < 0 || compressed_page_size > size_t (data_end - data_ptr))
1419+ throw Exception (ErrorCodes::INCORRECT_DATA, " Page size out of bounds: {} > {}" , header.compressed_page_size , data_end - data_ptr);
1420+
1421+ std::span page_data (data_ptr, compressed_page_size);
1422+ data_ptr += compressed_page_size;
1423+
1424+ if (header.__isset .crc && options.format .parquet .verify_checksums )
1425+ {
1426+ uint32_t crc = arrow::internal::crc32 (0 , page_data.data (), page_data.size ());
1427+ if (crc != uint32_t (header.crc ))
1428+ throw Exception (ErrorCodes::CHECKSUM_DOESNT_MATCH, " Page CRC checksum verification failed" );
1429+ }
1430+
1431+ return {header, page_data};
1432+ }
1433+
1434+ bool Reader::initializeDataPage (const char * & data_ptr, const char * data_end, size_t next_row_idx, std::optional<size_t > end_row_idx, size_t target_row_idx, ColumnChunk & column, const PrimitiveColumnInfo & column_info)
14121435{
14131436 PageState & page = column.page ;
14141437 // / We reuse PageState instance across pages to reuse memory in buffers like decompressed_buf.
@@ -1425,13 +1448,7 @@ bool Reader::initializePage(const char * & data_ptr, const char * data_end, size
14251448 // / Decode page header.
14261449
14271450 parq::PageHeader header;
1428- data_ptr += deserializeThriftStruct (header, data_ptr, data_end - data_ptr);
1429- // / TODO [parquet]: Check checksum.
1430- size_t compressed_page_size = size_t (header.compressed_page_size );
1431- if (header.compressed_page_size < 0 || compressed_page_size > size_t (data_end - data_ptr))
1432- throw Exception (ErrorCodes::INCORRECT_DATA, " Page size out of bounds: {} > {}" , header.compressed_page_size , data_end - data_ptr);
1433- page.data = std::span (data_ptr, compressed_page_size);
1434- data_ptr += compressed_page_size;
1451+ std::tie (header, page.data ) = decodeAndCheckPageHeader (data_ptr, data_end);
14351452
14361453 // / Check if all rows of the page are filtered out, if we have enough information.
14371454
@@ -1525,7 +1542,7 @@ bool Reader::initializePage(const char * & data_ptr, const char * data_end, size
15251542 page.codec = parq::CompressionCodec::UNCOMPRESSED;
15261543 }
15271544
1528- if (encoded_def_size + encoded_rep_size > compressed_page_size )
1545+ if (encoded_def_size + encoded_rep_size > page. data . size () )
15291546 throw Exception (ErrorCodes::INCORRECT_DATA, " Page data is too short (def+rep)" );
15301547 encoded_rep = page.data .data ();
15311548 encoded_def = page.data .data () + encoded_rep_size;
0 commit comments