
Commit c3c334f

al13n321 authored and mkmkme committed
Merge pull request ClickHouse#87735 from ClickHouse/pqf2
A few more parquet fixes
1 parent: c87cef9 · commit: c3c334f

55 files changed: +937 −1169 lines


src/Processors/Formats/Impl/Parquet/Reader.cpp

Lines changed: 30 additions & 3 deletions
@@ -211,6 +211,33 @@ parq::FileMetaData Reader::readFileMetaData(Prefetcher & prefetcher)
         }
     }
 
+    /// Consider two quirks:
+    /// (1) Some versions of spark didn't write dictionary_page_offset even when dictionary page is
+    ///     present. Instead, data_page_offset points to the dictionary page.
+    /// (2) Old DuckDB versions (<= 0.10.2) wrote incorrect data_page_offset when dictionary is
+    ///     present.
+    /// We work around (1) in initializePage by allowing dictionary page in place of data page.
+    /// We work around (2) here by converting it to case (1):
+    ///     data_page_offset = dictionary_page_offset
+    ///     dictionary_page_offset.reset()
+    /// Note: newer versions of DuckDB include version number in the `created_by` string, so this
+    /// `if` only applies to relatively old versions. Newer versions don't have this bug.
+    if (file_metadata.created_by == "DuckDB")
+    {
+        for (auto & rg : file_metadata.row_groups)
+        {
+            for (auto & col : rg.columns)
+            {
+                if (!col.__isset.offset_index_offset && col.meta_data.__isset.dictionary_page_offset)
+                {
+                    col.meta_data.data_page_offset = col.meta_data.dictionary_page_offset;
+                    col.meta_data.__isset.dictionary_page_offset = false;
+                    col.meta_data.dictionary_page_offset = 0;
+                }
+            }
+        }
+    }
+
     return file_metadata;
 }
 
@@ -1511,9 +1538,9 @@ bool Reader::initializePage(const char * & data_ptr, const char * data_end, size
         if (column.dictionary.isInitialized())
             throw Exception(ErrorCodes::INCORRECT_DATA, "Column chunk has multiple dictionary pages or inaccurate data_page_offset");
 
-        /// If we got here, this is a weird parquet file that has a dictionary page but no
-        /// dictionary_page_offset in ColumnMetaData. Not sure whether this is allowed, but spark
-        /// can output such files, so we have to support it.
+        /// There's a dictionary page, but there was no dictionary_page_offset in ColumnMetaData.
+        /// This is probably not allowed, but we have to support it because some writers wrote such
+        /// files, see comment in readFileMetaData.
         decodeDictionaryPageImpl(header, page.data, column, column_info);
         return false;
     }
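To make the workaround easier to follow, here is the same normalization reduced to a minimal, self-contained C++ sketch. The structs are simplified stand-ins for the Thrift-generated metadata types: a plain `has_dictionary_page_offset` flag replaces Thrift's `__isset`, row groups are flattened away, `normalizeDuckDBOffsets` is a hypothetical helper name, and the real code additionally requires that the column has no offset index.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-ins for the Thrift-generated parquet metadata structs.
struct ColumnMetaData
{
    int64_t data_page_offset = 0;
    int64_t dictionary_page_offset = 0;
    bool has_dictionary_page_offset = false; // plays the role of __isset
};

struct FileMetaData
{
    std::string created_by;
    std::vector<ColumnMetaData> columns; // row groups flattened away
};

// The workaround from the diff, reduced to its essence: rewrite the DuckDB
// bug (wrong data_page_offset when a dictionary exists) into the Spark quirk
// (data_page_offset points at the dictionary page, no dictionary offset set),
// which the page reader already tolerates.
void normalizeDuckDBOffsets(FileMetaData & meta)
{
    if (meta.created_by != "DuckDB")
        return;
    for (auto & col : meta.columns)
    {
        if (col.has_dictionary_page_offset)
        {
            col.data_page_offset = col.dictionary_page_offset;
            col.has_dictionary_page_offset = false;
            col.dictionary_page_offset = 0;
        }
    }
}

int main()
{
    // A column written by old DuckDB: the dictionary page really starts at
    // byte 4, while data_page_offset is bogus.
    FileMetaData meta;
    meta.created_by = "DuckDB";
    meta.columns.push_back({.data_page_offset = 100,
                            .dictionary_page_offset = 4,
                            .has_dictionary_page_offset = true});

    normalizeDuckDBOffsets(meta);

    // The reader now starts at offset 4 and discovers the dictionary page
    // itself, the same way it handles Spark-written files.
    std::cout << meta.columns[0].data_page_offset << '\n'; // prints 4
}

The design point is that the reader keeps a single tolerant code path: once the DuckDB case is rewritten into the Spark shape, the existing "dictionary page where a data page was expected" handling in initializePage covers both writers.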

src/Processors/Formats/Impl/Parquet/Reader.h

Lines changed: 2 additions & 1 deletion
@@ -25,7 +25,8 @@ namespace DB::Parquet
 
 // TODO [parquet]:
 // * either multistage PREWHERE or make query optimizer selectively move parts of the condition to prewhere instead of the whole condition
-// * test on files from https://github.com/apache/parquet-testing
+// * test on files from https://github.com/apache/parquet-testing and https://www.timestored.com/data/sample/parquet
+// * look at issues in 00900_long_parquet_load.sh
 // * check fields for false sharing, add cacheline padding as needed
 // * make sure userspace page cache read buffer supports readBigAt
 // * support newer parquet versions: https://github.com/apache/parquet-format/blob/master/CHANGES.md

src/Processors/Formats/Impl/Parquet/Write.cpp

Lines changed: 6 additions & 1 deletion
@@ -1322,7 +1322,12 @@ void writeFileFooter(FileWriteState & file,
         meta.num_rows += rg.row_group.num_rows;
         meta.row_groups.push_back(std::move(rg.row_group));
     }
-    meta.__set_created_by(std::string(VERSION_NAME) + " " + VERSION_DESCRIBE);
+
+    /// parquet.thrift sayeth:
+    /// > This should be in the format
+    /// > <Application> version <App Version> (build <App Build Hash>).
+    /// > e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
+    meta.__set_created_by(fmt::format("ClickHouse version {}.{}.{} (build {})", VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_GITHASH));
 
     if (options.write_page_statistics || options.write_column_chunk_statistics)
     {
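The format change matters because parquet readers parse `created_by` to detect writer versions and apply bug workarounds (the DuckDB fix above is one example), and the old `VERSION_NAME + " " + VERSION_DESCRIBE` string did not follow the shape parquet.thrift documents. A minimal sketch of the new string, with made-up values standing in for ClickHouse's build macros:

#include <fmt/format.h>
#include <iostream>

int main()
{
    // Hypothetical values standing in for the VERSION_MAJOR, VERSION_MINOR,
    // VERSION_PATCH and VERSION_GITHASH build-time macros.
    const int version_major = 25;
    const int version_minor = 3;
    const int version_patch = 1;
    const char * version_githash = "6cf94d29b2b7115df4de2c06e2ab4326d721eb55";

    // Same format string as the diff, matching parquet.thrift's convention:
    //     <Application> version <App Version> (build <App Build Hash>)
    std::cout << fmt::format("ClickHouse version {}.{}.{} (build {})",
                             version_major, version_minor, version_patch,
                             version_githash)
              << '\n';
    // -> ClickHouse version 25.3.1 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
}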

src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp

Lines changed: 4 additions & 1 deletion
@@ -899,7 +899,7 @@ void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settin
 
 void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_batch_idx)
 {
-    const bool row_group_prefetch = io_pool != nullptr;
+    bool row_group_prefetch = io_pool != nullptr;
     auto & row_group_batch = row_group_batches[row_group_batch_idx];
 
     parquet::ArrowReaderProperties arrow_properties;
@@ -952,7 +952,10 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat
     // other, failing an assert. So we disable pre-buffering in this case.
     // That version is >10 years old, so this is not very important.
     if (metadata->writer_version().VersionLt(parquet::ApplicationVersion::PARQUET_816_FIXED_VERSION()))
+    {
         arrow_properties.set_pre_buffer(false);
+        row_group_prefetch = false;
+    }
 
     if (format_settings.parquet.use_native_reader)
     {
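The second hunk extends an existing workaround: for files written before the PARQUET-816 fix, Arrow's pre-buffering was already disabled because the computed byte ranges can overlap and trip an assert; the commit now switches off ClickHouse's own row-group prefetch in the same branch, which is why `row_group_prefetch` loses its `const` in the first hunk. A dependency-free sketch of the pattern, using a stand-in for `parquet::ApplicationVersion` and an assumed fix version:

#include <iostream>
#include <tuple>

// Stand-in for parquet::ApplicationVersion; the real check in the diff is
// metadata->writer_version().VersionLt(PARQUET_816_FIXED_VERSION()).
struct WriterVersion
{
    int major = 0;
    int minor = 0;
    int patch = 0;

    bool versionLt(const WriterVersion & other) const
    {
        return std::tie(major, minor, patch)
            < std::tie(other.major, other.minor, other.patch);
    }
};

int main()
{
    const WriterVersion parquet_816_fixed{1, 2, 9}; // assumed parquet-mr fix version
    const WriterVersion writer{1, 2, 8};            // hypothetical old file

    bool row_group_prefetch = true; // no longer const after this commit
    bool pre_buffer = true;

    // The point of the fix: for pre-PARQUET-816 files, both read-ahead
    // mechanisms are now turned off together in one branch.
    if (writer.versionLt(parquet_816_fixed))
    {
        pre_buffer = false;
        row_group_prefetch = false;
    }

    std::cout << pre_buffer << ' ' << row_group_prefetch << '\n'; // prints: 0 0
}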

tests/clickhouse-test

Lines changed: 2 additions & 4 deletions
@@ -1832,19 +1832,17 @@ class TestCase:
                 self.stdout_file,
             ],
             stdout=PIPE,
-            universal_newlines=True,
         ) as diff_proc:
             if self.show_whitespaces_in_diff:
                 with Popen(
                     ["sed", "-e", "s/[ \t]\\+$/&$/g"],
                     stdin=diff_proc.stdout,
                     stdout=PIPE,
                 ) as sed_proc:
-                    diff = sed_proc.communicate()[0].decode(
-                        "utf-8", errors="ignore"
-                    )
+                    diff = sed_proc.communicate()[0]
             else:
                 diff = diff_proc.communicate()[0]
+            diff = diff.decode("utf-8", errors="ignore")
 
         if diff.startswith("Binary files "):
             diff += "Content of stdout:\n===================\n"
