
Commit c3c334f

al13n321 authored and mkmkme committed
Merge pull request ClickHouse#87735 from ClickHouse/pqf2
A few more parquet fixes
1 parent: c87cef9 · commit: c3c334f

55 files changed: +937 −1169 lines


src/Processors/Formats/Impl/Parquet/Reader.cpp

Lines changed: 30 additions & 3 deletions
@@ -211,6 +211,33 @@ parq::FileMetaData Reader::readFileMetaData(Prefetcher & prefetcher)
         }
     }
 
+    /// Consider two quirks:
+    /// (1) Some versions of spark didn't write dictionary_page_offset even when dictionary page is
+    ///     present. Instead, data_page_offset points to the dictionary page.
+    /// (2) Old DuckDB versions (<= 0.10.2) wrote incorrect data_page_offset when dictionary is
+    ///     present.
+    /// We work around (1) in initializePage by allowing dictionary page in place of data page.
+    /// We work around (2) here by converting it to case (1):
+    ///     data_page_offset = dictionary_page_offset
+    ///     dictionary_page_offset.reset()
+    /// Note: newer versions of DuckDB include version number in the `created_by` string, so this
+    /// `if` only applies to relatively old versions. Newer versions don't have this bug.
+    if (file_metadata.created_by == "DuckDB")
+    {
+        for (auto & rg : file_metadata.row_groups)
+        {
+            for (auto & col : rg.columns)
+            {
+                if (!col.__isset.offset_index_offset && col.meta_data.__isset.dictionary_page_offset)
+                {
+                    col.meta_data.data_page_offset = col.meta_data.dictionary_page_offset;
+                    col.meta_data.__isset.dictionary_page_offset = false;
+                    col.meta_data.dictionary_page_offset = 0;
+                }
+            }
+        }
+    }
+
     return file_metadata;
 }
 
@@ -1511,9 +1538,9 @@ bool Reader::initializePage(const char * & data_ptr, const char * data_end, size
         if (column.dictionary.isInitialized())
             throw Exception(ErrorCodes::INCORRECT_DATA, "Column chunk has multiple dictionary pages or inaccurate data_page_offset");
 
-        /// If we got here, this is a weird parquet file that has a dictionary page but no
-        /// dictionary_page_offset in ColumnMetaData. Not sure whether this is allowed, but spark
-        /// can output such files, so we have to support it.
+        /// There's a dictionary page, but there was no dictionary_page_offset in ColumnMetaData.
+        /// This is probably not allowed, but we have to support it because some writers wrote such
+        /// files, see comment in readFileMetaData.
         decodeDictionaryPageImpl(header, page.data, column, column_info);
         return false;
     }
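To make the workaround easier to follow, here is the same normalization reduced to a minimal, self-contained C++ sketch. The structs are simplified stand-ins for the Thrift-generated metadata types: a plain `has_dictionary_page_offset` flag replaces Thrift's `__isset`, row groups are flattened away, `normalizeDuckDBOffsets` is a hypothetical helper name, and the real code additionally requires that the column has no offset index.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-ins for the Thrift-generated parquet metadata structs.
struct ColumnMetaData
{
    int64_t data_page_offset = 0;
    int64_t dictionary_page_offset = 0;
    bool has_dictionary_page_offset = false; // plays the role of __isset
};

struct FileMetaData
{
    std::string created_by;
    std::vector<ColumnMetaData> columns; // row groups flattened away
};

// The workaround from the diff, reduced to its essence: rewrite the DuckDB
// bug (wrong data_page_offset when a dictionary exists) into the Spark quirk
// (data_page_offset points at the dictionary page, no dictionary offset set),
// which the page reader already tolerates.
void normalizeDuckDBOffsets(FileMetaData & meta)
{
    if (meta.created_by != "DuckDB")
        return;
    for (auto & col : meta.columns)
    {
        if (col.has_dictionary_page_offset)
        {
            col.data_page_offset = col.dictionary_page_offset;
            col.has_dictionary_page_offset = false;
            col.dictionary_page_offset = 0;
        }
    }
}

int main()
{
    // A column written by old DuckDB: the dictionary page really starts at
    // byte 4, while data_page_offset is bogus.
    FileMetaData meta;
    meta.created_by = "DuckDB";
    meta.columns.push_back({.data_page_offset = 100,
                            .dictionary_page_offset = 4,
                            .has_dictionary_page_offset = true});

    normalizeDuckDBOffsets(meta);

    // The reader now starts at offset 4 and discovers the dictionary page
    // itself, the same way it handles Spark-written files.
    std::cout << meta.columns[0].data_page_offset << '\n'; // prints 4
}

The design point is that the reader keeps a single tolerant code path: once the DuckDB case is rewritten into the Spark shape, the existing "dictionary page where a data page was expected" handling in initializePage covers both writers.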

src/Processors/Formats/Impl/Parquet/Reader.h

Lines changed: 2 additions & 1 deletion
@@ -25,7 +25,8 @@ namespace DB::Parquet
 
 // TODO [parquet]:
 // * either multistage PREWHERE or make query optimizer selectively move parts of the condition to prewhere instead of the whole condition
-// * test on files from https://github.com/apache/parquet-testing
+// * test on files from https://github.com/apache/parquet-testing and https://www.timestored.com/data/sample/parquet
+// * look at issues in 00900_long_parquet_load.sh
 // * check fields for false sharing, add cacheline padding as needed
 // * make sure userspace page cache read buffer supports readBigAt
 // * support newer parquet versions: https://github.com/apache/parquet-format/blob/master/CHANGES.md

src/Processors/Formats/Impl/Parquet/Write.cpp

Lines changed: 6 additions & 1 deletion
@@ -1322,7 +1322,12 @@ void writeFileFooter(FileWriteState & file,
         meta.num_rows += rg.row_group.num_rows;
         meta.row_groups.push_back(std::move(rg.row_group));
     }
-    meta.__set_created_by(std::string(VERSION_NAME) + " " + VERSION_DESCRIBE);
+
+    /// parquet.thrift sayeth:
+    /// > This should be in the format
+    /// > <Application> version <App Version> (build <App Build Hash>).
+    /// > e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
+    meta.__set_created_by(fmt::format("ClickHouse version {}.{}.{} (build {})", VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_GITHASH));
 
     if (options.write_page_statistics || options.write_column_chunk_statistics)
     {
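The format change matters because parquet readers parse `created_by` to detect writer versions and apply bug workarounds (the DuckDB fix above is one example), and the old `VERSION_NAME + " " + VERSION_DESCRIBE` string did not follow the shape parquet.thrift documents. A minimal sketch of the new string, with made-up values standing in for ClickHouse's build macros:

#include <fmt/format.h>
#include <iostream>

int main()
{
    // Hypothetical values standing in for the VERSION_MAJOR, VERSION_MINOR,
    // VERSION_PATCH and VERSION_GITHASH build-time macros.
    const int version_major = 25;
    const int version_minor = 3;
    const int version_patch = 1;
    const char * version_githash = "6cf94d29b2b7115df4de2c06e2ab4326d721eb55";

    // Same format string as the diff, matching parquet.thrift's convention:
    //     <Application> version <App Version> (build <App Build Hash>)
    std::cout << fmt::format("ClickHouse version {}.{}.{} (build {})",
                             version_major, version_minor, version_patch,
                             version_githash)
              << '\n';
    // -> ClickHouse version 25.3.1 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
}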

src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp

Lines changed: 4 additions & 1 deletion
@@ -899,7 +899,7 @@ void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settin
 
 void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_batch_idx)
 {
-    const bool row_group_prefetch = io_pool != nullptr;
+    bool row_group_prefetch = io_pool != nullptr;
     auto & row_group_batch = row_group_batches[row_group_batch_idx];
 
     parquet::ArrowReaderProperties arrow_properties;
@@ -952,7 +952,10 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat
     // other, failing an assert. So we disable pre-buffering in this case.
     // That version is >10 years old, so this is not very important.
     if (metadata->writer_version().VersionLt(parquet::ApplicationVersion::PARQUET_816_FIXED_VERSION()))
+    {
         arrow_properties.set_pre_buffer(false);
+        row_group_prefetch = false;
+    }
 
     if (format_settings.parquet.use_native_reader)
     {
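The second hunk extends an existing workaround: for files written before the PARQUET-816 fix, Arrow's pre-buffering was already disabled because the computed byte ranges can overlap and trip an assert; the commit now switches off ClickHouse's own row-group prefetch in the same branch, which is why `row_group_prefetch` loses its `const` in the first hunk. A dependency-free sketch of the pattern, using a stand-in for `parquet::ApplicationVersion` and an assumed fix version:

#include <iostream>
#include <tuple>

// Stand-in for parquet::ApplicationVersion; the real check in the diff is
// metadata->writer_version().VersionLt(PARQUET_816_FIXED_VERSION()).
struct WriterVersion
{
    int major = 0;
    int minor = 0;
    int patch = 0;

    bool versionLt(const WriterVersion & other) const
    {
        return std::tie(major, minor, patch)
            < std::tie(other.major, other.minor, other.patch);
    }
};

int main()
{
    const WriterVersion parquet_816_fixed{1, 2, 9}; // assumed parquet-mr fix version
    const WriterVersion writer{1, 2, 8};            // hypothetical old file

    bool row_group_prefetch = true; // no longer const after this commit
    bool pre_buffer = true;

    // The point of the fix: for pre-PARQUET-816 files, both read-ahead
    // mechanisms are now turned off together in one branch.
    if (writer.versionLt(parquet_816_fixed))
    {
        pre_buffer = false;
        row_group_prefetch = false;
    }

    std::cout << pre_buffer << ' ' << row_group_prefetch << '\n'; // prints: 0 0
}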

tests/clickhouse-test

Lines changed: 2 additions & 4 deletions
@@ -1832,19 +1832,17 @@ class TestCase:
                 self.stdout_file,
             ],
             stdout=PIPE,
-            universal_newlines=True,
         ) as diff_proc:
             if self.show_whitespaces_in_diff:
                 with Popen(
                     ["sed", "-e", "s/[ \t]\\+$/&$/g"],
                     stdin=diff_proc.stdout,
                     stdout=PIPE,
                 ) as sed_proc:
-                    diff = sed_proc.communicate()[0].decode(
-                        "utf-8", errors="ignore"
-                    )
+                    diff = sed_proc.communicate()[0]
             else:
                 diff = diff_proc.communicate()[0]
+            diff = diff.decode("utf-8", errors="ignore")
 
         if diff.startswith("Binary files "):
             diff += "Content of stdout:\n===================\n"
