33#include < Core/Settings.h>
44#include < Interpreters/Context.h>
55#include < Interpreters/convertFieldToType.h>
6+ #include < DataTypes/DataTypeLowCardinality.h>
67#include < Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h>
78#include < Functions/keyvaluepair/impl/DuplicateKeyFoundException.h>
89#include < Formats/EscapingRuleUtils.h>
@@ -20,6 +21,7 @@ namespace Setting
/// Error codes thrown by the code in this file. They are only declared here (`extern`),
/// so their definitions live in another translation unit.
2021namespace ErrorCodes
2122{
2223 extern const int INCORRECT_DATA;
24+ extern const int BAD_ARGUMENTS;
2325}
2426
2527namespace HivePartitioningUtils
@@ -83,7 +85,14 @@ NamesAndTypesList extractHivePartitionColumnsFromPath(
8385 {
8486 if (const auto type = tryInferDataTypeByEscapingRule (value, format_settings ? *format_settings : getFormatSettings (context), FormatSettings::EscapingRule::Raw))
8587 {
86- hive_partition_columns_to_read_from_file_path.emplace_back (key, type);
88+ if (type->canBeInsideLowCardinality ())
89+ {
90+ hive_partition_columns_to_read_from_file_path.emplace_back (key, std::make_shared<DataTypeLowCardinality>(type));
91+ }
92+ else
93+ {
94+ hive_partition_columns_to_read_from_file_path.emplace_back (key, type);
95+ }
8796 }
8897 else
8998 {
@@ -122,6 +131,29 @@ void addPartitionColumnsToChunk(
122131 }
123132}
124133
134+ void sanityCheckSchemaAndHivePartitionColumns (const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, const ColumnsDescription & storage_columns)
135+ {
136+ for (const auto & column : hive_partition_columns_to_read_from_file_path)
137+ {
138+ if (!storage_columns.has (column.name ))
139+ {
140+ throw Exception (
141+ ErrorCodes::BAD_ARGUMENTS,
142+ " All hive partitioning columns must be present in the schema. Missing column: {}. "
143+ " If you do not want to use hive partitioning, try `use_hive_partitioning=0` and/or `partition_strategy != hive`" ,
144+ column.name );
145+ }
146+ }
147+
148+ if (storage_columns.size () == hive_partition_columns_to_read_from_file_path.size ())
149+ {
150+ throw Exception (
151+ ErrorCodes::INCORRECT_DATA,
152+ " A hive partitioned file can't contain only partition columns. "
153+ " Try reading it with `use_hive_partitioning=0` and/or `partition_strategy != hive`" );
154+ }
155+ }
156+
125157void extractPartitionColumnsFromPathAndEnrichStorageColumns (
126158 ColumnsDescription & storage_columns,
127159 NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
@@ -144,13 +176,96 @@ void extractPartitionColumnsFromPathAndEnrichStorageColumns(
144176 }
145177 }
146178 }
179+ }
147180
148- if (hive_partition_columns_to_read_from_file_path.size () == storage_columns.size ())
181+ HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage (
182+ ColumnsDescription & columns,
183+ const StorageObjectStorageConfigurationPtr & configuration,
184+ const std::string & sample_path,
185+ bool inferred_schema,
186+ std::optional<FormatSettings> format_settings,
187+ ContextPtr context)
188+ {
189+ NamesAndTypesList hive_partition_columns_to_read_from_file_path;
190+ NamesAndTypesList file_columns;
191+
192+ /*
193+ * If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
194+ * There is no need to read from the file's path.
195+ *
196+ * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
197+ * And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
198+ */
199+ if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
200+ {
201+ hive_partition_columns_to_read_from_file_path = configuration->partition_strategy ->getPartitionColumns ();
202+ }
203+ else if (context->getSettingsRef ()[Setting::use_hive_partitioning])
204+ {
205+ extractPartitionColumnsFromPathAndEnrichStorageColumns (
206+ columns,
207+ hive_partition_columns_to_read_from_file_path,
208+ sample_path,
209+ inferred_schema,
210+ format_settings,
211+ context);
212+ }
213+
214+ sanityCheckSchemaAndHivePartitionColumns (hive_partition_columns_to_read_from_file_path, columns);
215+
216+ if (configuration->partition_columns_in_data_file )
217+ {
218+ file_columns = columns.getAllPhysical ();
219+ }
220+ else
221+ {
222+ std::unordered_set<String> hive_partition_columns_to_read_from_file_path_set;
223+
224+ for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path)
225+ {
226+ hive_partition_columns_to_read_from_file_path_set.insert (name);
227+ }
228+
229+ for (const auto & [name, type] : columns.getAllPhysical ())
230+ {
231+ if (!hive_partition_columns_to_read_from_file_path_set.contains (name))
232+ {
233+ file_columns.emplace_back (name, type);
234+ }
235+ }
236+ }
237+
238+ return {hive_partition_columns_to_read_from_file_path, file_columns};
239+ }
240+
/// Sets up hive partitioning for File/URL-like storages.
///
/// When the `use_hive_partitioning` setting is enabled, the partition columns are extracted from
/// `sample_path` by extractPartitionColumnsFromPathAndEnrichStorageColumns, which receives
/// `columns` by non-const reference. The result is then validated by
/// sanityCheckSchemaAndHivePartitionColumns.
///
/// Returns the partition columns together with all physical columns of `columns` as the file columns.
///
/// NOTE(review): this span is a rendered diff. Lines marked `-` (the old INCORRECT_DATA throw) are
/// code removed by the change and are not part of the new function body.
241+ HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForFileURLLikeStorage (
242+ ColumnsDescription & columns,
243+ const std::string & sample_path,
244+ bool inferred_schema,
245+ std::optional<FormatSettings> format_settings,
246+ ContextPtr context)
247+ {
248+ NamesAndTypesList hive_partition_columns_to_read_from_file_path;
249+ NamesAndTypesList file_columns;
250+
// With the setting disabled the partition column list stays empty.
251+ if (context->getSettingsRef ()[Setting::use_hive_partitioning])
149252 {
150- throw Exception (
151- ErrorCodes::INCORRECT_DATA,
152- " A hive partitioned file can't contain only partition columns. Try reading it with `use_hive_partitioning=0`" );
253+ extractPartitionColumnsFromPathAndEnrichStorageColumns (
254+ columns,
255+ hive_partition_columns_to_read_from_file_path,
256+ sample_path,
257+ inferred_schema,
258+ format_settings,
259+ context);
153260 }
261+
// The check runs unconditionally, also when no partition columns were extracted.
262+ sanityCheckSchemaAndHivePartitionColumns (hive_partition_columns_to_read_from_file_path, columns);
263+
264+ // / Partition strategy is not implemented for File/URL storages,
265+ // / so there is no option to set whether hive partition columns are in the data file or not.
266+ file_columns = columns.getAllPhysical ();
267+
268+ return {hive_partition_columns_to_read_from_file_path, file_columns};
154269}
155270
156271}
0 commit comments