Skip to content

Commit 5bcbe31

Browse files
ksseniiarthurpassos
authored andcommitted
Merge pull request ClickHouse#85538 from arthurpassos/hive_partition_columns_before_virtual_columns
Few fixes to object storage hive reads & writes
1 parent 2d8ac3c commit 5bcbe31

22 files changed

+309
-204
lines changed

src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ struct KeyValuePairExtractorReferenceMap : extractKV::KeyValuePairExtractor<extr
169169
explicit KeyValuePairExtractorReferenceMap(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
170170
: KeyValuePairExtractor(configuration_, max_number_of_pairs_) {}
171171

172-
uint64_t extract(std::string_view data, absl::flat_hash_map<std::string_view, std::string_view> & map)
172+
uint64_t extract(std::string_view data, std::map<std::string_view, std::string_view> & map)
173173
{
174174
auto pair_writer = typename StateHandler::PairWriter(map);
175175
return extractImpl(data, pair_writer);

src/Functions/keyvaluepair/impl/StateHandlerImpl.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
#include <string_view>
1414
#include <string>
1515
#include <vector>
16-
#include <absl/container/flat_hash_map.h>
1716

1817

1918
namespace DB
@@ -577,13 +576,13 @@ struct ReferencesMapStateHandler : public StateHandlerImpl<false>
577576
* */
578577
class PairWriter
579578
{
580-
absl::flat_hash_map<std::string_view, std::string_view> & map;
579+
std::map<std::string_view, std::string_view> & map;
581580

582581
std::string_view key;
583582
std::string_view value;
584583

585584
public:
586-
explicit PairWriter(absl::flat_hash_map<std::string_view, std::string_view> & map_)
585+
explicit PairWriter(std::map<std::string_view, std::string_view> & map_)
587586
: map(map_)
588587
{}
589588

src/Storages/HivePartitioningUtils.cpp

Lines changed: 120 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <Core/Settings.h>
44
#include <Interpreters/Context.h>
55
#include <Interpreters/convertFieldToType.h>
6+
#include <DataTypes/DataTypeLowCardinality.h>
67
#include <Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h>
78
#include <Functions/keyvaluepair/impl/DuplicateKeyFoundException.h>
89
#include <Formats/EscapingRuleUtils.h>
@@ -20,6 +21,7 @@ namespace Setting
2021
namespace ErrorCodes
2122
{
2223
extern const int INCORRECT_DATA;
24+
extern const int BAD_ARGUMENTS;
2325
}
2426

2527
namespace HivePartitioningUtils
@@ -83,7 +85,14 @@ NamesAndTypesList extractHivePartitionColumnsFromPath(
8385
{
8486
if (const auto type = tryInferDataTypeByEscapingRule(value, format_settings ? *format_settings : getFormatSettings(context), FormatSettings::EscapingRule::Raw))
8587
{
86-
hive_partition_columns_to_read_from_file_path.emplace_back(key, type);
88+
if (type->canBeInsideLowCardinality())
89+
{
90+
hive_partition_columns_to_read_from_file_path.emplace_back(key, std::make_shared<DataTypeLowCardinality>(type));
91+
}
92+
else
93+
{
94+
hive_partition_columns_to_read_from_file_path.emplace_back(key, type);
95+
}
8796
}
8897
else
8998
{
@@ -122,6 +131,29 @@ void addPartitionColumnsToChunk(
122131
}
123132
}
124133

134+
void sanityCheckSchemaAndHivePartitionColumns(const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, const ColumnsDescription & storage_columns)
135+
{
136+
for (const auto & column : hive_partition_columns_to_read_from_file_path)
137+
{
138+
if (!storage_columns.has(column.name))
139+
{
140+
throw Exception(
141+
ErrorCodes::BAD_ARGUMENTS,
142+
"All hive partitioning columns must be present in the schema. Missing column: {}. "
143+
"If you do not want to use hive partitioning, try `use_hive_partitioning=0` and/or `partition_strategy != hive`",
144+
column.name);
145+
}
146+
}
147+
148+
if (storage_columns.size() == hive_partition_columns_to_read_from_file_path.size())
149+
{
150+
throw Exception(
151+
ErrorCodes::INCORRECT_DATA,
152+
"A hive partitioned file can't contain only partition columns. "
153+
"Try reading it with `use_hive_partitioning=0` and/or `partition_strategy != hive`");
154+
}
155+
}
156+
125157
void extractPartitionColumnsFromPathAndEnrichStorageColumns(
126158
ColumnsDescription & storage_columns,
127159
NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
@@ -144,13 +176,96 @@ void extractPartitionColumnsFromPathAndEnrichStorageColumns(
144176
}
145177
}
146178
}
179+
}
147180

148-
if (hive_partition_columns_to_read_from_file_path.size() == storage_columns.size())
181+
HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage(
182+
ColumnsDescription & columns,
183+
const StorageObjectStorageConfigurationPtr & configuration,
184+
const std::string & sample_path,
185+
bool inferred_schema,
186+
std::optional<FormatSettings> format_settings,
187+
ContextPtr context)
188+
{
189+
NamesAndTypesList hive_partition_columns_to_read_from_file_path;
190+
NamesAndTypesList file_columns;
191+
192+
/*
193+
* If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
194+
* There is no need to read from the file's path.
195+
*
196+
* Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
197+
* And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
198+
*/
199+
if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
200+
{
201+
hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
202+
}
203+
else if (context->getSettingsRef()[Setting::use_hive_partitioning])
204+
{
205+
extractPartitionColumnsFromPathAndEnrichStorageColumns(
206+
columns,
207+
hive_partition_columns_to_read_from_file_path,
208+
sample_path,
209+
inferred_schema,
210+
format_settings,
211+
context);
212+
}
213+
214+
sanityCheckSchemaAndHivePartitionColumns(hive_partition_columns_to_read_from_file_path, columns);
215+
216+
if (configuration->partition_columns_in_data_file)
217+
{
218+
file_columns = columns.getAllPhysical();
219+
}
220+
else
221+
{
222+
std::unordered_set<String> hive_partition_columns_to_read_from_file_path_set;
223+
224+
for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path)
225+
{
226+
hive_partition_columns_to_read_from_file_path_set.insert(name);
227+
}
228+
229+
for (const auto & [name, type] : columns.getAllPhysical())
230+
{
231+
if (!hive_partition_columns_to_read_from_file_path_set.contains(name))
232+
{
233+
file_columns.emplace_back(name, type);
234+
}
235+
}
236+
}
237+
238+
return {hive_partition_columns_to_read_from_file_path, file_columns};
239+
}
240+
241+
HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForFileURLLikeStorage(
242+
ColumnsDescription & columns,
243+
const std::string & sample_path,
244+
bool inferred_schema,
245+
std::optional<FormatSettings> format_settings,
246+
ContextPtr context)
247+
{
248+
NamesAndTypesList hive_partition_columns_to_read_from_file_path;
249+
NamesAndTypesList file_columns;
250+
251+
if (context->getSettingsRef()[Setting::use_hive_partitioning])
149252
{
150-
throw Exception(
151-
ErrorCodes::INCORRECT_DATA,
152-
"A hive partitioned file can't contain only partition columns. Try reading it with `use_hive_partitioning=0`");
253+
extractPartitionColumnsFromPathAndEnrichStorageColumns(
254+
columns,
255+
hive_partition_columns_to_read_from_file_path,
256+
sample_path,
257+
inferred_schema,
258+
format_settings,
259+
context);
153260
}
261+
262+
sanityCheckSchemaAndHivePartitionColumns(hive_partition_columns_to_read_from_file_path, columns);
263+
264+
/// Partition strategy is not implemented for File/URL storages,
265+
/// so there is no option to set whether hive partition columns are in the data file or not.
266+
file_columns = columns.getAllPhysical();
267+
268+
return {hive_partition_columns_to_read_from_file_path, file_columns};
154269
}
155270

156271
}

src/Storages/HivePartitioningUtils.h

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#pragma once
22

3-
#include <absl/container/flat_hash_map.h>
43
#include <Storages/ColumnsDescription.h>
4+
#include <Storages/ObjectStorage/StorageObjectStorageConfiguration.h>
5+
#include <Core/NamesAndTypes.h>
56

67
namespace DB
78
{
@@ -10,7 +11,7 @@ class Chunk;
1011

1112
namespace HivePartitioningUtils
1213
{
13-
using HivePartitioningKeysAndValues = absl::flat_hash_map<std::string_view, std::string_view>;
14+
using HivePartitioningKeysAndValues = std::map<std::string_view, std::string_view>;
1415

1516
HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const std::string & path);
1617

@@ -19,10 +20,20 @@ void addPartitionColumnsToChunk(
1920
const NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
2021
const std::string & path);
2122

22-
void extractPartitionColumnsFromPathAndEnrichStorageColumns(
23-
ColumnsDescription & storage_columns,
24-
NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
25-
const std::string & path,
23+
/// Hive partition columns and file columns (Note that file columns might not contain the hive partition columns)
24+
using HivePartitionColumnsWithFileColumnsPair = std::pair<NamesAndTypesList, NamesAndTypesList>;
25+
26+
HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage(
27+
ColumnsDescription & columns,
28+
const StorageObjectStorageConfigurationPtr & configuration,
29+
const std::string & sample_path,
30+
bool inferred_schema,
31+
std::optional<FormatSettings> format_settings,
32+
ContextPtr context);
33+
34+
HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForFileURLLikeStorage(
35+
ColumnsDescription & columns,
36+
const std::string & sample_path,
2637
bool inferred_schema,
2738
std::optional<FormatSettings> format_settings,
2839
ContextPtr context);

src/Storages/IPartitionStrategy.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ namespace
118118

119119
if (file_format.empty() || file_format == "auto")
120120
{
121-
throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning");
121+
throw Exception(ErrorCodes::BAD_ARGUMENTS, "File format can't be empty for hive style partitioning");
122122
}
123123

124124
const auto partition_key_description = KeyDescription::getKeyFromAST(partition_by, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context);

src/Storages/ObjectStorage/StorageObjectStorage.cpp

Lines changed: 8 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -172,56 +172,13 @@ StorageObjectStorage::StorageObjectStorage(
172172
}
173173
}
174174

175-
/*
176-
* If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
177-
* There is no need to read from the file's path.
178-
*
179-
* Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
180-
* And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
181-
*/
182-
if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
183-
{
184-
hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
185-
}
186-
else if (context->getSettingsRef()[Setting::use_hive_partitioning])
187-
{
188-
HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
189-
columns,
190-
hive_partition_columns_to_read_from_file_path,
191-
sample_path,
192-
columns_in_table_or_function_definition.empty(),
193-
format_settings,
194-
context);
195-
}
196-
197-
if (hive_partition_columns_to_read_from_file_path.size() == columns.size())
198-
{
199-
throw Exception(
200-
ErrorCodes::INCORRECT_DATA,
201-
"A hive partitioned file can't contain only partition columns. Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`");
202-
}
203-
204-
if (configuration->partition_columns_in_data_file)
205-
{
206-
file_columns = columns;
207-
}
208-
else
209-
{
210-
std::unordered_set<String> hive_partition_columns_to_read_from_file_path_set;
211-
212-
for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path)
213-
{
214-
hive_partition_columns_to_read_from_file_path_set.insert(name);
215-
}
216-
217-
for (const auto & [name, type] : columns.getAllPhysical())
218-
{
219-
if (!hive_partition_columns_to_read_from_file_path_set.contains(name))
220-
{
221-
file_columns.add({name, type});
222-
}
223-
}
224-
}
175+
std::tie(hive_partition_columns_to_read_from_file_path, file_columns) = HivePartitioningUtils::setupHivePartitioningForObjectStorage(
176+
columns,
177+
configuration,
178+
sample_path,
179+
columns_in_table_or_function_definition.empty(),
180+
format_settings,
181+
context);
225182

226183
// Assert file contains at least one column. The assertion only takes place if we were able to deduce the schema. The storage might be empty.
227184
if (!columns.empty() && file_columns.empty())
@@ -500,15 +457,13 @@ void StorageObjectStorage::read(
500457
getName());
501458
}
502459

503-
auto all_file_columns = file_columns.getAll();
504-
505460
auto read_from_format_info = configuration->prepareReadingFromFormat(
506461
object_storage,
507462
column_names,
508463
storage_snapshot,
509464
supportsSubsetOfColumns(local_context),
510465
local_context,
511-
PrepareReadingFromFormatHiveParams { all_file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() });
466+
PrepareReadingFromFormatHiveParams { file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() });
512467

513468
const bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty())
514469
&& local_context->getSettingsRef()[Setting::optimize_count_from_files];

src/Storages/ObjectStorage/StorageObjectStorage.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ class StorageObjectStorage : public IStorage
177177
bool update_configuration_on_read_write = true;
178178

179179
NamesAndTypesList hive_partition_columns_to_read_from_file_path;
180-
ColumnsDescription file_columns;
180+
NamesAndTypesList file_columns;
181181

182182
LoggerPtr log;
183183
};

src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ namespace Setting
2929
namespace ErrorCodes
3030
{
3131
extern const int LOGICAL_ERROR;
32-
extern const int INCORRECT_DATA;
3332
}
3433

3534
String StorageObjectStorageCluster::getPathSample(ContextPtr context)
@@ -88,37 +87,14 @@ StorageObjectStorageCluster::StorageObjectStorageCluster(
8887
if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning] && !configuration->isDataLakeConfiguration() && !configuration->partition_strategy)
8988
sample_path = getPathSample(context_);
9089

91-
/*
92-
* If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
93-
* There is no need to read from the filepath.
94-
*
95-
* Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
96-
* And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
97-
*/
98-
if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
99-
{
100-
hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
101-
}
102-
else if (context_->getSettingsRef()[Setting::use_hive_partitioning])
103-
{
104-
HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
105-
columns,
106-
hive_partition_columns_to_read_from_file_path,
107-
sample_path,
108-
columns_in_table_or_function_definition.empty(),
109-
std::nullopt,
110-
context_
111-
);
112-
}
113-
114-
if (hive_partition_columns_to_read_from_file_path.size() == columns.size())
115-
{
116-
throw Exception(
117-
ErrorCodes::INCORRECT_DATA,
118-
"A hive partitioned file can't contain only partition columns. Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`");
119-
}
120-
121-
/// Hive: Not building the file_columns like `StorageObjectStorage` does because it is not necessary to do it here.
90+
/// Not grabbing the file_columns because it is not necessary to do it here.
91+
std::tie(hive_partition_columns_to_read_from_file_path, std::ignore) = HivePartitioningUtils::setupHivePartitioningForObjectStorage(
92+
columns,
93+
configuration,
94+
sample_path,
95+
columns_in_table_or_function_definition.empty(),
96+
std::nullopt,
97+
context_);
12298

12399
StorageInMemoryMetadata metadata;
124100
metadata.setColumns(columns);

0 commit comments

Comments
 (0)