Skip to content

Commit b8d7ddb

Browse files
authored
Merge pull request ClickHouse#79067 from arthurpassos/use_extract_key_value_pairs_for_hive
Improve performance of hive path parsing by using `extractKeyValuePairs` instead of regex
2 parents ecace1c + b737d09 commit b8d7ddb

16 files changed

+542
-226
lines changed

src/Core/Block.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const
299299
}
300300

301301

302-
const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const
302+
const ColumnWithTypeAndName * Block::findByName(std::string_view name, bool case_insensitive) const
303303
{
304304
if (case_insensitive)
305305
{
@@ -319,6 +319,11 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool c
319319
return &data[it->second];
320320
}
321321

322+
const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const
323+
{
324+
return findByName(std::string_view{name}, case_insensitive);
325+
}
326+
322327
std::optional<ColumnWithTypeAndName> Block::findSubcolumnByName(const std::string & name) const
323328
{
324329
auto [name_in_storage, subcolumn_name] = Nested::splitName(name);

src/Core/Block.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include <initializer_list>
99
#include <vector>
10+
#include <Common/StringHashForHeterogeneousLookup.h>
1011

1112

1213
class SipHash;
@@ -30,7 +31,7 @@ class Block
3031
{
3132
private:
3233
using Container = ColumnsWithTypeAndName;
33-
using IndexByName = std::unordered_map<String, size_t>;
34+
using IndexByName = std::unordered_map<String, size_t, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
3435

3536
Container data;
3637
IndexByName index_by_name;
@@ -70,6 +71,8 @@ class Block
7071
const_cast<const Block *>(this)->findByName(name, case_insensitive));
7172
}
7273

74+
const ColumnWithTypeAndName * findByName(std::string_view name, bool case_insensitive = false) const;
75+
7376
const ColumnWithTypeAndName * findByName(const std::string & name, bool case_insensitive = false) const;
7477
std::optional<ColumnWithTypeAndName> findSubcolumnByName(const std::string & name) const;
7578
std::optional<ColumnWithTypeAndName> findColumnOrSubcolumnByName(const std::string & name) const;

src/Functions/keyvaluepair/extractKeyValuePairs.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
#include <Interpreters/Context.h>
1212

13-
#include <Functions/keyvaluepair/impl/KeyValuePairExtractor.h>
1413
#include <Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h>
1514
#include <Functions/keyvaluepair/ArgumentExtractor.h>
1615

@@ -29,11 +28,6 @@ class ExtractKeyValuePairs : public IFunction
2928
{
3029
auto builder = KeyValuePairExtractorBuilder();
3130

32-
if constexpr (WITH_ESCAPING)
33-
{
34-
builder.withEscaping();
35-
}
36-
3731
if (parsed_arguments.key_value_delimiter)
3832
{
3933
builder.withKeyValueDelimiter(parsed_arguments.key_value_delimiter.value());
@@ -56,10 +50,17 @@ class ExtractKeyValuePairs : public IFunction
5650
builder.withMaxNumberOfPairs(context->getSettingsRef()[Setting::extract_key_value_pairs_max_pairs_per_row]);
5751
}
5852

59-
return builder.build();
53+
if constexpr (WITH_ESCAPING)
54+
{
55+
return builder.buildWithEscaping();
56+
}
57+
else
58+
{
59+
return builder.buildWithoutEscaping();
60+
}
6061
}
6162

62-
ColumnPtr extract(ColumnPtr data_column, std::shared_ptr<KeyValuePairExtractor> extractor, size_t input_rows_count) const
63+
ColumnPtr extract(ColumnPtr data_column, auto & extractor, size_t input_rows_count) const
6364
{
6465
auto offsets = ColumnUInt64::create();
6566

@@ -72,7 +73,7 @@ class ExtractKeyValuePairs : public IFunction
7273
{
7374
auto row = data_column->getDataAt(i).toView();
7475

75-
auto pairs_count = extractor->extract(row, keys, values);
76+
auto pairs_count = extractor.extract(row, keys, values);
7677

7778
offset += pairs_count;
7879

src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h

Lines changed: 66 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
#include <Columns/ColumnsNumber.h>
66

77
#include <Functions/keyvaluepair/impl/StateHandler.h>
8-
#include <Functions/keyvaluepair/impl/KeyValuePairExtractor.h>
8+
#include <Functions/keyvaluepair/impl/StateHandlerImpl.h>
9+
#include <absl/container/flat_hash_map.h>
910

1011
namespace DB
1112
{
@@ -16,37 +17,36 @@ namespace ErrorCodes
1617
extern const int LIMIT_EXCEEDED;
1718
}
1819

20+
namespace extractKV
21+
{
1922
/*
2023
* Handle state transitions and a few states like `FLUSH_PAIR` and `END`.
2124
* */
2225
template <typename StateHandler>
23-
class CHKeyValuePairExtractor : public KeyValuePairExtractor
26+
class KeyValuePairExtractor
2427
{
2528
using State = typename DB::extractKV::StateHandler::State;
2629
using NextState = DB::extractKV::StateHandler::NextState;
2730

2831
public:
29-
explicit CHKeyValuePairExtractor(StateHandler state_handler_, uint64_t max_number_of_pairs_)
30-
: state_handler(std::move(state_handler_)), max_number_of_pairs(max_number_of_pairs_)
31-
{}
32+
using PairWriter = typename StateHandler::PairWriter;
3233

33-
uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override
34+
KeyValuePairExtractor(const Configuration & configuration_, uint64_t max_number_of_pairs_)
35+
: state_handler(StateHandler(configuration_))
36+
, max_number_of_pairs(max_number_of_pairs_)
3437
{
35-
return extract(std::string_view {data}, keys, values);
3638
}
3739

38-
uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override
40+
protected:
41+
uint64_t extractImpl(std::string_view data, typename StateHandler::PairWriter & pair_writer)
3942
{
4043
auto state = State::WAITING_KEY;
4144

42-
auto key = typename StateHandler::StringWriter(*keys);
43-
auto value = typename StateHandler::StringWriter(*values);
44-
4545
uint64_t row_offset = 0;
4646

4747
while (state != State::END)
4848
{
49-
auto next_state = processState(data, state, key, value, row_offset);
49+
auto next_state = processState(data, state, pair_writer, row_offset);
5050

5151
if (next_state.position_in_string > data.size() && next_state.state != State::END)
5252
{
@@ -61,14 +61,13 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
6161
}
6262

6363
// below reset discards invalid keys and values
64-
reset(key, value);
64+
reset(pair_writer);
6565

6666
return row_offset;
6767
}
6868

6969
private:
70-
71-
NextState processState(std::string_view file, State state, auto & key, auto & value, uint64_t & row_offset)
70+
NextState processState(std::string_view file, State state, auto & pair_writer, uint64_t & row_offset)
7271
{
7372
switch (state)
7473
{
@@ -78,11 +77,11 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
7877
}
7978
case State::READING_KEY:
8079
{
81-
return state_handler.readKey(file, key);
80+
return state_handler.readKey(file, pair_writer);
8281
}
8382
case State::READING_QUOTED_KEY:
8483
{
85-
return state_handler.readQuotedKey(file, key);
84+
return state_handler.readQuotedKey(file, pair_writer);
8685
}
8786
case State::READING_KV_DELIMITER:
8887
{
@@ -94,15 +93,15 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
9493
}
9594
case State::READING_VALUE:
9695
{
97-
return state_handler.readValue(file, value);
96+
return state_handler.readValue(file, pair_writer);
9897
}
9998
case State::READING_QUOTED_VALUE:
10099
{
101-
return state_handler.readQuotedValue(file, value);
100+
return state_handler.readQuotedValue(file, pair_writer);
102101
}
103102
case State::FLUSH_PAIR:
104103
{
105-
return flushPair(file, key, value, row_offset);
104+
return flushPair(file, pair_writer, row_offset);
106105
}
107106
case State::END:
108107
{
@@ -111,8 +110,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
111110
}
112111
}
113112

114-
NextState flushPair(const std::string_view & file, auto & key,
115-
auto & value, uint64_t & row_offset)
113+
NextState flushPair(const std::string_view & file, auto & pair_writer, uint64_t & row_offset)
116114
{
117115
row_offset++;
118116

@@ -121,20 +119,61 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
121119
throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs);
122120
}
123121

124-
key.commit();
125-
value.commit();
122+
pair_writer.commitKey();
123+
pair_writer.commitValue();
126124

127125
return {0, file.empty() ? State::END : State::WAITING_KEY};
128126
}
129127

130-
void reset(auto & key, auto & value)
128+
void reset(auto & pair_writer)
131129
{
132-
key.reset();
133-
value.reset();
130+
pair_writer.resetKey();
131+
pair_writer.resetValue();
134132
}
135133

136134
StateHandler state_handler;
137135
uint64_t max_number_of_pairs;
138136
};
139137

140138
}
139+
140+
struct KeyValuePairExtractorNoEscaping : extractKV::KeyValuePairExtractor<extractKV::NoEscapingStateHandler>
141+
{
142+
using StateHandler = extractKV::NoEscapingStateHandler;
143+
explicit KeyValuePairExtractorNoEscaping(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
144+
: KeyValuePairExtractor(configuration_, max_number_of_pairs_) {}
145+
146+
uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values)
147+
{
148+
auto pair_writer = typename StateHandler::PairWriter(*keys, *values);
149+
return extractImpl(data, pair_writer);
150+
}
151+
};
152+
153+
struct KeyValuePairExtractorInlineEscaping : extractKV::KeyValuePairExtractor<extractKV::InlineEscapingStateHandler>
154+
{
155+
using StateHandler = extractKV::InlineEscapingStateHandler;
156+
explicit KeyValuePairExtractorInlineEscaping(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
157+
: KeyValuePairExtractor(configuration_, max_number_of_pairs_) {}
158+
159+
uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values)
160+
{
161+
auto pair_writer = typename StateHandler::PairWriter(*keys, *values);
162+
return extractImpl(data, pair_writer);
163+
}
164+
};
165+
166+
struct KeyValuePairExtractorReferenceMap : extractKV::KeyValuePairExtractor<extractKV::ReferencesMapStateHandler>
167+
{
168+
using StateHandler = extractKV::ReferencesMapStateHandler;
169+
explicit KeyValuePairExtractorReferenceMap(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
170+
: KeyValuePairExtractor(configuration_, max_number_of_pairs_) {}
171+
172+
uint64_t extract(std::string_view data, absl::flat_hash_map<std::string_view, std::string_view> & map)
173+
{
174+
auto pair_writer = typename StateHandler::PairWriter(map);
175+
return extractImpl(data, pair_writer);
176+
}
177+
};
178+
179+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#pragma once
2+
3+
#include <Common/Exception.h>
4+
5+
namespace DB
6+
{
7+
8+
namespace extractKV
9+
{
10+
11+
struct DuplicateKeyFoundException : Exception
12+
{
13+
explicit DuplicateKeyFoundException(std::string_view key_) : key(key_) {}
14+
15+
std::string_view key;
16+
};
17+
18+
}
19+
20+
}

src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h

Lines changed: 0 additions & 20 deletions
This file was deleted.
Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#include <Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h>
22

3-
#include <Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h>
4-
#include <Functions/keyvaluepair/impl/Configuration.h>
53
#include <Functions/keyvaluepair/impl/StateHandlerImpl.h>
64

75
namespace DB
@@ -25,52 +23,10 @@ KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withQuotingCharacte
2523
return *this;
2624
}
2725

28-
KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withEscaping()
29-
{
30-
with_escaping = true;
31-
return *this;
32-
}
33-
3426
KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withMaxNumberOfPairs(uint64_t max_number_of_pairs_)
3527
{
3628
max_number_of_pairs = max_number_of_pairs_;
3729
return *this;
3830
}
3931

40-
std::shared_ptr<KeyValuePairExtractor> KeyValuePairExtractorBuilder::build() const
41-
{
42-
if (with_escaping)
43-
{
44-
return buildWithEscaping();
45-
}
46-
47-
return buildWithoutEscaping();
48-
}
49-
50-
namespace
51-
{
52-
using namespace extractKV;
53-
54-
template <typename T>
55-
auto makeStateHandler(const T && handler, uint64_t max_number_of_pairs)
56-
{
57-
return std::make_shared<CHKeyValuePairExtractor<T>>(handler, max_number_of_pairs);
58-
}
59-
60-
}
61-
62-
std::shared_ptr<KeyValuePairExtractor> KeyValuePairExtractorBuilder::buildWithoutEscaping() const
63-
{
64-
auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters);
65-
66-
return makeStateHandler(NoEscapingStateHandler(configuration), max_number_of_pairs);
67-
}
68-
69-
std::shared_ptr<KeyValuePairExtractor> KeyValuePairExtractorBuilder::buildWithEscaping() const
70-
{
71-
auto configuration = ConfigurationFactory::createWithEscaping(key_value_delimiter, quoting_character, item_delimiters);
72-
73-
return makeStateHandler(InlineEscapingStateHandler(configuration), max_number_of_pairs);
74-
}
75-
7632
}

0 commit comments

Comments
 (0)