Skip to content

Commit ef784ea

Browse files
authored
Merge branch 'customizations/24.8.14' into backports/24.8.14/79369
2 parents 7943791 + f45fd21 commit ef784ea

File tree

9 files changed

+258
-17
lines changed

9 files changed

+258
-17
lines changed

src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,17 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
102102
}
103103
case State::FLUSH_PAIR:
104104
{
105-
return flushPair(file, key, value, row_offset);
105+
incrementRowOffset(row_offset);
106+
return state_handler.flushPair(file, key, value);
107+
}
108+
case State::FLUSH_PAIR_AFTER_QUOTED_VALUE:
109+
{
110+
incrementRowOffset(row_offset);
111+
return state_handler.flushPairAfterQuotedValue(file, key, value);
112+
}
113+
case State::WAITING_PAIR_DELIMITER:
114+
{
115+
return state_handler.waitPairDelimiter(file);
106116
}
107117
case State::END:
108118
{
@@ -111,20 +121,14 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
111121
}
112122
}
113123

114-
NextState flushPair(const std::string_view & file, auto & key,
115-
auto & value, uint64_t & row_offset)
124+
void incrementRowOffset(uint64_t & row_offset)
116125
{
117126
row_offset++;
118127

119128
if (row_offset > max_number_of_pairs)
120129
{
121130
throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs);
122131
}
123-
124-
key.commit();
125-
value.commit();
126-
127-
return {0, file.empty() ? State::END : State::WAITING_KEY};
128132
}
129133

130134
void reset(auto & key, auto & value)

src/Functions/keyvaluepair/impl/NeedleFactory.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ template <bool WITH_ESCAPING>
2020
class NeedleFactory
2121
{
2222
public:
23-
SearchSymbols getWaitNeedles(const Configuration & extractor_configuration)
23+
SearchSymbols getWaitKeyNeedles(const Configuration & extractor_configuration)
2424
{
2525
const auto & [key_value_delimiter, quoting_character, pair_delimiters]
2626
= extractor_configuration;
@@ -39,6 +39,17 @@ class NeedleFactory
3939
return SearchSymbols {std::string{needles.data(), needles.size()}};
4040
}
4141

42+
SearchSymbols getWaitPairDelimiterNeedles(const Configuration & extractor_configuration)
43+
{
44+
const auto & pair_delimiters = extractor_configuration.pair_delimiters;
45+
46+
std::vector<char> needles;
47+
48+
std::copy(pair_delimiters.begin(), pair_delimiters.end(), std::back_inserter(needles));
49+
50+
return SearchSymbols {std::string{needles.data(), needles.size()}};
51+
}
52+
4253
SearchSymbols getReadKeyNeedles(const Configuration & extractor_configuration)
4354
{
4455
const auto & [key_value_delimiter, quoting_character, pair_delimiters]

src/Functions/keyvaluepair/impl/StateHandler.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ class StateHandler
3030
READING_QUOTED_VALUE,
3131
// In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END.
3232
FLUSH_PAIR,
33+
// The `READING_QUOTED_VALUE` will assert the closing quoting character is found and then flush the pair. In this case, we should not
34+
// move from `FLUSH_PAIR` directly to `WAITING_FOR_KEY` because a pair delimiter has not been found. Might jump to WAITING_FOR_PAIR_DELIMITER or END
35+
FLUSH_PAIR_AFTER_QUOTED_VALUE,
36+
// Might jump to WAITING_KEY or END.
37+
WAITING_PAIR_DELIMITER,
3338
END
3439
};
3540

src/Functions/keyvaluepair/impl/StateHandlerImpl.h

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,19 @@ class StateHandlerImpl : public StateHandler
4040
* */
4141
NeedleFactory<WITH_ESCAPING> needle_factory;
4242

43-
wait_needles = needle_factory.getWaitNeedles(configuration);
43+
wait_key_needles = needle_factory.getWaitKeyNeedles(configuration);
4444
read_key_needles = needle_factory.getReadKeyNeedles(configuration);
4545
read_value_needles = needle_factory.getReadValueNeedles(configuration);
4646
read_quoted_needles = needle_factory.getReadQuotedNeedles(configuration);
47+
wait_pair_delimiter_needles = needle_factory.getWaitPairDelimiterNeedles(configuration);
4748
}
4849

4950
/*
5051
* Find first character that is considered a valid key character and proceeds to READING_KEY like states.
5152
* */
5253
[[nodiscard]] NextState waitKey(std::string_view file) const
5354
{
54-
if (const auto * p = find_first_not_symbols_or_null(file, wait_needles))
55+
if (const auto * p = find_first_not_symbols_or_null(file, wait_key_needles))
5556
{
5657
const size_t character_position = p - file.begin();
5758
if (isQuotingCharacter(*p))
@@ -284,22 +285,52 @@ class StateHandlerImpl : public StateHandler
284285
{
285286
value.append(file.begin() + pos, file.begin() + character_position);
286287

287-
return {next_pos, State::FLUSH_PAIR};
288+
return {next_pos, State::FLUSH_PAIR_AFTER_QUOTED_VALUE};
288289
}
289290

290291
pos = next_pos;
291292
}
292293

293294
return {file.size(), State::END};
294295
}
296+
[[nodiscard]] NextState flushPair(std::string_view file, auto & key, auto & value) const
297+
{
298+
key.commit();
299+
value.commit();
300+
301+
return {0, file.empty() ? State::END : State::WAITING_KEY};
302+
}
303+
304+
[[nodiscard]] NextState flushPairAfterQuotedValue(std::string_view file, auto & key, auto & value) const
305+
{
306+
key.commit();
307+
value.commit();
308+
309+
return {0, file.empty() ? State::END : State::WAITING_PAIR_DELIMITER};
310+
}
311+
312+
[[nodiscard]] NextState waitPairDelimiter(std::string_view file) const
313+
{
314+
if (const auto * p = find_first_symbols_or_null(file, wait_pair_delimiter_needles))
315+
{
316+
const size_t character_position = p - file.data();
317+
size_t next_pos = character_position + 1u;
318+
319+
return {next_pos, State::WAITING_KEY};
320+
}
321+
322+
return {file.size(), State::END};
323+
}
324+
295325

296326
const Configuration configuration;
297327

298328
private:
299-
SearchSymbols wait_needles;
329+
SearchSymbols wait_key_needles;
300330
SearchSymbols read_key_needles;
301331
SearchSymbols read_value_needles;
302332
SearchSymbols read_quoted_needles;
333+
SearchSymbols wait_pair_delimiter_needles;
303334

304335
/*
305336
* Helper method to copy bytes until `character_pos` and process possible escape sequence. Returns a pair containing a boolean

src/Storages/MergeTree/KeyCondition.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ static String extractFixedPrefixFromRegularExpression(const String & regexp)
101101
const char * pos = begin;
102102
const char * end = regexp.data() + regexp.size();
103103

104-
while (pos != end)
104+
while (pos < end)
105105
{
106106
switch (*pos)
107107
{
@@ -124,19 +124,22 @@ static String extractFixedPrefixFromRegularExpression(const String & regexp)
124124
case '$':
125125
case '.':
126126
case '[':
127+
case ']':
127128
case '?':
128129
case '*':
129130
case '+':
131+
case '\\':
130132
case '{':
133+
case '}':
134+
case '-':
131135
fixed_prefix += *pos;
136+
++pos;
132137
break;
133138
default:
134139
/// all other escape sequences are not supported
135-
pos = end;
136-
break;
140+
pos = end;
137141
}
138142

139-
++pos;
140143
break;
141144
}
142145

tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,3 +381,15 @@ WITH
381381
SELECT
382382
x;
383383
{'age':'31','name':'neymar','nationality':'brazil','team':'psg'}
384+
-- after parsing a quoted value, the next key should only start after a pair delimiter
385+
WITH
386+
extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map,
387+
CAST(
388+
arrayMap(
389+
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
390+
),
391+
'Map(String,String)'
392+
) AS x
393+
SELECT
394+
x;
395+
{'key':'quoted_value','second_key':'0'}

tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,3 +516,15 @@ WITH
516516
) AS x
517517
SELECT
518518
x;
519+
520+
-- after parsing a quoted value, the next key should only start after a pair delimiter
521+
WITH
522+
extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map,
523+
CAST(
524+
arrayMap(
525+
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
526+
),
527+
'Map(String,String)'
528+
) AS x
529+
SELECT
530+
x;
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
Condition: (path in [\'xxx(zzz\', \'xxx(zz{\'))
2+
1
3+
Condition: (path in [\'xxx)zzz\', \'xxx)zz{\'))
4+
1
5+
Condition: (path in [\'xxx^zzz\', \'xxx^zz{\'))
6+
1
7+
Condition: (path in [\'xxx$zzz\', \'xxx$zz{\'))
8+
1
9+
Condition: (path in [\'xxx.zzz\', \'xxx.zz{\'))
10+
1
11+
Condition: (path in [\'xxx[zzz\', \'xxx[zz{\'))
12+
1
13+
Condition: (path in [\'xxx]zzz\', \'xxx]zz{\'))
14+
1
15+
Condition: (path in [\'xxx?zzz\', \'xxx?zz{\'))
16+
1
17+
Condition: (path in [\'xxx*zzz\', \'xxx*zz{\'))
18+
1
19+
Condition: (path in [\'xxx+zzz\', \'xxx+zz{\'))
20+
1
21+
Condition: (path in [\'xxx\\\\zzz\', \'xxx\\\\zz{\'))
22+
1
23+
Condition: (path in [\'xxx{zzz\', \'xxx{zz{\'))
24+
1
25+
Condition: (path in [\'xxx}zzz\', \'xxx}zz{\'))
26+
1
27+
Condition: (path in [\'xxx-zzz\', \'xxx-zz{\'))
28+
1
29+
Condition: (path in [\'xxx\', \'xxy\'))
30+
0
31+
Condition: (path in [\'xxx\', \'xxy\'))
32+
0
33+
Condition: (path in [\'xxx\', \'xxy\'))
34+
0
35+
Condition: (path in [\'xxx\', \'xxy\'))
36+
0
37+
Condition: (path in [\'xxx\', \'xxy\'))
38+
0
39+
Condition: (path in [\'xxx\', \'xxy\'))
40+
0
41+
Condition: (path in [\'xxx\', \'xxy\'))
42+
15
43+
Condition: (path in [\'xxx\', \'xxy\'))
44+
15
45+
Condition: (path in [\'xxx\', \'xxy\'))
46+
15
47+
Condition: (path in [\'xxx\', \'xxy\'))
48+
0
49+
Condition: (path in [\'xxx\', \'xxy\'))
50+
0
51+
Condition: true
52+
Condition: true
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
SET parallel_replicas_local_plan=1;
2+
3+
drop table if exists foo;
4+
5+
CREATE TABLE foo (id UInt8, path String) engine = MergeTree ORDER BY (path) SETTINGS index_granularity=1;
6+
7+
INSERT INTO foo VALUES (1, 'xxx|yyy'),
8+
(2, 'xxx(zzz'),
9+
(3, 'xxx)zzz'),
10+
(4, 'xxx^zzz'),
11+
(5, 'xxx$zzz'),
12+
(6, 'xxx.zzz'),
13+
(7, 'xxx[zzz'),
14+
(8, 'xxx]zzz'),
15+
(9, 'xxx?zzz'),
16+
(10, 'xxx*zzz'),
17+
(11, 'xxx+zzz'),
18+
(12, 'xxx\\zzz'),
19+
(13, 'xxx{zzz'),
20+
(14, 'xxx}zzz'),
21+
(15, 'xxx-zzz');
22+
23+
24+
-- check if also escaped sequence are properly extracted
25+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\(zzz')) WHERE explain like '%Condition%';
26+
SELECT count() FROM foo WHERE match(path, '^xxx\\(zzz') SETTINGS force_primary_key = 1;
27+
28+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\)zzz')) WHERE explain like '%Condition%';
29+
SELECT count() FROM foo WHERE match(path, '^xxx\\)zzz') SETTINGS force_primary_key = 1;
30+
31+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\^zzz')) WHERE explain like '%Condition%';
32+
SELECT count() FROM foo WHERE match(path, '^xxx\\^zzz') SETTINGS force_primary_key = 1;
33+
34+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\$zzz')) WHERE explain like '%Condition%';
35+
SELECT count() FROM foo WHERE match(path, '^xxx\\$zzz') SETTINGS force_primary_key = 1;
36+
37+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\.zzz')) WHERE explain like '%Condition%';
38+
SELECT count() FROM foo WHERE match(path, '^xxx\\.zzz') SETTINGS force_primary_key = 1;
39+
40+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\[zzz')) WHERE explain like '%Condition%';
41+
SELECT count() FROM foo WHERE match(path, '^xxx\\[zzz') SETTINGS force_primary_key = 1;
42+
43+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\]zzz')) WHERE explain like '%Condition%';
44+
SELECT count() FROM foo WHERE match(path, '^xxx\\]zzz') SETTINGS force_primary_key = 1;
45+
46+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\?zzz')) WHERE explain like '%Condition%';
47+
SELECT count() FROM foo WHERE match(path, '^xxx\\?zzz') SETTINGS force_primary_key = 1;
48+
49+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\*zzz')) WHERE explain like '%Condition%';
50+
SELECT count() FROM foo WHERE match(path, '^xxx\\*zzz') SETTINGS force_primary_key = 1;
51+
52+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\+zzz')) WHERE explain like '%Condition%';
53+
SELECT count() FROM foo WHERE match(path, '^xxx\\+zzz') SETTINGS force_primary_key = 1;
54+
55+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\\\zzz')) WHERE explain like '%Condition%';
56+
SELECT count() FROM foo WHERE match(path, '^xxx\\\\zzz') SETTINGS force_primary_key = 1;
57+
58+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\{zzz')) WHERE explain like '%Condition%';
59+
SELECT count() FROM foo WHERE match(path, '^xxx\\{zzz') SETTINGS force_primary_key = 1;
60+
61+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\}zzz')) WHERE explain like '%Condition%';
62+
SELECT count() FROM foo WHERE match(path, '^xxx\\}zzz') SETTINGS force_primary_key = 1;
63+
64+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\-zzz')) WHERE explain like '%Condition%';
65+
SELECT count() FROM foo WHERE match(path, '^xxx\\-zzz') SETTINGS force_primary_key = 1;
66+
67+
68+
-- those regex chars prevent the index use (only 3 first chars used during index scan)
69+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\0bla')) WHERE explain like '%Condition%';
70+
SELECT count() FROM foo WHERE match(path, '^xxx\0bla') SETTINGS force_primary_key = 1;
71+
72+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx(bla)')) WHERE explain like '%Condition%';
73+
SELECT count() FROM foo WHERE match(path, '^xxx(bla)') SETTINGS force_primary_key = 1;
74+
75+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx[bla]')) WHERE explain like '%Condition%';
76+
SELECT count() FROM foo WHERE match(path, '^xxx[bla]') SETTINGS force_primary_key = 1;
77+
78+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx^bla')) WHERE explain like '%Condition%';
79+
SELECT count() FROM foo WHERE match(path, '^xxx^bla') SETTINGS force_primary_key = 1;
80+
81+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx.bla')) WHERE explain like '%Condition%';
82+
SELECT count() FROM foo WHERE match(path, '^xxx.bla') SETTINGS force_primary_key = 1;
83+
84+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx+bla')) WHERE explain like '%Condition%';
85+
SELECT count() FROM foo WHERE match(path, '^xxx+bla') SETTINGS force_primary_key = 1;
86+
87+
88+
-- here the forth char is not used during index, because it has 0+ quantifier
89+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx{0,1}')) WHERE explain like '%Condition%';
90+
SELECT count() FROM foo WHERE match(path, '^xxxx{0,1}') SETTINGS force_primary_key = 1;
91+
92+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx?')) WHERE explain like '%Condition%';
93+
SELECT count() FROM foo WHERE match(path, '^xxxx?') SETTINGS force_primary_key = 1;
94+
95+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx*')) WHERE explain like '%Condition%';
96+
SELECT count() FROM foo WHERE match(path, '^xxxx*') SETTINGS force_primary_key = 1;
97+
98+
-- some unsupported regex chars - only 3 first chars used during index scan
99+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\d+')) WHERE explain like '%Condition%';
100+
SELECT count() FROM foo WHERE match(path, '^xxx\d+') SETTINGS force_primary_key = 1;
101+
102+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\w+')) WHERE explain like '%Condition%';
103+
SELECT count() FROM foo WHERE match(path, '^xxx\w+') SETTINGS force_primary_key = 1;
104+
105+
106+
-- fully disabled for pipes - see https://github.com/ClickHouse/ClickHouse/pull/54696
107+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\|zzz')) WHERE explain like '%Condition%';
108+
SELECT count() FROM foo WHERE match(path, '^xxx\\|zzz') SETTINGS force_primary_key = 1; -- { serverError INDEX_NOT_USED }
109+
110+
SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx|foo')) WHERE explain like '%Condition%';
111+
SELECT count() FROM foo WHERE match(path, '^xxxx|foo') SETTINGS force_primary_key = 1; -- { serverError INDEX_NOT_USED }

0 commit comments

Comments
 (0)