Skip to content

Commit f45fd21

Browse files
authored
Merge pull request #865 from Altinity/backports/24.8/80657_bug_fixes_plus_changes_in_extract_kvp_logic
24.8 Partial Backport of ClickHouse#80657 - Wait for pair delimiter after flusing quoted value on extractKeyValuePairs
2 parents 957ef2b + c8f5926 commit f45fd21

File tree

6 files changed

+88
-13
lines changed

6 files changed

+88
-13
lines changed

src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,17 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
102102
}
103103
case State::FLUSH_PAIR:
104104
{
105-
return flushPair(file, key, value, row_offset);
105+
incrementRowOffset(row_offset);
106+
return state_handler.flushPair(file, key, value);
107+
}
108+
case State::FLUSH_PAIR_AFTER_QUOTED_VALUE:
109+
{
110+
incrementRowOffset(row_offset);
111+
return state_handler.flushPairAfterQuotedValue(file, key, value);
112+
}
113+
case State::WAITING_PAIR_DELIMITER:
114+
{
115+
return state_handler.waitPairDelimiter(file);
106116
}
107117
case State::END:
108118
{
@@ -111,20 +121,14 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor
111121
}
112122
}
113123

114-
NextState flushPair(const std::string_view & file, auto & key,
115-
auto & value, uint64_t & row_offset)
124+
void incrementRowOffset(uint64_t & row_offset)
116125
{
117126
row_offset++;
118127

119128
if (row_offset > max_number_of_pairs)
120129
{
121130
throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs);
122131
}
123-
124-
key.commit();
125-
value.commit();
126-
127-
return {0, file.empty() ? State::END : State::WAITING_KEY};
128132
}
129133

130134
void reset(auto & key, auto & value)

src/Functions/keyvaluepair/impl/NeedleFactory.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ template <bool WITH_ESCAPING>
2020
class NeedleFactory
2121
{
2222
public:
23-
SearchSymbols getWaitNeedles(const Configuration & extractor_configuration)
23+
SearchSymbols getWaitKeyNeedles(const Configuration & extractor_configuration)
2424
{
2525
const auto & [key_value_delimiter, quoting_character, pair_delimiters]
2626
= extractor_configuration;
@@ -39,6 +39,17 @@ class NeedleFactory
3939
return SearchSymbols {std::string{needles.data(), needles.size()}};
4040
}
4141

42+
SearchSymbols getWaitPairDelimiterNeedles(const Configuration & extractor_configuration)
43+
{
44+
const auto & pair_delimiters = extractor_configuration.pair_delimiters;
45+
46+
std::vector<char> needles;
47+
48+
std::copy(pair_delimiters.begin(), pair_delimiters.end(), std::back_inserter(needles));
49+
50+
return SearchSymbols {std::string{needles.data(), needles.size()}};
51+
}
52+
4253
SearchSymbols getReadKeyNeedles(const Configuration & extractor_configuration)
4354
{
4455
const auto & [key_value_delimiter, quoting_character, pair_delimiters]

src/Functions/keyvaluepair/impl/StateHandler.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ class StateHandler
3030
READING_QUOTED_VALUE,
3131
// In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END.
3232
FLUSH_PAIR,
33+
// The `READING_QUOTED_VALUE` will assert the closing quoting character is found and then flush the pair. In this case, we should not
34+
// move from `FLUSH_PAIR` directly to `WAITING_FOR_KEY` because a pair delimiter has not been found. Might jump to WAITING_FOR_PAIR_DELIMITER or END
35+
FLUSH_PAIR_AFTER_QUOTED_VALUE,
36+
// Might jump to WAITING_KEY or END.
37+
WAITING_PAIR_DELIMITER,
3338
END
3439
};
3540

src/Functions/keyvaluepair/impl/StateHandlerImpl.h

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,19 @@ class StateHandlerImpl : public StateHandler
4040
* */
4141
NeedleFactory<WITH_ESCAPING> needle_factory;
4242

43-
wait_needles = needle_factory.getWaitNeedles(configuration);
43+
wait_key_needles = needle_factory.getWaitKeyNeedles(configuration);
4444
read_key_needles = needle_factory.getReadKeyNeedles(configuration);
4545
read_value_needles = needle_factory.getReadValueNeedles(configuration);
4646
read_quoted_needles = needle_factory.getReadQuotedNeedles(configuration);
47+
wait_pair_delimiter_needles = needle_factory.getWaitPairDelimiterNeedles(configuration);
4748
}
4849

4950
/*
5051
* Find first character that is considered a valid key character and proceeds to READING_KEY like states.
5152
* */
5253
[[nodiscard]] NextState waitKey(std::string_view file) const
5354
{
54-
if (const auto * p = find_first_not_symbols_or_null(file, wait_needles))
55+
if (const auto * p = find_first_not_symbols_or_null(file, wait_key_needles))
5556
{
5657
const size_t character_position = p - file.begin();
5758
if (isQuotingCharacter(*p))
@@ -284,22 +285,52 @@ class StateHandlerImpl : public StateHandler
284285
{
285286
value.append(file.begin() + pos, file.begin() + character_position);
286287

287-
return {next_pos, State::FLUSH_PAIR};
288+
return {next_pos, State::FLUSH_PAIR_AFTER_QUOTED_VALUE};
288289
}
289290

290291
pos = next_pos;
291292
}
292293

293294
return {file.size(), State::END};
294295
}
296+
[[nodiscard]] NextState flushPair(std::string_view file, auto & key, auto & value) const
297+
{
298+
key.commit();
299+
value.commit();
300+
301+
return {0, file.empty() ? State::END : State::WAITING_KEY};
302+
}
303+
304+
[[nodiscard]] NextState flushPairAfterQuotedValue(std::string_view file, auto & key, auto & value) const
305+
{
306+
key.commit();
307+
value.commit();
308+
309+
return {0, file.empty() ? State::END : State::WAITING_PAIR_DELIMITER};
310+
}
311+
312+
[[nodiscard]] NextState waitPairDelimiter(std::string_view file) const
313+
{
314+
if (const auto * p = find_first_symbols_or_null(file, wait_pair_delimiter_needles))
315+
{
316+
const size_t character_position = p - file.data();
317+
size_t next_pos = character_position + 1u;
318+
319+
return {next_pos, State::WAITING_KEY};
320+
}
321+
322+
return {file.size(), State::END};
323+
}
324+
295325

296326
const Configuration configuration;
297327

298328
private:
299-
SearchSymbols wait_needles;
329+
SearchSymbols wait_key_needles;
300330
SearchSymbols read_key_needles;
301331
SearchSymbols read_value_needles;
302332
SearchSymbols read_quoted_needles;
333+
SearchSymbols wait_pair_delimiter_needles;
303334

304335
/*
305336
* Helper method to copy bytes until `character_pos` and process possible escape sequence. Returns a pair containing a boolean

tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,3 +381,15 @@ WITH
381381
SELECT
382382
x;
383383
{'age':'31','name':'neymar','nationality':'brazil','team':'psg'}
384+
-- after parsing a quoted value, the next key should only start after a pair delimiter
385+
WITH
386+
extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map,
387+
CAST(
388+
arrayMap(
389+
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
390+
),
391+
'Map(String,String)'
392+
) AS x
393+
SELECT
394+
x;
395+
{'key':'quoted_value','second_key':'0'}

tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,3 +516,15 @@ WITH
516516
) AS x
517517
SELECT
518518
x;
519+
520+
-- after parsing a quoted value, the next key should only start after a pair delimiter
521+
WITH
522+
extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map,
523+
CAST(
524+
arrayMap(
525+
(x) -> (x, s_map[x]), arraySort(mapKeys(s_map))
526+
),
527+
'Map(String,String)'
528+
) AS x
529+
SELECT
530+
x;

0 commit comments

Comments
 (0)