Skip to content

Commit 6189cd4

Browse files
Merge pull request ClickHouse#62210 from bigo-sg/json_format_early_skip
Improve `JSONEachRowRowInputFormat` by skipping all remaining fields when all required fields are read
2 parents b981d2d + bd63a31 commit 6189cd4

File tree

7 files changed

+47
-0
lines changed

7 files changed

+47
-0
lines changed

src/Core/Settings.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,6 +1055,7 @@ class IColumn;
10551055
M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \
10561056
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
10571057
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
1058+
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
10581059
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
10591060
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
10601061
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \

src/Core/SettingsChangesHistory.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
8989
{"ignore_drop_queries_probability", 0, 0, "Allow to ignore drop queries in server with specified probability for testing purposes"},
9090
{"lightweight_deletes_sync", 2, 2, "The same as 'mutation_sync', but controls only execution of lightweight deletes"},
9191
{"query_cache_system_table_handling", "save", "throw", "The query cache no longer caches results of queries against system tables"},
92+
{"input_format_json_ignore_unnecessary_fields", false, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields"},
9293
{"input_format_hive_text_allow_variable_number_of_columns", false, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values."},
9394
{"first_day_of_week", "Monday", "Monday", "Added a setting for the first day of the week for date/time functions"},
9495
}},

src/Formats/FormatFactory.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
144144
format_settings.json.compact_allow_variable_number_of_columns = settings.input_format_json_compact_allow_variable_number_of_columns;
145145
format_settings.json.try_infer_objects_as_tuples = settings.input_format_json_try_infer_named_tuples_from_objects;
146146
format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence;
147+
format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields;
147148
format_settings.null_as_default = settings.input_format_null_as_default;
148149
format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros;
149150
format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size;

src/Formats/FormatSettings.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ struct FormatSettings
223223
bool try_infer_objects_as_tuples = false;
224224
bool infer_incomplete_types_as_strings = true;
225225
bool throw_on_bad_escape_sequence = true;
226+
bool ignore_unnecessary_fields = true;
226227
} json{};
227228

228229
struct

src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ void JSONEachRowRowInputFormat::readField(size_t index, MutableColumns & columns
132132
throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing JSONEachRow format: {}", columnName(index));
133133

134134
seen_columns[index] = true;
135+
seen_columns_count++;
135136
const auto & type = getPort().getHeader().getByPosition(index).type;
136137
const auto & serialization = serializations[index];
137138
read_columns[index] = JSONUtils::readField(*in, *columns[index], type, serialization, columnName(index), format_settings, yield_strings);
@@ -161,6 +162,14 @@ void JSONEachRowRowInputFormat::readJSONObject(MutableColumns & columns)
161162
for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index)
162163
{
163164
StringRef name_ref = readColumnName(*in);
165+
if (seen_columns_count >= total_columns && format_settings.json.ignore_unnecessary_fields)
166+
{
167+
// Keep parsing (and skipping) the remaining fields in case the JSON is invalid.
168+
// But do not look up the name in name_map, since the cost of that lookup is not negligible.
169+
JSONUtils::skipColon(*in);
170+
skipUnknownField(name_ref);
171+
continue;
172+
}
164173
const size_t column_index = columnIndex(name_ref, key_index);
165174

166175
if (unlikely(ssize_t(column_index) < 0))
@@ -210,6 +219,8 @@ bool JSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi
210219
return false;
211220

212221
size_t num_columns = columns.size();
222+
total_columns = num_columns;
223+
seen_columns_count = 0;
213224

214225
read_columns.assign(num_columns, false);
215226
seen_columns.assign(num_columns, false);

src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ class JSONEachRowRowInputFormat : public IRowInputFormat
8989
std::vector<UInt8> read_columns;
9090
/// Set of columns which already met in row. Exception is thrown if there are more than one column with the same name.
9191
std::vector<UInt8> seen_columns;
92+
size_t seen_columns_count = 0;
93+
size_t total_columns = 0;
9294

9395
/// This flag is needed to know if data is in square brackets.
9496
bool data_in_square_brackets = false;
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<test>
2+
<create_query>
3+
create table t(
4+
f1 Nullable(String),
5+
f2 Nullable(Int32),
6+
f3 Nullable(String),
7+
f4 Nullable(Int32),
8+
f5 Nullable(UInt64),
9+
f6 Nullable(UInt64),
10+
f7 Nullable(String),
11+
f8 Nullable(String)
12+
) ENGINE = File(JSONEachRow)</create_query>
13+
<fill_query>
14+
insert into t
15+
select
16+
'3014660362662815',
17+
123,
18+
'xxsdfsdfs',
19+
22,
20+
240321215532916041,
21+
1711036533457,
22+
'xxxxx',
23+
'sdsfsdfsddf'
24+
from numbers_mt(1000000)
25+
</fill_query>
26+
27+
<query>select count(f1) from t settings max_threads=2</query>
28+
29+
<drop_query>drop table t</drop_query>
30+
</test>

0 commit comments

Comments
 (0)