Skip to content

Commit b884cb9

Browse files
Merge pull request ClickHouse#80050 from scanhex12/impore_geo_parquets
Improve geo parquet
2 parents 0e8a695 + 92393cc commit b884cb9

17 files changed: 89 additions and 10 deletions

src/Core/FormatFactorySettings.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1316,6 +1316,10 @@ Set the quoting style for identifiers in SHOW CREATE query
13161316
Limits the size of the blocks formed during data parsing in input formats in bytes. Used in row based input formats when block is formed on ClickHouse side.
13171317
0 means no limit in bytes.
13181318
)", 0) \
1319+
DECLARE(Bool, input_format_parquet_allow_geoparquet_parser, true, R"(
1320+
Use geo column parser to convert Array(UInt8) into Point/Linestring/Polygon/MultiLineString/MultiPolygon types
1321+
)", 0) \
1322+
13191323

13201324
// End of FORMAT_FACTORY_SETTINGS
13211325

src/Core/SettingsChangesHistory.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
100100
{"allow_experimental_database_hms_catalog", false, false, "Allow experimental database engine DataLakeCatalog with catalog_type = 'hive'"},
101101
{"compile_expressions", false, true, "We believe that the LLVM infrastructure behind the JIT compiler is stable enough to enable this setting by default."},
102102
{"use_legacy_to_time", false, false, "New setting. Allows for user to use the old function logic for toTime, which works as toTimeWithFixedDate."},
103+
{"input_format_parquet_allow_geoparquet_parser", false, true, "A new setting to use geo columns in parquet file"},
103104
});
104105
addSettingsChanges(settings_changes_history, "25.4",
105106
{

src/Formats/FormatFactory.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -214,6 +214,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
214214
format_settings.parquet.bloom_filter_flush_threshold_bytes = settings[Setting::output_format_parquet_bloom_filter_flush_threshold_bytes];
215215
format_settings.parquet.local_read_min_bytes_for_seek = settings[Setting::input_format_parquet_local_file_min_bytes_for_seek];
216216
format_settings.parquet.enable_row_group_prefetch = settings[Setting::input_format_parquet_enable_row_group_prefetch];
217+
format_settings.parquet.allow_geoparquet_parser = settings[Setting::input_format_parquet_allow_geoparquet_parser];
217218
format_settings.pretty.charset = settings[Setting::output_format_pretty_grid_charset].toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
218219
format_settings.pretty.color = settings[Setting::output_format_pretty_color].valueOr(2);
219220
format_settings.pretty.glue_chunks = settings[Setting::output_format_pretty_glue_chunks].valueOr(2);

src/Formats/FormatSettings.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -304,6 +304,7 @@ struct FormatSettings
304304
size_t local_read_min_bytes_for_seek = 8192;
305305
double bloom_filter_bits_per_value = 10.5;
306306
size_t bloom_filter_flush_threshold_bytes = 1024 * 1024 * 128;
307+
bool allow_geoparquet_parser = true;
307308
} parquet{};
308309

309310
struct Pretty

src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -166,6 +166,7 @@ void ArrowBlockInputFormat::prepareReader()
166166
format_settings.arrow.allow_missing_columns,
167167
format_settings.null_as_default,
168168
format_settings.date_time_overflow_behavior,
169+
format_settings.parquet.allow_geoparquet_parser,
169170
format_settings.arrow.case_insensitive_column_matching,
170171
stream);
171172

@@ -212,7 +213,9 @@ NamesAndTypesList ArrowSchemaReader::readSchema()
212213
file_reader ? file_reader->metadata() : nullptr,
213214
stream ? "ArrowStream" : "Arrow",
214215
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference,
215-
format_settings.schema_inference_make_columns_nullable != 0);
216+
format_settings.schema_inference_make_columns_nullable != 0,
217+
false,
218+
format_settings.parquet.allow_geoparquet_parser);
216219
if (format_settings.schema_inference_make_columns_nullable == 1)
217220
return getNamesAndRecursivelyNullableTypes(header, format_settings);
218221
return header.getNamesAndTypesList();

src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -482,8 +482,10 @@ static ColumnWithTypeAndName readColumnWithGeoData(const std::shared_ptr<arrow::
482482
{
483483
auto * raw_data = buffer->mutable_data() + chunk.value_offset(offset_i);
484484
if (chunk.IsNull(offset_i))
485-
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Geometry nullable columns are not supported");
486-
485+
{
486+
column_builder.appendDefault();
487+
continue;
488+
}
487489
ReadBuffer in_buffer(reinterpret_cast<char*>(raw_data), chunk.value_length(offset_i), 0);
488490
ArrowGeometricObject result_object;
489491
switch (geo_metadata.encoding)
@@ -789,6 +791,7 @@ struct ReadColumnFromArrowColumnSettings
789791
bool skip_columns_with_unsupported_types;
790792
bool allow_inferring_nullable_columns;
791793
bool case_insensitive_matching;
794+
bool allow_geoparquet_parser;
792795
};
793796

794797
static ColumnWithTypeAndName readColumnFromArrowColumn(
@@ -846,7 +849,7 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn(
846849
break;
847850
}
848851
}
849-
if (geo_metadata)
852+
if (geo_metadata && settings.allow_geoparquet_parser)
850853
{
851854
return readColumnWithGeoData(arrow_column, column_name, *geo_metadata);
852855
}
@@ -1324,7 +1327,8 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
13241327
const std::string & format_name,
13251328
bool skip_columns_with_unsupported_types,
13261329
bool allow_inferring_nullable_columns,
1327-
bool case_insensitive_matching)
1330+
bool case_insensitive_matching,
1331+
bool allow_geoparquet_parser)
13281332
{
13291333
ReadColumnFromArrowColumnSettings settings
13301334
{
@@ -1333,7 +1337,8 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
13331337
.allow_arrow_null_type = false,
13341338
.skip_columns_with_unsupported_types = skip_columns_with_unsupported_types,
13351339
.allow_inferring_nullable_columns = allow_inferring_nullable_columns,
1336-
.case_insensitive_matching = case_insensitive_matching
1340+
.case_insensitive_matching = case_insensitive_matching,
1341+
.allow_geoparquet_parser = allow_geoparquet_parser
13371342
};
13381343

13391344
ColumnsWithTypeAndName sample_columns;
@@ -1371,13 +1376,15 @@ ArrowColumnToCHColumn::ArrowColumnToCHColumn(
13711376
bool allow_missing_columns_,
13721377
bool null_as_default_,
13731378
FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior_,
1379+
bool allow_geoparquet_parser_,
13741380
bool case_insensitive_matching_,
13751381
bool is_stream_)
13761382
: header(header_)
13771383
, format_name(format_name_)
13781384
, allow_missing_columns(allow_missing_columns_)
13791385
, null_as_default(null_as_default_)
13801386
, date_time_overflow_behavior(date_time_overflow_behavior_)
1387+
, allow_geoparquet_parser(allow_geoparquet_parser_)
13811388
, case_insensitive_matching(case_insensitive_matching_)
13821389
, is_stream(is_stream_)
13831390
{
@@ -1421,7 +1428,8 @@ Chunk ArrowColumnToCHColumn::arrowColumnsToCHChunk(
14211428
.allow_arrow_null_type = true,
14221429
.skip_columns_with_unsupported_types = false,
14231430
.allow_inferring_nullable_columns = true,
1424-
.case_insensitive_matching = case_insensitive_matching
1431+
.case_insensitive_matching = case_insensitive_matching,
1432+
.allow_geoparquet_parser = allow_geoparquet_parser
14251433
};
14261434

14271435
Columns columns;

src/Processors/Formats/Impl/ArrowColumnToCHColumn.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ class ArrowColumnToCHColumn
2727
bool allow_missing_columns_,
2828
bool null_as_default_,
2929
FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior_,
30+
bool allow_geoparquet_parser_,
3031
bool case_insensitive_matching_ = false,
3132
bool is_stream_ = false);
3233

@@ -43,7 +44,8 @@ class ArrowColumnToCHColumn
4344
const std::string & format_name,
4445
bool skip_columns_with_unsupported_types = false,
4546
bool allow_inferring_nullable_columns = true,
46-
bool case_insensitive_matching = false);
47+
bool case_insensitive_matching = false,
48+
bool allow_geoparquet_parser = true);
4749

4850
struct DictionaryInfo
4951
{
@@ -73,6 +75,7 @@ class ArrowColumnToCHColumn
7375
bool allow_missing_columns;
7476
bool null_as_default;
7577
FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior;
78+
bool allow_geoparquet_parser;
7679
bool case_insensitive_matching;
7780
bool is_stream;
7881

src/Processors/Formats/Impl/ArrowGeoTypes.cpp

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,12 @@ void PointColumnBuilder::appendObject(const ArrowGeometricObject & object)
321321
point_column_data_y.push_back(point.y);
322322
}
323323

324+
void PointColumnBuilder::appendDefault()
325+
{
326+
point_column_data_x.push_back(0);
327+
point_column_data_y.push_back(0);
328+
}
329+
324330
ColumnWithTypeAndName PointColumnBuilder::getResultColumn()
325331
{
326332
ColumnPtr result_x = point_column_x->getPtr();
@@ -356,6 +362,11 @@ void LineColumnBuilder::appendObject(const ArrowGeometricObject & object)
356362
offsets.push_back(offset);
357363
}
358364

365+
void LineColumnBuilder::appendDefault()
366+
{
367+
offsets.push_back(offset);
368+
}
369+
359370
ColumnWithTypeAndName LineColumnBuilder::getResultColumn()
360371
{
361372
auto all_points_column = point_column_builder.getResultColumn();
@@ -385,6 +396,11 @@ void PolygonColumnBuilder::appendObject(const ArrowGeometricObject & object)
385396
offsets.push_back(offset);
386397
}
387398

399+
void PolygonColumnBuilder::appendDefault()
400+
{
401+
offsets.push_back(offset);
402+
}
403+
388404
ColumnWithTypeAndName PolygonColumnBuilder::getResultColumn()
389405
{
390406
auto all_points_column = line_column_builder.getResultColumn();
@@ -415,6 +431,11 @@ void MultiLineStringColumnBuilder::appendObject(const ArrowGeometricObject & obj
415431
offsets.push_back(offset);
416432
}
417433

434+
void MultiLineStringColumnBuilder::appendDefault()
435+
{
436+
offsets.push_back(offset);
437+
}
438+
418439
ColumnWithTypeAndName MultiLineStringColumnBuilder::getResultColumn()
419440
{
420441
auto all_points_column = line_column_builder.getResultColumn();
@@ -445,6 +466,11 @@ void MultiPolygonColumnBuilder::appendObject(const ArrowGeometricObject & object
445466
offsets.push_back(offset);
446467
}
447468

469+
void MultiPolygonColumnBuilder::appendDefault()
470+
{
471+
offsets.push_back(offset);
472+
}
473+
448474
ColumnWithTypeAndName MultiPolygonColumnBuilder::getResultColumn()
449475
{
450476
auto all_points_column = polygon_column_builder.getResultColumn();
@@ -482,6 +508,12 @@ void GeoColumnBuilder::appendObject(const ArrowGeometricObject & object)
482508
geomery_column_builder->appendObject(object);
483509
}
484510

511+
void GeoColumnBuilder::appendDefault()
512+
{
513+
geomery_column_builder->appendDefault();
514+
}
515+
516+
485517
ColumnWithTypeAndName GeoColumnBuilder::getResultColumn()
486518
{
487519
return geomery_column_builder->getResultColumn();

src/Processors/Formats/Impl/ArrowGeoTypes.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,7 @@ using ArrowGeometricObject = std::variant<ArrowPoint, ArrowLineString, ArrowPoly
6262
struct IGeometryColumnBuilder
6363
{
6464
virtual void appendObject(const ArrowGeometricObject & object) = 0;
65+
virtual void appendDefault() = 0;
6566
virtual ColumnWithTypeAndName getResultColumn() = 0;
6667

6768
virtual ~IGeometryColumnBuilder() = default;
@@ -73,6 +74,7 @@ class PointColumnBuilder : public IGeometryColumnBuilder
7374
explicit PointColumnBuilder(const String & name_);
7475

7576
void appendObject(const ArrowGeometricObject & object) override;
77+
void appendDefault() override;
7678

7779
ColumnWithTypeAndName getResultColumn() override;
7880

@@ -93,6 +95,7 @@ class LineColumnBuilder : public IGeometryColumnBuilder
9395
explicit LineColumnBuilder(const String & name_);
9496

9597
void appendObject(const ArrowGeometricObject & object) override;
98+
void appendDefault() override;
9699

97100
ColumnWithTypeAndName getResultColumn() override;
98101

@@ -114,6 +117,7 @@ class PolygonColumnBuilder : public IGeometryColumnBuilder
114117
explicit PolygonColumnBuilder(const String & name_);
115118

116119
void appendObject(const ArrowGeometricObject & object) override;
120+
void appendDefault() override;
117121

118122
ColumnWithTypeAndName getResultColumn() override;
119123

@@ -135,6 +139,7 @@ class MultiLineStringColumnBuilder : public IGeometryColumnBuilder
135139
explicit MultiLineStringColumnBuilder(const String & name_);
136140

137141
void appendObject(const ArrowGeometricObject & object) override;
142+
void appendDefault() override;
138143

139144
ColumnWithTypeAndName getResultColumn() override;
140145

@@ -156,6 +161,7 @@ class MultiPolygonColumnBuilder : public IGeometryColumnBuilder
156161
explicit MultiPolygonColumnBuilder(const String & name_);
157162

158163
void appendObject(const ArrowGeometricObject & object) override;
164+
void appendDefault() override;
159165

160166
ColumnWithTypeAndName getResultColumn() override;
161167

@@ -178,6 +184,7 @@ class GeoColumnBuilder : public IGeometryColumnBuilder
178184
explicit GeoColumnBuilder(const String & name_, GeoType type_);
179185

180186
void appendObject(const ArrowGeometricObject & object) override;
187+
void appendDefault() override;
181188

182189
ColumnWithTypeAndName getResultColumn() override;
183190

src/Processors/Formats/Impl/ORCBlockInputFormat.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,7 @@ void ORCBlockInputFormat::prepareReader()
140140
format_settings.orc.allow_missing_columns,
141141
format_settings.null_as_default,
142142
format_settings.date_time_overflow_behavior,
143+
format_settings.parquet.allow_geoparquet_parser,
143144
format_settings.orc.case_insensitive_column_matching);
144145

145146
const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
@@ -174,12 +175,15 @@ void ORCSchemaReader::initializeIfNeeded()
174175
NamesAndTypesList ORCSchemaReader::readSchema()
175176
{
176177
initializeIfNeeded();
178+
177179
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
178180
*schema,
179181
metadata,
180182
"ORC",
181183
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference,
182-
format_settings.schema_inference_make_columns_nullable != 0);
184+
format_settings.schema_inference_make_columns_nullable != 0,
185+
false,
186+
format_settings.parquet.allow_geoparquet_parser);
183187
if (format_settings.schema_inference_make_columns_nullable == 1)
184188
return getNamesAndRecursivelyNullableTypes(header, format_settings);
185189
return header.getNamesAndTypesList();

0 commit comments

Comments (0)