Skip to content

Commit 8072c30

Browse files
committed
return back my parsing
1 parent b5b9959 commit 8072c30

File tree

3 files changed

+164
-82
lines changed

3 files changed

+164
-82
lines changed

src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -484,15 +484,15 @@ static ColumnWithTypeAndName readColumnWithGeoData(const std::shared_ptr<arrow::
484484
if (chunk.IsNull(offset_i))
485485
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Geometry nullable columns are not supported");
486486

487-
String in_data(reinterpret_cast<char*>(raw_data), chunk.value_length(offset_i));
488-
GeometricObject result_object;
487+
ReadBuffer in_buffer(reinterpret_cast<char*>(raw_data), chunk.value_length(offset_i), 0);
488+
ArrowGeometricObject result_object;
489489
switch (geo_metadata.encoding)
490490
{
491491
case GeoEncoding::WKB:
492-
result_object = parseWKBFormat(in_data);
492+
result_object = parseWKBFormat(in_buffer);
493493
break;
494494
case GeoEncoding::WKT:
495-
result_object = parseWKTFormat(in_data);
495+
result_object = parseWKTFormat(in_buffer);
496496
break;
497497
}
498498
column_builder.appendObject(result_object);

src/Processors/Formats/Impl/ArrowGeoTypes.cpp

Lines changed: 139 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
#include <IO/ReadHelpers.h>
77
#include <base/types.h>
88
#include <Common/Exception.h>
9-
#include "Functions/geometryConverters.h"
10-
#include "IO/ReadBufferFromString.h"
119

1210
#include <DataTypes/DataTypeArray.h>
1311
#include <DataTypes/DataTypeTuple.h>
@@ -95,79 +93,71 @@ std::unordered_map<String, GeoColumnMetadata> parseGeoMetadataEncoding(std::opti
9593
return geo_columns;
9694
}
9795

98-
CartesianPoint readPointWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
96+
inline ArrowPoint readPointWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
9997
{
10098
double x;
10199
double y;
102100
readBinaryEndian(x, in_buffer, endian_to_read);
103101
readBinaryEndian(y, in_buffer, endian_to_read);
104-
return CartesianPoint(x, y);
102+
return ArrowPoint{.x = x, .y = y};
105103
}
106104

107-
CartesianLineString readLineWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
105+
inline ArrowLineString readLineWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
108106
{
109107
int num_points;
110108
readBinaryEndian(num_points, in_buffer, endian_to_read);
111109

112-
CartesianLineString line;
110+
ArrowLineString line;
113111
for (int i = 0; i < num_points; ++i)
114112
{
115113
line.push_back(readPointWKB(in_buffer, endian_to_read));
116114
}
117115
return line;
118116
}
119117

120-
CartesianPolygon readPolygonWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
118+
inline ArrowPolygon readPolygonWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
121119
{
122120
int num_lines;
123121
readBinaryEndian(num_lines, in_buffer, endian_to_read);
124122

125-
CartesianPolygon polygon;
126-
{
127-
auto parsed_points = readLineWKB(in_buffer, endian_to_read);
128-
for (const auto & point : parsed_points)
129-
polygon.outer().push_back(point);
130-
}
131-
132-
for (int i = 1; i < num_lines; ++i)
123+
ArrowPolygon polygon;
124+
for (int i = 0; i < num_lines; ++i)
133125
{
134126
auto parsed_points = readLineWKB(in_buffer, endian_to_read);
135-
polygon.inners().push_back({});
136-
for (const auto & point : parsed_points)
137-
polygon.inners().back().push_back(point);
127+
polygon.push_back(std::move(parsed_points));
138128
}
139129
return polygon;
140130
}
141131

142-
GeometricObject parseWKBFormat(ReadBuffer & in_buffer);
132+
ArrowGeometricObject parseWKBFormat(ReadBuffer & in_buffer);
143133

144-
CartesianMultiLineString readMultiLineStringWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
134+
ArrowMultiLineString readMultiLineStringWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
145135
{
146-
CartesianMultiLineString multiline;
136+
ArrowMultiLineString multiline;
147137

148138
int num_lines;
149139
readBinaryEndian(num_lines, in_buffer, endian_to_read);
150140

151141
for (int i = 0; i < num_lines; ++i)
152-
multiline.push_back(std::get<CartesianLineString>(parseWKBFormat(in_buffer)));
142+
multiline.push_back(std::get<ArrowLineString>(parseWKBFormat(in_buffer)));
153143

154144
return multiline;
155145
}
156146

157-
CartesianMultiPolygon readMultiPolygonWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
147+
ArrowMultiPolygon readMultiPolygonWKB(ReadBuffer & in_buffer, std::endian endian_to_read)
158148
{
159-
CartesianMultiPolygon multipolygon;
149+
ArrowMultiPolygon multipolygon;
160150

161151
int num_polygons;
162152
readBinaryEndian(num_polygons, in_buffer, endian_to_read);
163153

164154
for (int i = 0; i < num_polygons; ++i)
165-
multipolygon.push_back(std::get<CartesianPolygon>(parseWKBFormat(in_buffer)));
155+
multipolygon.push_back(std::get<ArrowPolygon>(parseWKBFormat(in_buffer)));
166156

167157
return multipolygon;
168158
}
169159

170-
GeometricObject parseWKBFormat(ReadBuffer & in_buffer)
160+
ArrowGeometricObject parseWKBFormat(ReadBuffer & in_buffer)
171161
{
172162
char little_endian;
173163
if (!in_buffer.read(little_endian))
@@ -195,34 +185,119 @@ GeometricObject parseWKBFormat(ReadBuffer & in_buffer)
195185
}
196186
}
197187

198-
GeometricObject parseWKBFormat(const String & input)
188+
/// Parses one WKT coordinate pair `<x> <y>` from `in_buffer` and returns it as an ArrowPoint.
///
/// Leading spaces are skipped, then X is read, exactly one separator byte is skipped,
/// and Y is read. Nothing after Y (',' or ')') is consumed; that is left to the caller.
///
/// Both coordinates are read with the throwing `readFloatText`, so a malformed X is
/// reported the same way as a malformed Y instead of producing an indeterminate value.
inline ArrowPoint parseWKTPoint(ReadBuffer & in_buffer)
{
    /// Skip leading spaces. `peek` returns false at end of input and leaves `ch` untouched,
    /// so its result has to be checked before `ch` is inspected.
    char ch = 0;
    while (in_buffer.peek(ch) && ch == ' ')
        in_buffer.ignore();

    double x = 0;
    double y = 0;
    readFloatText(x, in_buffer);
    /// Skip the single separator byte between the two coordinates.
    in_buffer.ignore();
    readFloatText(y, in_buffer);
    return {x, y};
}
205+
206+
/// Consumes bytes from `in_buffer` up to and including the next '('.
/// Every byte before the bracket is discarded without inspection.
inline void readOpenBracket(ReadBuffer & in_buffer)
{
    char symbol;
    do
    {
        readBinary(symbol, in_buffer);
    } while (symbol != '(');
}
216+
217+
/// Consumes bytes from `in_buffer` until a list delimiter is found.
/// Returns true when the delimiter is ')' (the current list is finished)
/// and false when it is ',' (another item follows). Other bytes are skipped.
inline bool readNextItem(ReadBuffer & in_buffer)
{
    for (;;)
    {
        char symbol;
        readBinary(symbol, in_buffer);
        switch (symbol)
        {
            case ')':
                return true;
            case ',':
                return false;
            default:
                /// Not a delimiter: keep scanning.
                break;
        }
    }
}
230+
231+
inline ArrowLineString parseWKTLine(ReadBuffer & in_buffer)
199232
{
200-
auto in_buffer = ReadBufferFromString(input);
233+
ArrowLineString ls;
234+
readOpenBracket(in_buffer);
235+
while (true)
236+
{
237+
ls.push_back(parseWKTPoint(in_buffer));
238+
if (readNextItem(in_buffer))
239+
break;
240+
}
241+
return ls;
242+
}
201243

202-
return parseWKBFormat(in_buffer);
244+
/// Parses a parenthesised, comma-separated list of WKT rings from `in_buffer`.
/// Each ring is read with `parseWKTLine`; at least one ring is always read
/// before the closing ')' is looked for.
inline ArrowPolygon parseWKTPolygon(ReadBuffer & in_buffer)
{
    ArrowPolygon rings;
    readOpenBracket(in_buffer);
    do
    {
        rings.push_back(parseWKTLine(in_buffer));
    } while (!readNextItem(in_buffer));
    return rings;
}
204256

205-
GeometricObject parseWKTFormat(const String & input)
257+
inline ArrowMultiPolygon parseWKTMultiPolygon(ReadBuffer & in_buffer)
206258
{
207-
if (input.starts_with("POINT"))
259+
ArrowMultiPolygon poly;
260+
readOpenBracket(in_buffer);
261+
while (true)
208262
{
209-
CartesianPoint point;
210-
boost::geometry::read_wkt(input, point);
211-
return point;
263+
poly.push_back(parseWKTPolygon(in_buffer));
264+
if (readNextItem(in_buffer))
265+
break;
212266
}
213-
if (input.starts_with("POLYGON"))
267+
return poly;
268+
}
269+
270+
ArrowGeometricObject parseWKTFormat(ReadBuffer & in_buffer)
271+
{
272+
std::string type;
273+
while (true)
214274
{
215-
CartesianPolygon polygon;
216-
boost::geometry::read_wkt(input, polygon);
217-
return polygon;
275+
char current_symbol;
276+
in_buffer.peek(current_symbol);
277+
if (current_symbol == '(')
278+
break;
279+
type.push_back(current_symbol);
280+
in_buffer.ignore();
218281
}
219-
if (input.starts_with("LINESTRING"))
282+
283+
while (type.back() == ' ')
284+
type.pop_back();
285+
286+
if (type == "POINT")
220287
{
221-
CartesianLineString linestring;
222-
boost::geometry::read_wkt(input, linestring);
223-
return linestring;
288+
readOpenBracket(in_buffer);
289+
return parseWKTPoint(in_buffer);
224290
}
225-
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown geometry object in WKT {}", input);
291+
if (type == "LINESTRING")
292+
return parseWKTLine(in_buffer);
293+
if (type == "POLYGON")
294+
return parseWKTPolygon(in_buffer);
295+
if (type == "MULTILINESTRING")
296+
return parseWKTPolygon(in_buffer);
297+
if (type == "MULTIPOLYGON")
298+
return parseWKTMultiPolygon(in_buffer);
299+
300+
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Error while reading WKT format: type {}", type);
226301
}
227302

228303
PointColumnBuilder::PointColumnBuilder(const String & name_)
@@ -234,14 +309,14 @@ PointColumnBuilder::PointColumnBuilder(const String & name_)
234309
{
235310
}
236311

237-
void PointColumnBuilder::appendObject(const GeometricObject & object)
312+
void PointColumnBuilder::appendObject(const ArrowGeometricObject & object)
238313
{
239-
if (!std::holds_alternative<CartesianPoint>(object))
314+
if (!std::holds_alternative<ArrowPoint>(object))
240315
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Types in parquet mismatched - expected point");
241316

242-
const auto & point = std::get<CartesianPoint>(object);
243-
point_column_data_x.push_back(point.x());
244-
point_column_data_y.push_back(point.y());
317+
const auto & point = std::get<ArrowPoint>(object);
318+
point_column_data_x.push_back(point.x);
319+
point_column_data_y.push_back(point.y);
245320
}
246321

247322
ColumnWithTypeAndName PointColumnBuilder::getResultColumn()
@@ -265,12 +340,12 @@ LineColumnBuilder::LineColumnBuilder(const String & name_)
265340
{
266341
}
267342

268-
void LineColumnBuilder::appendObject(const GeometricObject & object)
343+
void LineColumnBuilder::appendObject(const ArrowGeometricObject & object)
269344
{
270-
if (!std::holds_alternative<CartesianLineString>(object))
345+
if (!std::holds_alternative<ArrowLineString>(object))
271346
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Types in parquet mismatched - expected line string");
272347

273-
const auto & line = std::get<CartesianLineString>(object);
348+
const auto & line = std::get<ArrowLineString>(object);
274349
for (const auto & point : line)
275350
{
276351
point_column_builder.appendObject(point);
@@ -296,16 +371,15 @@ PolygonColumnBuilder::PolygonColumnBuilder(const String & name_)
296371
{
297372
}
298373

299-
void PolygonColumnBuilder::appendObject(const GeometricObject & object)
374+
void PolygonColumnBuilder::appendObject(const ArrowGeometricObject & object)
300375
{
301-
if (!std::holds_alternative<CartesianPolygon>(object))
376+
if (!std::holds_alternative<ArrowPolygon>(object))
302377
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Types in parquet mismatched - expected polygon");
303378

304-
const auto & polygon = std::get<CartesianPolygon>(object);
305-
line_column_builder.appendObject(CartesianLineString(polygon.outer().begin(), polygon.outer().end()));
306-
for (const auto & inner_circle : polygon.inners())
307-
line_column_builder.appendObject(CartesianLineString(inner_circle.begin(), inner_circle.end()));
308-
offset += 1 + polygon.inners().size();
379+
const auto & polygon = std::get<ArrowPolygon>(object);
380+
for (const auto & inner_circle : polygon)
381+
line_column_builder.appendObject(inner_circle);
382+
offset += polygon.size();
309383
offsets.push_back(offset);
310384
}
311385

@@ -326,12 +400,12 @@ MultiLineStringColumnBuilder::MultiLineStringColumnBuilder(const String & name_)
326400
{
327401
}
328402

329-
void MultiLineStringColumnBuilder::appendObject(const GeometricObject & object)
403+
void MultiLineStringColumnBuilder::appendObject(const ArrowGeometricObject & object)
330404
{
331-
if (!std::holds_alternative<CartesianMultiLineString>(object))
405+
if (!std::holds_alternative<ArrowMultiLineString>(object))
332406
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Types in parquet mismatched - expected multiline");
333407

334-
const auto & multilinestring = std::get<CartesianMultiLineString>(object);
408+
const auto & multilinestring = std::get<ArrowMultiLineString>(object);
335409
for (const auto & line : multilinestring)
336410
line_column_builder.appendObject(line);
337411

@@ -356,12 +430,12 @@ MultiPolygonColumnBuilder::MultiPolygonColumnBuilder(const String & name_)
356430
{
357431
}
358432

359-
void MultiPolygonColumnBuilder::appendObject(const GeometricObject & object)
433+
void MultiPolygonColumnBuilder::appendObject(const ArrowGeometricObject & object)
360434
{
361-
if (!std::holds_alternative<CartesianMultiPolygon>(object))
435+
if (!std::holds_alternative<ArrowMultiPolygon>(object))
362436
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Types in parquet mismatched - expected multi polygon");
363437

364-
const auto & multipolygon = std::get<CartesianMultiPolygon>(object);
438+
const auto & multipolygon = std::get<ArrowMultiPolygon>(object);
365439
for (const auto & polygon : multipolygon)
366440
polygon_column_builder.appendObject(polygon);
367441

@@ -401,7 +475,7 @@ GeoColumnBuilder::GeoColumnBuilder(const String & name_, GeoType type_)
401475
}
402476
}
403477

404-
void GeoColumnBuilder::appendObject(const GeometricObject & object)
478+
void GeoColumnBuilder::appendObject(const ArrowGeometricObject & object)
405479
{
406480
geomery_column_builder->appendObject(object);
407481
}

0 commit comments

Comments
 (0)