Skip to content

Commit 8534856

Browse files
committed
test: added test cases for more data types
1 parent 6fc5611 commit 8534856

File tree

6 files changed

+346
-70
lines changed

6 files changed

+346
-70
lines changed

programs/local/PandasAnalyzer.cpp

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#include "PandasAnalyzer.h"
22
#include "PythonConversion.h"
33
#include "PythonImporter.h"
4+
#include "base/defines.h"
45

56
#include <Common/Exception.h>
67
#include <DataTypes/DataTypesNumber.h>
@@ -25,6 +26,12 @@ bool PandasAnalyzer::Analyze(py::object column) {
2526
if (sample_size == 0)
2627
return false;
2728

29+
if (sample_size < 0)
30+
{
31+
analyzed_type = std::make_shared<DataTypeObject>(DataTypeObject::SchemaFormat::JSON);
32+
return true;
33+
}
34+
2835
auto & import_cache = PythonImporter::ImportCache();
2936
auto pandas = import_cache.pandas();
3037
if (!pandas)
@@ -42,7 +49,7 @@ bool PandasAnalyzer::Analyze(py::object column) {
4249

4350
size_t PandasAnalyzer::getSampleIncrement(size_t rows)
4451
{
45-
auto sample = sample_size;
52+
auto sample = static_cast<uint64_t>(sample_size);
4653
if (sample > rows)
4754
sample = rows;
4855

programs/local/PandasAnalyzer.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ class PandasAnalyzer {
3030
size_t getSampleIncrement(size_t rows);
3131

3232
private:
33-
uint64_t sample_size;
33+
int64_t sample_size;
3434
PythonGILWrapper gil;
3535
DB::DataTypePtr analyzed_type;
3636
};

programs/local/PythonConversion.cpp

Lines changed: 84 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@ bool isInteger(const py::handle & obj)
9090
return GetPythonObjectType(obj) == PythonObjectType::Integer;
9191
}
9292

93-
void writeInteger(const py::handle & obj, rapidjson::Value & json_value)
93+
static void writeInteger(const py::handle & obj, rapidjson::Value & json_value)
9494
{
9595
auto ptr = obj.ptr();
9696
int overflow = 0;
@@ -137,7 +137,7 @@ bool isNone(const py::handle & obj)
137137
return GetPythonObjectType(obj) == PythonObjectType::None;
138138
}
139139

140-
void writeNone(const py::handle & obj, rapidjson::Value & json_value)
140+
static void writeNone(const py::handle & obj, rapidjson::Value & json_value)
141141
{
142142
json_value.SetNull();
143143
}
@@ -147,25 +147,25 @@ bool isFloat(const py::handle & obj)
147147
return GetPythonObjectType(obj) == PythonObjectType::Float;
148148
}
149149

150-
void writeFloat(const py::handle & obj, rapidjson::Value & json_value)
150+
static void writeFloat(const py::handle & obj, rapidjson::Value & json_value)
151151
{
152152
auto ptr = obj.ptr();
153-
if (std::isnan(PyFloat_AsDouble(ptr)))
154-
{
155-
json_value.SetNull();
156-
return;
157-
}
153+
double value = PyFloat_AsDouble(ptr);
158154

159-
double value = obj.cast<double>();
160-
json_value.SetDouble(value);
155+
if (std::isnan(value) || std::isinf(value)) {
156+
json_value.SetNull();
157+
return;
158+
}
159+
160+
json_value.SetDouble(value);
161161
}
162162

163163
bool isBoolean(const py::handle & obj)
164164
{
165165
return GetPythonObjectType(obj) == PythonObjectType::Bool;
166166
}
167167

168-
void writeBoolean(const py::handle & obj, rapidjson::Value & json_value)
168+
static void writeBoolean(const py::handle & obj, rapidjson::Value & json_value)
169169
{
170170
json_value.SetBool(py::cast<bool>(obj));
171171
}
@@ -175,9 +175,9 @@ bool isDecimal(const py::handle & obj)
175175
return GetPythonObjectType(obj) == PythonObjectType::Decimal;
176176
}
177177

178-
void writeDecimal(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
178+
static void writeDecimal(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
179179
{
180-
const auto & str = obj.cast<std::string>();
180+
String str = py::str(obj);
181181
json_value.SetString(str.data(), str.size(), allocator);
182182
}
183183

@@ -187,18 +187,84 @@ bool isString(const py::handle & obj)
187187
|| GetPythonObjectType(obj) == PythonObjectType::Bytes;
188188
}
189189

190-
void writeString(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
190+
/// Returns true when GetPythonObjectType classifies `obj` as PythonObjectType::ByteArray.
bool isByteArray(const py::handle & obj)
191+
{
192+
return GetPythonObjectType(obj) == PythonObjectType::ByteArray;
193+
}
194+
195+
/// Stores the contents of a Python bytearray in `json_value` as a JSON string.
/// The pointer and length are taken straight from the bytearray object and handed to
/// SetString together with `allocator`; no encoding check is performed in this function.
/// NOTE(review): assumes `obj` really is a bytearray (PyByteArray_GET_SIZE does no type
/// checking) — the visible caller dispatches on PythonObjectType::ByteArray.
static void writeByteArray(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
196+
{
197+
auto * ptr = obj.ptr();
198+
auto * data = PyByteArray_AsString(ptr);
199+
auto size = PyByteArray_GET_SIZE(ptr);
200+
json_value.SetString(data, size, allocator);
201+
}
202+
203+
/// Returns true when GetPythonObjectType classifies `obj` as PythonObjectType::MemoryView.
bool isMemoryView(const py::handle & obj)
204+
{
205+
return GetPythonObjectType(obj) == PythonObjectType::MemoryView;
206+
}
207+
208+
/// Stores the bytes exposed by a Python memoryview in `json_value` as a JSON string.
///
/// `obj` must be a memoryview; the cast below throws if it is not. The bytes are copied
/// via `allocator`, so the resulting value does not alias the Python buffer.
///
/// A memoryview is not necessarily contiguous: `memoryview(b)[::2]` is strided and
/// `memoryview(b)[::-1]` has a negative stride with `buf` pointing at the last element.
/// For such views `buf` + `len` does not describe the logical content (and reading it
/// forward can run past the underlying object), so the data is first gathered into a
/// contiguous temporary with PyBuffer_ToContiguous.
static void writeMemoryView(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
{
    /// Holds a strong reference for the duration of the copy and validates the type.
    const py::memoryview py_view = obj.cast<py::memoryview>();

    Py_buffer * py_buf = PyMemoryView_GET_BUFFER(py_view.ptr());
    const Py_ssize_t size = py_buf->len;

    /// Empty views serialize as an empty string; never hand rapidjson a null pointer.
    if (size <= 0 || py_buf->buf == nullptr)
    {
        json_value.SetString("", 0, allocator);
        return;
    }

    /// Fast path: the logical content is exactly `size` bytes starting at `buf`.
    if (PyBuffer_IsContiguous(py_buf, 'C'))
    {
        json_value.SetString(static_cast<const char *>(py_buf->buf), static_cast<rapidjson::SizeType>(size), allocator);
        return;
    }

    /// Strided / reversed / indirect layout: gather the elements in logical (C) order.
    std::string contiguous(static_cast<size_t>(size), '\0');
    if (PyBuffer_ToContiguous(contiguous.data(), py_buf, size, 'C') != 0)
        throw py::error_already_set();

    json_value.SetString(contiguous.data(), static_cast<rapidjson::SizeType>(contiguous.size()), allocator);
}
219+
220+
static void writeString(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
191221
{
192222
const auto & str = obj.cast<std::string>();
193223
json_value.SetString(str.data(), str.size(), allocator);
194224
}
195225

196-
void writeOthers(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
226+
static void writeOthers(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
197227
{
198228
String str = py::str(obj);
199229
json_value.SetString(str.data(), str.size(), allocator);
200230
}
201231

232+
/// Writes a single Python object into `json_value`, selecting the writer from the
/// PythonObjectType reported by GetPythonObjectType(obj).
/// String and Bytes share writeString; any type without an explicit case goes to
/// writeOthers. `allocator` is forwarded to the writers that take one.
/// Called from the final `else` branch of the conversion code in convert_to_json_str.
void handlePrimitiveTypes(
233+
const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator)
234+
{
235+
PythonObjectType type = GetPythonObjectType(obj);
236+
switch(type) {
237+
case PythonObjectType::Integer:
238+
writeInteger(obj, json_value);
239+
break;
240+
case PythonObjectType::Float:
241+
writeFloat(obj, json_value);
242+
break;
243+
case PythonObjectType::Bool:
244+
writeBoolean(obj, json_value);
245+
break;
246+
case PythonObjectType::Decimal:
247+
writeDecimal(obj, json_value, allocator);
248+
break;
249+
case PythonObjectType::String:
250+
case PythonObjectType::Bytes:
251+
writeString(obj, json_value, allocator);
252+
break;
253+
case PythonObjectType::ByteArray:
254+
writeByteArray(obj, json_value, allocator);
255+
break;
256+
case PythonObjectType::MemoryView:
257+
writeMemoryView(obj, json_value, allocator);
258+
break;
259+
case PythonObjectType::None:
260+
writeNone(obj, json_value);
261+
break;
262+
default:
263+
writeOthers(obj, json_value, allocator);
264+
break;
265+
}
266+
}
267+
202268
void convert_to_json_str(const py::handle & obj, String & ret)
203269
{
204270
rapidjson::Document d;
@@ -213,8 +279,8 @@ void convert_to_json_str(const py::handle & obj, String & ret)
213279
for (auto& item : py::cast<py::dict>(obj))
214280
{
215281
rapidjson::Value key;
216-
auto ket_str = py::str(item.first).cast<std::string>();
217-
key.SetString(ket_str.data(), ket_str.size(), allocator);
282+
auto key_str = py::str(item.first).cast<std::string>();
283+
key.SetString(key_str.data(), key_str.size(), allocator);
218284

219285
rapidjson::Value val;
220286
convert(item.second, val);
@@ -259,33 +325,9 @@ void convert_to_json_str(const py::handle & obj, String & ret)
259325
json_value.PushBack(element, allocator);
260326
}
261327
}
262-
else if (isInteger(obj))
263-
{
264-
writeInteger(obj, json_value);
265-
}
266-
else if (isNone(obj))
267-
{
268-
writeNone(obj, json_value);
269-
}
270-
else if (isFloat(obj))
271-
{
272-
writeFloat(obj, json_value);
273-
}
274-
else if (isBoolean(obj))
275-
{
276-
writeBoolean(obj, json_value);
277-
}
278-
else if (isDecimal(obj))
279-
{
280-
writeDecimal(obj, json_value, allocator);
281-
}
282-
else if (isString(obj))
283-
{
284-
writeString(obj, json_value, allocator);
285-
}
286328
else
287329
{
288-
writeOthers(obj, json_value, allocator);
330+
handlePrimitiveTypes(obj, json_value, allocator);
289331
}
290332
};
291333

programs/local/PythonConversion.h

Lines changed: 3 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -28,37 +28,26 @@ enum class PythonObjectType {
2828
Tuple,
2929
Dict,
3030
NdArray,
31-
NdDatetime,
32-
Value
31+
NdDatetime
3332
};
3433

3534
PythonObjectType GetPythonObjectType(const py::handle & obj);
3635

3736
bool isInteger(const py::handle & obj);
3837

39-
void writeInteger(const py::handle & obj, rapidjson::Value & json_value);
40-
4138
bool isNone(const py::handle & obj);
4239

43-
void writeNone(const py::handle & obj, rapidjson::Value & json_value);
44-
4540
bool isFloat(const py::handle & obj);
4641

47-
void writeFloat(const py::handle & obj, rapidjson::Value & json_value);
48-
4942
bool isBoolean(const py::handle & obj);
5043

51-
void writeBoolean(const py::handle & obj, rapidjson::Value & json_value);
52-
5344
bool isDecimal(const py::handle & obj);
5445

55-
void writeDecimal(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator);
56-
5746
bool isString(const py::handle & obj);
5847

59-
void writeString(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator);
48+
bool isByteArray(const py::handle & obj);
6049

61-
void writeOthers(const py::handle & obj, rapidjson::Value & json_value, rapidjson::Document::AllocatorType & allocator);
50+
bool isMemoryView(const py::handle & obj);
6251

6352
void convert_to_json_str(const py::handle & obj, String & ret);
6453

src/Core/Settings.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -968,7 +968,7 @@ class IColumn;
968968
\
969969
/** Experimental feature for moving data between shards. */ \
970970
M(Bool, allow_experimental_query_deduplication, false, "Experimental data deduplication for SELECT queries based on part UUIDs", 0) \
971-
M(UInt64, pandas_analyze_sample, 10000, "Sample rows in pandas to automatically determine the data types. When set to 0, sampling is disabled", 0) \
971+
M(Int64, pandas_analyze_sample, 10000, "Sample rows in pandas to automatically determine the data types. When set to 0, sampling is disabled", 0) \
972972

973973
/** End of experimental features */
974974

0 commit comments

Comments
 (0)