Skip to content

Commit 6fb176d

Browse files
committed
feat: Nested Type String Support Double Quoting
1 parent: 4d31a3b · commit: 6fb176d

File tree

5 files changed, with 98 additions and 30 deletions (+98 -30 lines changed)

5 files changed, with 98 additions and 30 deletions (+98 -30 lines changed)

sql/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,5 +35,5 @@ memchr = "2.7"
3535
roaring = { version = "0.10.12", features = ["serde"] }
3636
jiff = { workspace = true }
3737
serde = { version = "1.0", default-features = false, features = ["derive"] }
38-
serde_json = { version = "1.0", default-features = false, features = ["std"] }
38+
serde_json = { version = "1.0", default-features = false, features = ["std", "raw_value"] }
3939
url = { version = "2.5", default-features = false }

sql/src/rows.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -194,7 +194,7 @@ impl TryFrom<(RecordBatch, ResultFormatSettings)> for Rows {
194194
for j in 0..batch_schema.fields().len() {
195195
let v = batch.column(j);
196196
let field = batch_schema.field(j);
197-
let value = Value::try_from((field, v, i, &settings))?;
197+
let value = Value::try_from((field, v, i, false, &settings))?;
198198
values.push(value);
199199
}
200200
rows.push(Row::new(schema.clone(), values));

sql/src/value/arrow_decoder.rs

Lines changed: 33 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -79,15 +79,17 @@ impl
7979
&ArrowField,
8080
&Arc<dyn ArrowArray>,
8181
usize,
82+
bool,
8283
&ResultFormatSettings,
8384
)> for Value
8485
{
8586
type Error = Error;
8687
fn try_from(
87-
(field, array, seq, settings): (
88+
(field, array, seq, in_nested, settings): (
8889
&ArrowField,
8990
&Arc<dyn ArrowArray>,
9091
usize,
92+
bool,
9193
&ResultFormatSettings,
9294
),
9395
) -> std::result::Result<Self, Self::Error> {
@@ -344,15 +346,36 @@ impl
344346
}
345347
}
346348
ArrowDataType::Utf8 => match array.as_any().downcast_ref::<StringArray>() {
347-
Some(array) => Ok(Value::String(array.value(seq).to_string())),
349+
Some(array) => {
350+
let val = if in_nested {
351+
array.value(seq).replace('"', "\\\"").to_string()
352+
} else {
353+
array.value(seq).to_string()
354+
};
355+
Ok(Value::String(val))
356+
}
348357
None => Err(ConvertError::new("string", format!("{array:?}")).into()),
349358
},
350359
ArrowDataType::LargeUtf8 => match array.as_any().downcast_ref::<LargeStringArray>() {
351-
Some(array) => Ok(Value::String(array.value(seq).to_string())),
360+
Some(array) => {
361+
let val = if in_nested {
362+
array.value(seq).replace('"', "\\\"").to_string()
363+
} else {
364+
array.value(seq).to_string()
365+
};
366+
Ok(Value::String(val))
367+
}
352368
None => Err(ConvertError::new("large string", format!("{array:?}")).into()),
353369
},
354370
ArrowDataType::Utf8View => match array.as_any().downcast_ref::<StringViewArray>() {
355-
Some(array) => Ok(Value::String(array.value(seq).to_string())),
371+
Some(array) => {
372+
let val = if in_nested {
373+
array.value(seq).replace('"', "\\\"").to_string()
374+
} else {
375+
array.value(seq).to_string()
376+
};
377+
Ok(Value::String(val))
378+
}
356379
None => Err(ConvertError::new("string view", format!("{array:?}")).into()),
357380
},
358381
// we only support timestamp in microsecond in databend
@@ -392,7 +415,7 @@ impl
392415
let inner_array = unsafe { array.value_unchecked(seq) };
393416
let mut values = Vec::with_capacity(inner_array.len());
394417
for i in 0..inner_array.len() {
395-
let value = Value::try_from((f.as_ref(), &inner_array, i, settings))?;
418+
let value = Value::try_from((f.as_ref(), &inner_array, i, true, settings))?;
396419
values.push(value);
397420
}
398421
Ok(Value::Array(values))
@@ -404,7 +427,7 @@ impl
404427
let inner_array = unsafe { array.value_unchecked(seq) };
405428
let mut values = Vec::with_capacity(inner_array.len());
406429
for i in 0..inner_array.len() {
407-
let value = Value::try_from((f.as_ref(), &inner_array, i, settings))?;
430+
let value = Value::try_from((f.as_ref(), &inner_array, i, true, settings))?;
408431
values.push(value);
409432
}
410433
Ok(Value::Array(values))
@@ -421,12 +444,14 @@ impl
421444
fs[0].as_ref(),
422445
inner_array.column(0),
423446
i,
447+
true,
424448
settings,
425449
))?;
426450
let val = Value::try_from((
427451
fs[1].as_ref(),
428452
inner_array.column(1),
429453
i,
454+
true,
430455
settings,
431456
))?;
432457
values.push((key, val));
@@ -445,7 +470,8 @@ impl
445470
Some(array) => {
446471
let mut values = Vec::with_capacity(array.len());
447472
for (f, inner_array) in fs.iter().zip(array.columns().iter()) {
448-
let value = Value::try_from((f.as_ref(), inner_array, seq, settings))?;
473+
let value =
474+
Value::try_from((f.as_ref(), inner_array, seq, true, settings))?;
449475
values.push(value);
450476
}
451477
Ok(Value::Tuple(values))

sql/src/value/format/display.rs

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -64,32 +64,37 @@ impl Value {
6464
}
6565
Value::Number(n) => write!(f, "{n}"),
6666
Value::Binary(s) => write!(f, "{}", hex::encode_upper(s)),
67-
Value::String(s)
68-
| Value::Bitmap(s)
69-
| Value::Variant(s)
70-
| Value::Interval(s)
71-
| Value::Geometry(s)
72-
| Value::Geography(s) => {
67+
Value::String(s) | Value::Bitmap(s) | Value::Interval(s) => {
7368
if raw {
7469
write!(f, "{s}")
7570
} else {
76-
write!(f, "'{s}'")
71+
write!(f, "\"{s}\"")
72+
}
73+
}
74+
Value::Variant(s) => {
75+
write!(f, "{s}")
76+
}
77+
Value::Geometry(s) | Value::Geography(s) => {
78+
if raw || s.starts_with('{') {
79+
write!(f, "{s}")
80+
} else {
81+
write!(f, "\"{s}\"")
7782
}
7883
}
7984
Value::Timestamp(dt) => {
8085
let formatted = dt.strftime(TIMESTAMP_FORMAT);
8186
if raw {
8287
write!(f, "{formatted}")
8388
} else {
84-
write!(f, "'{formatted}'")
89+
write!(f, "\"{formatted}\"")
8590
}
8691
}
8792
Value::TimestampTz(dt) => {
8893
let formatted = dt.strftime(TIMESTAMP_TIMEZONE_FORMAT);
8994
if raw {
9095
write!(f, "{formatted}")
9196
} else {
92-
write!(f, "'{formatted}'")
97+
write!(f, "\"{formatted}\"")
9398
}
9499
}
95100
Value::Date(i) => {
@@ -98,7 +103,7 @@ impl Value {
98103
if raw {
99104
write!(f, "{d}")
100105
} else {
101-
write!(f, "'{d}'")
106+
write!(f, "\"{d}\"")
102107
}
103108
}
104109
Value::Array(vals) => {

sql/src/value/string_decoder.rs

Lines changed: 48 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@ use databend_client::schema::{DataType, DecimalDataType, DecimalSize, NumberData
2424
use ethnum::i256;
2525
use hex;
2626
use jiff::{civil::DateTime as JiffDateTime, tz::TimeZone, Zoned};
27+
use serde::Deserialize;
28+
use serde_json::{value::RawValue, Deserializer};
2729
use std::io::{BufRead, Cursor};
2830
use std::str::FromStr;
2931

@@ -281,7 +283,9 @@ impl ValueDecoder {
281283

282284
fn read_string<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
283285
let mut buf = Vec::new();
284-
reader.read_quoted_text(&mut buf, b'\'')?;
286+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
287+
reader.read_quoted_text(&mut buf, b'\'')?;
288+
}
285289
Ok(Value::String(unsafe { String::from_utf8_unchecked(buf) }))
286290
}
287291

@@ -295,54 +299,78 @@ impl ValueDecoder {
295299

296300
fn read_date<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
297301
let mut buf = Vec::new();
298-
reader.read_quoted_text(&mut buf, b'\'')?;
302+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
303+
reader.read_quoted_text(&mut buf, b'\'')?;
304+
}
299305
let v = unsafe { std::str::from_utf8_unchecked(&buf) };
300306
let days = NaiveDate::parse_from_str(v, "%Y-%m-%d")?.num_days_from_ce() - DAYS_FROM_CE;
301307
Ok(Value::Date(days))
302308
}
303309

304310
fn read_timestamp<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
305311
let mut buf = Vec::new();
306-
reader.read_quoted_text(&mut buf, b'\'')?;
312+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
313+
reader.read_quoted_text(&mut buf, b'\'')?;
314+
}
307315
let v = unsafe { std::str::from_utf8_unchecked(&buf) };
308316
parse_timestamp(v, &self.timezone)
309317
}
310318

311319
fn read_timestamp_tz<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
312320
let mut buf = Vec::new();
313-
reader.read_quoted_text(&mut buf, b'\'')?;
321+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
322+
reader.read_quoted_text(&mut buf, b'\'')?;
323+
}
314324
let v = unsafe { std::str::from_utf8_unchecked(&buf) };
315325
let t = Zoned::strptime(TIMESTAMP_TIMEZONE_FORMAT, v)?;
316326
Ok(Value::TimestampTz(t))
317327
}
318328

319329
fn read_interval<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
320330
let mut buf = Vec::new();
321-
reader.read_quoted_text(&mut buf, b'\'')?;
331+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
332+
reader.read_quoted_text(&mut buf, b'\'')?;
333+
}
322334
Ok(Value::Interval(unsafe { String::from_utf8_unchecked(buf) }))
323335
}
324336

325337
fn read_bitmap<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
326338
let mut buf = Vec::new();
327-
reader.read_quoted_text(&mut buf, b'\'')?;
339+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
340+
reader.read_quoted_text(&mut buf, b'\'')?;
341+
}
328342
Ok(Value::Bitmap(unsafe { String::from_utf8_unchecked(buf) }))
329343
}
330344

331345
fn read_variant<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
332-
let mut buf = Vec::new();
333-
reader.read_quoted_text(&mut buf, b'\'')?;
334-
Ok(Value::Variant(unsafe { String::from_utf8_unchecked(buf) }))
346+
if let Ok(val) = self.read_json(reader) {
347+
Ok(Value::Variant(val))
348+
} else {
349+
let mut buf = Vec::new();
350+
reader.read_quoted_text(&mut buf, b'\'')?;
351+
Ok(Value::Variant(unsafe { String::from_utf8_unchecked(buf) }))
352+
}
335353
}
336354

337355
fn read_geometry<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
338356
let mut buf = Vec::new();
339-
reader.read_quoted_text(&mut buf, b'\'')?;
357+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
358+
if let Ok(val) = self.read_json(reader) {
359+
return Ok(Value::Variant(val));
360+
}
361+
reader.read_quoted_text(&mut buf, b'\'')?;
362+
}
340363
Ok(Value::Geometry(unsafe { String::from_utf8_unchecked(buf) }))
341364
}
342365

343366
fn read_geography<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
344367
let mut buf = Vec::new();
345-
reader.read_quoted_text(&mut buf, b'\'')?;
368+
if reader.read_quoted_text(&mut buf, b'"').is_err() {
369+
if let Ok(val) = self.read_json(reader) {
370+
return Ok(Value::Variant(val));
371+
}
372+
reader.read_quoted_text(&mut buf, b'\'')?;
373+
}
346374
Ok(Value::Geography(unsafe {
347375
String::from_utf8_unchecked(buf)
348376
}))
@@ -457,6 +485,15 @@ impl ValueDecoder {
457485
reader.must_ignore_byte(b')')?;
458486
Ok(Value::Tuple(vals))
459487
}
488+
489+
fn read_json<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<String> {
490+
let start = reader.position() as usize;
491+
let data = reader.get_ref().as_ref();
492+
let mut deserializer = Deserializer::from_slice(&data[start..]);
493+
let raw: Box<RawValue> = Box::<RawValue>::deserialize(&mut deserializer)?;
494+
reader.set_position((start + raw.get().len()) as u64);
495+
Ok(raw.to_string())
496+
}
460497
}
461498

462499
fn parse_timestamp(ts_string: &str, tz: &TimeZone) -> Result<Value> {

0 commit comments

Comments
 (0)