Skip to content

Commit de9864a

Browse files
committed
fix(fuse): validate inline variant bytes
1 parent 3a4cbd4 commit de9864a

File tree

2 files changed

+33 additions, -26 deletions

src/query/storages/fuse/src/io/write/virtual_column_builder.rs

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -793,31 +793,29 @@ impl VirtualColumnBuilder {
793793

794794
for row in 0..num_rows {
795795
let mut decoded_jsonb: Option<Vec<u8>> = None;
796+
let mut validated_jsonb = false;
796797
for (idx, builder) in builders.iter_mut().enumerate() {
797798
let entry = block.get_by_offset(source_offsets[idx]);
798799
let value = unsafe { entry.index_unchecked(row) };
799800
let ScalarRef::Variant(jsonb_bytes) = value else {
800801
builder.push(ScalarRef::Null);
801802
continue;
802803
};
803-
let mut raw_jsonb = RawJsonb::new(jsonb_bytes);
804+
if !validated_jsonb {
805+
validated_jsonb = true;
806+
if jsonb::from_slice(jsonb_bytes).is_err() {
807+
if let Ok(jsonb) = parquet_variant_bytes_to_jsonb(jsonb_bytes) {
808+
decoded_jsonb = Some(jsonb);
809+
}
810+
}
811+
}
812+
let jsonb_bytes = decoded_jsonb.as_deref().unwrap_or(jsonb_bytes);
813+
let raw_jsonb = RawJsonb::new(jsonb_bytes);
804814
let path = &borrowed_paths[idx];
805815
let raw_value = match raw_jsonb.get_by_keypath(path.paths.iter()) {
806816
Ok(Some(value)) => Some(value),
807817
Ok(None) => None,
808-
Err(_) => {
809-
if decoded_jsonb.is_none() {
810-
if let Ok(jsonb) = parquet_variant_bytes_to_jsonb(jsonb_bytes) {
811-
decoded_jsonb = Some(jsonb);
812-
}
813-
}
814-
if let Some(ref jsonb) = decoded_jsonb {
815-
raw_jsonb = RawJsonb::new(jsonb);
816-
raw_jsonb.get_by_keypath(path.paths.iter()).ok().flatten()
817-
} else {
818-
None
819-
}
820-
}
818+
Err(_) => None,
821819
};
822820
let Some(raw_value) = raw_value else {
823821
builder.push(ScalarRef::Null);

tests/sqllogictests/suites/ee/01_ee_system/01_0002_virtual_column.test

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -143,29 +143,29 @@ test_virtual_column t2 val 3000000002 ['c'] String
143143
query II
144144
select row_count, virtual_column_size from fuse_block('test_virtual_column', 't2')
145145
----
146-
3 1032
146+
3 809
147147

148148
query III
149149
select block_count, row_count, virtual_column_size from fuse_segment('test_virtual_column', 't2');
150150
----
151-
1 3 1032
151+
1 3 809
152152

153153
query III
154154
select block_count, row_count, virtual_column_size from fuse_snapshot('test_virtual_column', 't2');
155155
----
156-
1 3 1032
156+
1 3 809
157157

158158
query IITTIII
159159
select virtual_block_size, row_count, column_name, column_type, column_id, block_offset, bytes_compressed from fuse_virtual_column('test_virtual_column', 't2')
160160
----
161-
1032 3 val['a'] UInt64 NULL 0 4 40
162-
1032 3 val['b'] UInt64 NULL 1 44 40
163-
1032 3 val['c'] String NULL 2 84 40
161+
809 3 val['a'] UInt64 NULL 3000000000 4 40
162+
809 3 val['b'] UInt64 NULL 3000000001 44 40
163+
809 3 val['c'] String NULL 3000000002 84 40
164164

165165
query IIIIII
166166
select block_count, row_count, bytes_uncompressed, bytes_compressed, index_size, virtual_block_count from fuse_segment('test_virtual_column', 't2')
167167
----
168-
1 3 134 734 1470 1
168+
1 3 134 734 1247 1
169169

170170
statement ok
171171
insert into t2 values(4, '{"a":44,"b":4,"c":"value"}'), (5, '{"a":55,"b":5,"c":"bend"}'), (6, '6')
@@ -343,6 +343,8 @@ test_virtual_column tweets data['create'] 10 10 10 0 1/08 6/07 16 (empty)
343343
test_virtual_column tweets data['id'] 10 10 10 0 1 10 8 (empty)
344344
test_virtual_column tweets data['likes'] 10 10 2 0 10 25 1 (empty)
345345
test_virtual_column tweets data['replies'] 10 10 7 3 0 9 8 (empty)
346+
test_virtual_column tweets data['tags'][0] 10 10 2 0 good new 3 (empty)
347+
test_virtual_column tweets data['tags'][1] 10 10 2 0 interesting popular 3 (empty)
346348
test_virtual_column tweets data['text'] 10 10 7 0 a z 16 (empty)
347349
test_virtual_column tweets data['user']['id'] 10 10 6 0 1 7 8 (empty)
348350
test_virtual_column tweets id 10 10 10 0 1 10 4 (empty)
@@ -360,6 +362,8 @@ data['replies'] 7 3 8 (empty)
360362
data['text'] 8 0 16 (empty)
361363
data['user']['id'] 6 0 8 (empty)
362364
data['likes'] 2 0 1 (empty)
365+
data['tags'][0] 2 0 3 (empty)
366+
data['tags'][1] 2 0 3 (empty)
363367

364368
query TTTIIIITTIT
365369
SHOW STATISTICS FROM TABLE test_virtual_column.tweets;
@@ -369,6 +373,8 @@ test_virtual_column tweets data['create'] 10 10 10 0 1/08 6/07 16 (empty)
369373
test_virtual_column tweets data['id'] 10 10 10 0 1 10 8 (empty)
370374
test_virtual_column tweets data['likes'] 10 10 2 0 10 25 1 (empty)
371375
test_virtual_column tweets data['replies'] 10 10 7 3 0 10 8 (empty)
376+
test_virtual_column tweets data['tags'][0] 10 10 2 0 good new 3 (empty)
377+
test_virtual_column tweets data['tags'][1] 10 10 2 0 interesting popular 3 (empty)
372378
test_virtual_column tweets data['text'] 10 10 8 0 a z 16 (empty)
373379
test_virtual_column tweets data['user']['id'] 10 10 6 0 1 7 8 (empty)
374380
test_virtual_column tweets id 10 10 10 0 1 10 4 (empty)
@@ -493,7 +499,7 @@ select id, data['id'], data['create'], data['text'], data['user']['id'], data['r
493499
query I
494500
SELECT count(*) FROM fuse_virtual_column('test_virtual_column', 'tweets');
495501
----
496-
27
502+
26
497503

498504
statement ok
499505
set enable_experimental_virtual_column = 0;
@@ -504,7 +510,7 @@ INSERT INTO tweets FROM (SELECT * FROM tweets);
504510
query I
505511
SELECT count(*) FROM fuse_virtual_column('test_virtual_column', 'tweets');
506512
----
507-
27
513+
26
508514

509515
statement error
510516
REFRESH VIRTUAL COLUMN FOR tweets;
@@ -522,7 +528,7 @@ optimize table test_virtual_column.tweets compact
522528
query I
523529
SELECT count(*) FROM fuse_virtual_column('test_virtual_column', 'tweets');
524530
----
525-
8
531+
6
526532

527533
query IITTIITFIT
528534
select id, data['id'], data['create'], data['text'], data['user']['id'], data['replies'], data['geo'], data['geo']['lat'], data['likes'], data['tags'] from tweets order by id;
@@ -705,8 +711,11 @@ test_virtual_column test_stream data1 3000000000 ['address']['city'] String
705711
test_virtual_column test_stream data1 3000000001 ['address']['district'] String
706712
test_virtual_column test_stream data1 3000000002 ['age'] UInt64
707713
test_virtual_column test_stream data1 3000000003 ['name'] String
708-
test_virtual_column test_stream data2 3000000004 ['hobbies'] Jsonb
709-
test_virtual_column test_stream data2 3000000005 ['scores'] Jsonb
714+
test_virtual_column test_stream data2 3000000004 ['hobbies'][0] String
715+
test_virtual_column test_stream data2 3000000005 ['hobbies'][1] String
716+
test_virtual_column test_stream data2 3000000006 ['scores'][0] UInt64
717+
test_virtual_column test_stream data2 3000000007 ['scores'][1] UInt64
718+
test_virtual_column test_stream data2 3000000008 ['scores'][2] UInt64
710719

711720
query ITTTTT
712721
SELECT

0 commit comments

Comments
 (0)