Skip to content

Commit 6e65287

Browse files
authored
Merge pull request #1 from alamb/variant-to-json
Merge + fix logical conflict of json writer
2 parents 31fb009 + 4b18f7f commit 6e65287

File tree

5 files changed

+405
-148
lines changed

5 files changed

+405
-148
lines changed

.github/pull_request_template.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax.
44

5-
Closes #NNN.
5+
- Closes #NNN.
66

77
# Rationale for this change
88

@@ -13,6 +13,14 @@ Explaining clearly why changes are proposed helps reviewers understand your chan
1313

1414
There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR.
1515

16+
# Are these changes tested?
17+
18+
We typically require tests for all PRs in order to:
19+
1. Prevent the code from being accidentally broken by subsequent changes
20+
2. Serve as another way to document the expected behavior of the code
21+
22+
If tests are not included in your PR, please explain why (for example, are they covered by existing tests)?
23+
1624
# Are there any user-facing changes?
1725

1826
If there are user-facing changes then we may require documentation to be updated before approving the PR.

parquet-variant/src/builder.rs

Lines changed: 104 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717
use crate::decoder::{VariantBasicType, VariantPrimitiveType};
18-
use crate::{ShortString, Variant};
19-
use std::collections::HashMap;
18+
use crate::{ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8};
19+
use std::collections::BTreeMap;
2020

2121
const BASIC_TYPE_BITS: u8 = 2;
2222
const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
@@ -166,15 +166,15 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usi
166166
///
167167
pub struct VariantBuilder {
168168
buffer: Vec<u8>,
169-
dict: HashMap<String, u32>,
169+
dict: BTreeMap<String, u32>,
170170
dict_keys: Vec<String>,
171171
}
172172

173173
impl VariantBuilder {
174174
pub fn new() -> Self {
175175
Self {
176176
buffer: Vec::new(),
177-
dict: HashMap::new(),
177+
dict: BTreeMap::new(),
178178
dict_keys: Vec::new(),
179179
}
180180
}
@@ -296,7 +296,7 @@ impl VariantBuilder {
296296

297297
/// Add key to dictionary, return its ID
298298
fn add_key(&mut self, key: &str) -> u32 {
299-
use std::collections::hash_map::Entry;
299+
use std::collections::btree_map::Entry;
300300
match self.dict.entry(key.to_string()) {
301301
Entry::Occupied(entry) => *entry.get(),
302302
Entry::Vacant(entry) => {
@@ -384,9 +384,15 @@ impl VariantBuilder {
384384
Variant::Date(v) => self.append_date(v),
385385
Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
386386
Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v),
387-
Variant::Decimal4 { integer, scale } => self.append_decimal4(integer, scale),
388-
Variant::Decimal8 { integer, scale } => self.append_decimal8(integer, scale),
389-
Variant::Decimal16 { integer, scale } => self.append_decimal16(integer, scale),
387+
Variant::Decimal4(VariantDecimal4 { integer, scale }) => {
388+
self.append_decimal4(integer, scale)
389+
}
390+
Variant::Decimal8(VariantDecimal8 { integer, scale }) => {
391+
self.append_decimal8(integer, scale)
392+
}
393+
Variant::Decimal16(VariantDecimal16 { integer, scale }) => {
394+
self.append_decimal16(integer, scale)
395+
}
390396
Variant::Float(v) => self.append_float(v),
391397
Variant::Double(v) => self.append_double(v),
392398
Variant::Binary(v) => self.append_binary(v),
@@ -482,7 +488,7 @@ impl<'a> ListBuilder<'a> {
482488
pub struct ObjectBuilder<'a> {
483489
parent: &'a mut VariantBuilder,
484490
start_pos: usize,
485-
fields: Vec<(u32, usize)>, // (field_id, offset)
491+
fields: BTreeMap<u32, usize>, // (field_id, offset)
486492
}
487493

488494
impl<'a> ObjectBuilder<'a> {
@@ -491,7 +497,7 @@ impl<'a> ObjectBuilder<'a> {
491497
Self {
492498
parent,
493499
start_pos,
494-
fields: Vec::new(),
500+
fields: BTreeMap::new(),
495501
}
496502
}
497503

@@ -500,25 +506,27 @@ impl<'a> ObjectBuilder<'a> {
500506
let id = self.parent.add_key(key);
501507
let field_start = self.parent.offset() - self.start_pos;
502508
self.parent.append_value(value);
503-
self.fields.push((id, field_start));
509+
let res = self.fields.insert(id, field_start);
510+
debug_assert!(res.is_none());
504511
}
505512

506513
/// Finalize object with sorted fields
507-
pub fn finish(mut self) {
508-
// Sort fields by key name
509-
self.fields.sort_by(|a, b| {
510-
let key_a = &self.parent.dict_keys[a.0 as usize];
511-
let key_b = &self.parent.dict_keys[b.0 as usize];
512-
key_a.cmp(key_b)
513-
});
514-
514+
pub fn finish(self) {
515515
let data_size = self.parent.offset() - self.start_pos;
516516
let num_fields = self.fields.len();
517517
let is_large = num_fields > u8::MAX as usize;
518518
let size_bytes = if is_large { 4 } else { 1 };
519519

520-
let max_id = self.fields.iter().map(|&(id, _)| id).max().unwrap_or(0);
521-
let id_size = int_size(max_id as usize);
520+
let field_ids_by_sorted_field_name = self
521+
.parent
522+
.dict
523+
.iter()
524+
.filter_map(|(_, id)| self.fields.contains_key(id).then_some(*id))
525+
.collect::<Vec<_>>();
526+
527+
let max_id = self.fields.keys().last().copied().unwrap_or(0) as usize;
528+
529+
let id_size = int_size(max_id);
522530
let offset_size = int_size(data_size);
523531

524532
let header_size = 1
@@ -542,17 +550,18 @@ impl<'a> ObjectBuilder<'a> {
542550
}
543551

544552
// Write field IDs (sorted order)
545-
for &(id, _) in &self.fields {
553+
for id in &field_ids_by_sorted_field_name {
546554
write_offset(
547555
&mut self.parent.buffer[pos..pos + id_size as usize],
548-
id as usize,
556+
*id as usize,
549557
id_size,
550558
);
551559
pos += id_size as usize;
552560
}
553561

554562
// Write field offsets
555-
for &(_, offset) in &self.fields {
563+
for id in &field_ids_by_sorted_field_name {
564+
let &offset = self.fields.get(id).unwrap();
556565
write_offset(
557566
&mut self.parent.buffer[pos..pos + offset_size as usize],
558567
offset,
@@ -749,6 +758,77 @@ mod tests {
749758
assert_eq!(field_ids, vec![1, 2, 0]);
750759
}
751760

761+
#[test]
762+
fn test_object_and_metadata_ordering() {
763+
let mut builder = VariantBuilder::new();
764+
765+
let mut obj = builder.new_object();
766+
767+
obj.append_value("zebra", "stripes"); // ID = 0
768+
obj.append_value("apple", "red"); // ID = 1
769+
770+
{
771+
// fields_map is ordered by insertion order (field id)
772+
let fields_map = obj.fields.keys().copied().collect::<Vec<_>>();
773+
assert_eq!(fields_map, vec![0, 1]);
774+
775+
// dict is ordered by field names
776+
// NOTE: when we support nested objects, we'll want to perform a filter by fields_map field ids
777+
let dict_metadata = obj
778+
.parent
779+
.dict
780+
.iter()
781+
.map(|(f, i)| (f.as_str(), *i))
782+
.collect::<Vec<_>>();
783+
784+
assert_eq!(dict_metadata, vec![("apple", 1), ("zebra", 0)]);
785+
786+
// dict_keys is ordered by insertion order (field id)
787+
let dict_keys = obj
788+
.parent
789+
.dict_keys
790+
.iter()
791+
.map(|k| k.as_str())
792+
.collect::<Vec<_>>();
793+
assert_eq!(dict_keys, vec!["zebra", "apple"]);
794+
}
795+
796+
obj.append_value("banana", "yellow"); // ID = 2
797+
798+
{
799+
// fields_map is ordered by insertion order (field id)
800+
let fields_map = obj.fields.keys().copied().collect::<Vec<_>>();
801+
assert_eq!(fields_map, vec![0, 1, 2]);
802+
803+
// dict is ordered by field names
804+
// NOTE: when we support nested objects, we'll want to perform a filter by fields_map field ids
805+
let dict_metadata = obj
806+
.parent
807+
.dict
808+
.iter()
809+
.map(|(f, i)| (f.as_str(), *i))
810+
.collect::<Vec<_>>();
811+
812+
assert_eq!(
813+
dict_metadata,
814+
vec![("apple", 1), ("banana", 2), ("zebra", 0)]
815+
);
816+
817+
// dict_keys is ordered by insertion order (field id)
818+
let dict_keys = obj
819+
.parent
820+
.dict_keys
821+
.iter()
822+
.map(|k| k.as_str())
823+
.collect::<Vec<_>>();
824+
assert_eq!(dict_keys, vec!["zebra", "apple", "banana"]);
825+
}
826+
827+
obj.finish();
828+
829+
builder.finish();
830+
}
831+
752832
#[test]
753833
fn test_append_object() {
754834
let (object_metadata, object_value) = {

0 commit comments

Comments
 (0)