Skip to content

Commit 07a3636

Browse files
authored
[Json] Replace ArrayData with typed Array construction in json-reader (apache#9497)
# Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. --> - Part of apache#9298. # Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> While implementing `ListViewArrayDecoder` in arrow-json, I noticed we could potentially retire `ArrayDataBuilder` inside `ListArrayDecoder`. Therefore, I'd like to use a small PR here to make sure there's no regression # What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> Replace `ArrayDataBuilder` with `GenericListArray` in `ListArrayDecoder` # Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> Covered by existing tests # Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out. --> No
1 parent f5365c3 commit 07a3636

13 files changed

Lines changed: 187 additions & 158 deletions

arrow-json/src/reader/binary_array.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow_array::builder::{BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder};
19-
use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
20-
use arrow_data::ArrayData;
21-
use arrow_schema::ArrowError;
2218
use std::io::Write;
2319
use std::marker::PhantomData;
20+
use std::sync::Arc;
21+
22+
use arrow_array::builder::{BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder};
23+
use arrow_array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
24+
use arrow_schema::ArrowError;
2425

2526
use crate::reader::ArrayDecoder;
2627
use crate::reader::tape::{Tape, TapeElement};
@@ -87,7 +88,7 @@ pub struct BinaryArrayDecoder<O: OffsetSizeTrait> {
8788
}
8889

8990
impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
90-
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
91+
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
9192
let data_capacity = estimate_data_capacity(tape, pos)?;
9293

9394
if O::from_usize(data_capacity).is_none() {
@@ -113,7 +114,7 @@ impl<O: OffsetSizeTrait> ArrayDecoder for BinaryArrayDecoder<O> {
113114
}
114115
}
115116

116-
Ok(builder.finish().into_data())
117+
Ok(Arc::new(builder.finish()))
117118
}
118119
}
119120

@@ -129,7 +130,7 @@ impl FixedSizeBinaryArrayDecoder {
129130
}
130131

131132
impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
132-
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
133+
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
133134
let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), self.len);
134135
// Preallocate for the decoded byte width (FixedSizeBinary len), not the hex string length.
135136
let mut scratch = Vec::with_capacity(self.len as usize);
@@ -148,15 +149,15 @@ impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
148149
}
149150
}
150151

151-
Ok(builder.finish().into_data())
152+
Ok(Arc::new(builder.finish()))
152153
}
153154
}
154155

155156
#[derive(Default)]
156157
pub struct BinaryViewDecoder {}
157158

158159
impl ArrayDecoder for BinaryViewDecoder {
159-
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
160+
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
160161
let data_capacity = estimate_data_capacity(tape, pos)?;
161162
let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
162163
let mut scratch = Vec::new();
@@ -175,7 +176,7 @@ impl ArrayDecoder for BinaryViewDecoder {
175176
}
176177
}
177178

178-
Ok(builder.finish().into_data())
179+
Ok(Arc::new(builder.finish()))
179180
}
180181
}
181182

arrow-json/src/reader/boolean_array.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow_array::Array;
18+
use std::sync::Arc;
19+
20+
use arrow_array::ArrayRef;
1921
use arrow_array::builder::BooleanBuilder;
20-
use arrow_data::ArrayData;
2122
use arrow_schema::ArrowError;
2223

2324
use crate::reader::ArrayDecoder;
@@ -27,7 +28,7 @@ use crate::reader::tape::{Tape, TapeElement};
2728
pub struct BooleanArrayDecoder {}
2829

2930
impl ArrayDecoder for BooleanArrayDecoder {
30-
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
31+
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
3132
let mut builder = BooleanBuilder::with_capacity(pos.len());
3233
for p in pos {
3334
match tape.get(*p) {
@@ -38,6 +39,6 @@ impl ArrayDecoder for BooleanArrayDecoder {
3839
}
3940
}
4041

41-
Ok(builder.finish().into_data())
42+
Ok(Arc::new(builder.finish()))
4243
}
4344
}

arrow-json/src/reader/decimal_array.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616
// under the License.
1717

1818
use std::marker::PhantomData;
19+
use std::sync::Arc;
1920

20-
use arrow_array::Array;
21+
use arrow_array::ArrayRef;
2122
use arrow_array::builder::PrimitiveBuilder;
2223
use arrow_array::types::DecimalType;
2324
use arrow_cast::parse::parse_decimal;
24-
use arrow_data::ArrayData;
2525
use arrow_schema::ArrowError;
2626

2727
use crate::reader::ArrayDecoder;
@@ -48,7 +48,7 @@ impl<D> ArrayDecoder for DecimalArrayDecoder<D>
4848
where
4949
D: DecimalType,
5050
{
51-
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
51+
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
5252
let mut builder = PrimitiveBuilder::<D>::with_capacity(pos.len());
5353

5454
for p in pos {
@@ -94,9 +94,10 @@ where
9494
}
9595
}
9696

97-
Ok(builder
98-
.finish()
99-
.with_precision_and_scale(self.precision, self.scale)?
100-
.into_data())
97+
Ok(Arc::new(
98+
builder
99+
.finish()
100+
.with_precision_and_scale(self.precision, self.scale)?,
101+
))
101102
}
102103
}

arrow-json/src/reader/list_array.rs

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,24 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::marker::PhantomData;
19+
use std::sync::Arc;
20+
21+
use arrow_array::builder::BooleanBufferBuilder;
22+
use arrow_array::{ArrayRef, GenericListArray, OffsetSizeTrait, make_array};
23+
use arrow_buffer::buffer::NullBuffer;
24+
use arrow_buffer::{Buffer, OffsetBuffer, ScalarBuffer};
25+
use arrow_data::ArrayDataBuilder;
26+
use arrow_schema::{ArrowError, DataType, FieldRef};
27+
1828
use crate::reader::tape::{Tape, TapeElement};
1929
use crate::reader::{ArrayDecoder, DecoderContext};
20-
use arrow_array::OffsetSizeTrait;
21-
use arrow_array::builder::BooleanBufferBuilder;
22-
use arrow_buffer::{Buffer, buffer::NullBuffer};
23-
use arrow_data::{ArrayData, ArrayDataBuilder};
24-
use arrow_schema::{ArrowError, DataType};
25-
use std::marker::PhantomData;
2630

2731
pub type ListArrayDecoder<O> = ListLikeArrayDecoder<O, false>;
2832
pub type ListViewArrayDecoder<O> = ListLikeArrayDecoder<O, true>;
2933

3034
pub struct ListLikeArrayDecoder<O, const IS_VIEW: bool> {
31-
data_type: DataType,
35+
field: FieldRef,
3236
decoder: Box<dyn ArrayDecoder>,
3337
phantom: PhantomData<O>,
3438
is_nullable: bool,
@@ -50,7 +54,7 @@ impl<O: OffsetSizeTrait, const IS_VIEW: bool> ListLikeArrayDecoder<O, IS_VIEW> {
5054
let decoder = ctx.make_decoder(field.data_type(), field.is_nullable())?;
5155

5256
Ok(Self {
53-
data_type: data_type.clone(),
57+
field: field.clone(),
5458
decoder,
5559
phantom: Default::default(),
5660
is_nullable,
@@ -59,7 +63,7 @@ impl<O: OffsetSizeTrait, const IS_VIEW: bool> ListLikeArrayDecoder<O, IS_VIEW> {
5963
}
6064

6165
impl<O: OffsetSizeTrait, const IS_VIEW: bool> ArrayDecoder for ListLikeArrayDecoder<O, IS_VIEW> {
62-
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
66+
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayRef, ArrowError> {
6367
let mut child_pos = Vec::with_capacity(pos.len());
6468
let mut offsets = Vec::with_capacity(pos.len() + 1);
6569
offsets.push(O::from_usize(0).unwrap());
@@ -91,34 +95,42 @@ impl<O: OffsetSizeTrait, const IS_VIEW: bool> ArrayDecoder for ListLikeArrayDeco
9195
}
9296

9397
let offset = O::from_usize(child_pos.len()).ok_or_else(|| {
94-
ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type))
98+
ArrowError::JsonError(format!("offset overflow decoding {}ListArray", O::PREFIX))
9599
})?;
96100
offsets.push(offset);
97101
}
98102

99-
let child_data = self.decoder.decode(tape, &child_pos)?;
103+
let values = self.decoder.decode(tape, &child_pos)?;
100104
let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish()));
101105

102-
let mut data = ArrayDataBuilder::new(self.data_type.clone())
103-
.len(pos.len())
104-
.nulls(nulls)
105-
.child_data(vec![child_data]);
106-
107106
if IS_VIEW {
108107
let mut sizes = Vec::with_capacity(offsets.len() - 1);
109108
for i in 1..offsets.len() {
110109
sizes.push(offsets[i] - offsets[i - 1]);
111110
}
112111
offsets.pop();
113-
data = data
114-
.add_buffer(Buffer::from_vec(offsets))
115-
.add_buffer(Buffer::from_vec(sizes));
112+
let data_type = if O::IS_LARGE {
113+
DataType::LargeListView(self.field.clone())
114+
} else {
115+
DataType::ListView(self.field.clone())
116+
};
117+
// SAFETY: offsets and sizes are constructed correctly from the tape
118+
let array_data = unsafe {
119+
ArrayDataBuilder::new(data_type)
120+
.len(pos.len())
121+
.nulls(nulls)
122+
.child_data(vec![values.to_data()])
123+
.add_buffer(Buffer::from_vec(offsets))
124+
.add_buffer(Buffer::from_vec(sizes))
125+
.build_unchecked()
126+
};
127+
Ok(make_array(array_data))
116128
} else {
117-
data = data.add_buffer(Buffer::from_vec(offsets));
118-
}
129+
// SAFETY: offsets are built monotonically starting from 0
130+
let offsets = unsafe { OffsetBuffer::<O>::new_unchecked(ScalarBuffer::from(offsets)) };
119131

120-
// Safety
121-
// Validated lengths above
122-
Ok(unsafe { data.build_unchecked() })
132+
let array = GenericListArray::<O>::try_new(self.field.clone(), offsets, values, nulls)?;
133+
Ok(Arc::new(array))
134+
}
123135
}
124136
}

0 commit comments

Comments
 (0)