Skip to content

Commit 71ee9d9

Browse files
[Variant] Implement read support for remaining primitive types (#7644)
# Which issue does this PR close? Closes #7630. # What changes are included in this PR? This PR implements support for the following primitive variant types: - Binary - Date - TimestampMicros - TimestampNtzMicros - Int16 - Int32 - Int64 - Decimal4 - Decimal8 - Decimal16 - Float - Double The following types are not yet implemented(see [here](https://github.com/apache/parquet-testing/blob/b68bea40fed8d1a780a9e09dd2262017e04b19ad/variant/regen.py#L78-L83) for details): - TimeNTZ - TimestampNanos - TimestampNtzNanos - UUID # Are there any user-facing changes? Users who opt-in to the Variant feature can use these primitives.
1 parent e32f545 commit 71ee9d9

File tree

4 files changed

+904
-84
lines changed

4 files changed

+904
-84
lines changed

parquet-variant/Cargo.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ rust-version = { workspace = true }
3232

3333
[dependencies]
3434
arrow-schema = "55.1.0"
35+
chrono = { workspace = true }
3536

3637
[lib]
37-
38-

parquet-variant/src/decoder.rs

Lines changed: 251 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717
use arrow_schema::ArrowError;
18+
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
1819
use std::array::TryFromSliceError;
1920

20-
use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice};
21+
use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};
2122

2223
#[derive(Debug, Clone, Copy)]
2324
pub enum VariantBasicType {
@@ -33,7 +34,18 @@ pub enum VariantPrimitiveType {
3334
BooleanTrue = 1,
3435
BooleanFalse = 2,
3536
Int8 = 3,
36-
// TODO: Add types for the rest of primitives, once API is agreed upon
37+
Int16 = 4,
38+
Int32 = 5,
39+
Int64 = 6,
40+
Double = 7,
41+
Decimal4 = 8,
42+
Decimal8 = 9,
43+
Decimal16 = 10,
44+
Date = 11,
45+
TimestampMicros = 12,
46+
TimestampNtzMicros = 13,
47+
Float = 14,
48+
Binary = 15,
3749
String = 16,
3850
}
3951

@@ -64,7 +76,18 @@ impl TryFrom<u8> for VariantPrimitiveType {
6476
1 => Ok(VariantPrimitiveType::BooleanTrue),
6577
2 => Ok(VariantPrimitiveType::BooleanFalse),
6678
3 => Ok(VariantPrimitiveType::Int8),
67-
// TODO: Add types for the rest, once API is agreed upon
79+
4 => Ok(VariantPrimitiveType::Int16),
80+
5 => Ok(VariantPrimitiveType::Int32),
81+
6 => Ok(VariantPrimitiveType::Int64),
82+
7 => Ok(VariantPrimitiveType::Double),
83+
8 => Ok(VariantPrimitiveType::Decimal4),
84+
9 => Ok(VariantPrimitiveType::Decimal8),
85+
10 => Ok(VariantPrimitiveType::Decimal16),
86+
11 => Ok(VariantPrimitiveType::Date),
87+
12 => Ok(VariantPrimitiveType::TimestampMicros),
88+
13 => Ok(VariantPrimitiveType::TimestampNtzMicros),
89+
14 => Ok(VariantPrimitiveType::Float),
90+
15 => Ok(VariantPrimitiveType::Binary),
6891
16 => Ok(VariantPrimitiveType::String),
6992
_ => Err(ArrowError::InvalidArgumentError(format!(
7093
"unknown primitive type: {}",
@@ -73,10 +96,10 @@ impl TryFrom<u8> for VariantPrimitiveType {
7396
}
7497
}
7598
}
76-
/// Extract the primitive type from a Variant value-header byte
77-
pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> {
99+
/// Extract the primitive type from a Variant value-metadata byte
100+
pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
78101
// last 6 bits contain the primitive-type, see spec
79-
VariantPrimitiveType::try_from(header >> 2)
102+
VariantPrimitiveType::try_from(metadata >> 2)
80103
}
81104

82105
/// To be used in `map_err` when unpacking an integer from a slice of bytes.
@@ -85,23 +108,103 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
85108
}
86109

87110
/// Decodes an Int8 from the value section of a variant.
88-
pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> {
89-
let value = i8::from_le_bytes(array_from_slice(value, 1)?);
111+
pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> {
112+
Ok(i8::from_le_bytes(array_from_slice(data, 0)?))
113+
}
114+
115+
/// Decodes an Int16 from the value section of a variant.
116+
pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> {
117+
Ok(i16::from_le_bytes(array_from_slice(data, 0)?))
118+
}
119+
120+
/// Decodes an Int32 from the value section of a variant.
121+
pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> {
122+
Ok(i32::from_le_bytes(array_from_slice(data, 0)?))
123+
}
124+
125+
/// Decodes an Int64 from the value section of a variant.
126+
pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> {
127+
Ok(i64::from_le_bytes(array_from_slice(data, 0)?))
128+
}
129+
130+
/// Decodes a Decimal4 from the value section of a variant.
131+
pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
132+
let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
133+
let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
134+
Ok((integer, scale))
135+
}
136+
137+
/// Decodes a Decimal8 from the value section of a variant.
138+
pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
139+
let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
140+
let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
141+
Ok((integer, scale))
142+
}
143+
144+
/// Decodes a Decimal16 from the value section of a variant.
145+
pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
146+
let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
147+
let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
148+
Ok((integer, scale))
149+
}
150+
151+
/// Decodes a Float from the value section of a variant.
152+
pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
153+
Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
154+
}
155+
156+
/// Decodes a Double from the value section of a variant.
157+
pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
158+
Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
159+
}
160+
161+
/// Decodes a Date from the value section of a variant.
162+
pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
163+
let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
164+
let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch));
165+
Ok(value.date_naive())
166+
}
167+
168+
/// Decodes a TimestampMicros from the value section of a variant.
169+
pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
170+
let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
171+
DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
172+
ArrowError::CastError(format!(
173+
"Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>"
174+
))
175+
})
176+
}
177+
178+
/// Decodes a TimestampNtzMicros from the value section of a variant.
179+
pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> {
180+
let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
181+
DateTime::from_timestamp_micros(micros_since_epoch)
182+
.ok_or_else(|| {
183+
ArrowError::CastError(format!(
184+
"Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime"
185+
))
186+
})
187+
.map(|v| v.naive_utc())
188+
}
189+
190+
/// Decodes a Binary from the value section of a variant.
191+
pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
192+
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
193+
let value = slice_from_slice(data, 4..4 + len)?;
90194
Ok(value)
91195
}
92196

93197
/// Decodes a long string from the value section of a variant.
94-
pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> {
95-
let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize;
96-
let string = string_from_slice(value, 5..5 + len)?;
198+
pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
199+
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
200+
let string = string_from_slice(data, 4..4 + len)?;
97201
Ok(string)
98202
}
99203

100204
/// Decodes a short string from the value section of a variant.
101-
pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> {
102-
let len = (first_byte_from_slice(value)? >> 2) as usize;
103-
104-
let string = string_from_slice(value, 1..1 + len)?;
205+
pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> {
206+
let len = (metadata >> 2) as usize;
207+
let string = string_from_slice(data, 0..len)?;
105208
Ok(string)
106209
}
107210

@@ -111,47 +214,152 @@ mod tests {
111214

112215
#[test]
113216
fn test_i8() -> Result<(), ArrowError> {
114-
let value = [
115-
3 << 2, // Primitive type for i8
116-
42,
117-
];
118-
let result = decode_int8(&value)?;
217+
let data = [0x2a];
218+
let result = decode_int8(&data)?;
119219
assert_eq!(result, 42);
120220
Ok(())
121221
}
122222

123223
#[test]
124-
fn test_short_string() -> Result<(), ArrowError> {
125-
let value = [
126-
1 | 5 << 2, // Basic type for short string | length of short string
127-
b'H',
128-
b'e',
129-
b'l',
130-
b'l',
131-
b'o',
132-
b'o',
224+
fn test_i16() -> Result<(), ArrowError> {
225+
let data = [0xd2, 0x04];
226+
let result = decode_int16(&data)?;
227+
assert_eq!(result, 1234);
228+
Ok(())
229+
}
230+
231+
#[test]
232+
fn test_i32() -> Result<(), ArrowError> {
233+
let data = [0x40, 0xe2, 0x01, 0x00];
234+
let result = decode_int32(&data)?;
235+
assert_eq!(result, 123456);
236+
Ok(())
237+
}
238+
239+
#[test]
240+
fn test_i64() -> Result<(), ArrowError> {
241+
let data = [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11];
242+
let result = decode_int64(&data)?;
243+
assert_eq!(result, 1234567890123456789);
244+
Ok(())
245+
}
246+
247+
#[test]
248+
fn test_decimal4() -> Result<(), ArrowError> {
249+
let data = [
250+
0x02, // Scale
251+
0xd2, 0x04, 0x00, 0x00, // Integer
133252
];
134-
let result = decode_short_string(&value)?;
253+
let result = decode_decimal4(&data)?;
254+
assert_eq!(result, (1234, 2));
255+
Ok(())
256+
}
257+
258+
#[test]
259+
fn test_decimal8() -> Result<(), ArrowError> {
260+
let data = [
261+
0x02, // Scale
262+
0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // Integer
263+
];
264+
let result = decode_decimal8(&data)?;
265+
assert_eq!(result, (1234567890, 2));
266+
Ok(())
267+
}
268+
269+
#[test]
270+
fn test_decimal16() -> Result<(), ArrowError> {
271+
let data = [
272+
0x02, // Scale
273+
0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
274+
0x00, 0x00, // Integer
275+
];
276+
let result = decode_decimal16(&data)?;
277+
assert_eq!(result, (1234567891234567890, 2));
278+
Ok(())
279+
}
280+
281+
#[test]
282+
fn test_float() -> Result<(), ArrowError> {
283+
let data = [0x06, 0x2c, 0x93, 0x4e];
284+
let result = decode_float(&data)?;
285+
assert_eq!(result, 1234567890.1234);
286+
Ok(())
287+
}
288+
289+
#[test]
290+
fn test_double() -> Result<(), ArrowError> {
291+
let data = [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41];
292+
let result = decode_double(&data)?;
293+
assert_eq!(result, 1234567890.1234);
294+
Ok(())
295+
}
296+
297+
#[test]
298+
fn test_date() -> Result<(), ArrowError> {
299+
let data = [0xe2, 0x4e, 0x0, 0x0];
300+
let result = decode_date(&data)?;
301+
assert_eq!(result, NaiveDate::from_ymd_opt(2025, 4, 16).unwrap());
302+
Ok(())
303+
}
304+
305+
#[test]
306+
fn test_timestamp_micros() -> Result<(), ArrowError> {
307+
let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00];
308+
let result = decode_timestamp_micros(&data)?;
309+
assert_eq!(
310+
result,
311+
NaiveDate::from_ymd_opt(2025, 4, 16)
312+
.unwrap()
313+
.and_hms_milli_opt(16, 34, 56, 780)
314+
.unwrap()
315+
.and_utc()
316+
);
317+
Ok(())
318+
}
319+
320+
#[test]
321+
fn test_timestampntz_micros() -> Result<(), ArrowError> {
322+
let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00];
323+
let result = decode_timestampntz_micros(&data)?;
324+
assert_eq!(
325+
result,
326+
NaiveDate::from_ymd_opt(2025, 4, 16)
327+
.unwrap()
328+
.and_hms_milli_opt(16, 34, 56, 780)
329+
.unwrap()
330+
);
331+
Ok(())
332+
}
333+
334+
#[test]
335+
fn test_binary() -> Result<(), ArrowError> {
336+
let data = [
337+
0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian
338+
0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe,
339+
];
340+
let result = decode_binary(&data)?;
341+
assert_eq!(
342+
result,
343+
[0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe]
344+
);
345+
Ok(())
346+
}
347+
348+
#[test]
349+
fn test_short_string() -> Result<(), ArrowError> {
350+
let data = [b'H', b'e', b'l', b'l', b'o', b'o'];
351+
let result = decode_short_string(1 | 5 << 2, &data)?;
135352
assert_eq!(result, "Hello");
136353
Ok(())
137354
}
138355

139356
#[test]
140357
fn test_string() -> Result<(), ArrowError> {
141-
let value = [
142-
16 << 2, // Basic type for short string | length of short string
143-
5,
144-
0,
145-
0,
146-
0, // Length of string
147-
b'H',
148-
b'e',
149-
b'l',
150-
b'l',
151-
b'o',
152-
b'o',
358+
let data = [
359+
0x05, 0, 0, 0, // Length of string, 4-byte little-endian
360+
b'H', b'e', b'l', b'l', b'o', b'o',
153361
];
154-
let result = decode_long_string(&value)?;
362+
let result = decode_long_string(&data)?;
155363
assert_eq!(result, "Hello");
156364
Ok(())
157365
}

0 commit comments

Comments
 (0)