Skip to content

Commit 389b2b0

Browse files
authored
[Variant] Fix several overflow panic risks for 32-bit arch (#7752)
# Which issue does this PR close? Part of * #6736 # Rationale for this change The variant spec makes extensive use of 4-byte offsets. On a 32-bit system, this can lead to integer overflow panics when doing offset arithmetic on malicious or malformed variant data, because `usize` is 32 bits. That is bad. # What changes are included in this PR? Use checked arithmetic in code that operates on offsets extracted from (untrusted) variant byte buffers. Of particular interest, we define and use a new `slice_from_slice_at_offset` helper, which safely adds an offset to a range. # Are there any user-facing changes? No.
1 parent 8d8541c commit 389b2b0

File tree

6 files changed

+117
-71
lines changed

6 files changed

+117
-71
lines changed

parquet-variant/src/decoder.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
// KIND, either express or implied. See the License for the
1515
// specific language governing permissions and limitations
1616
// under the License.
17-
use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};
17+
use crate::utils::{array_from_slice, slice_from_slice_at_offset, string_from_slice};
1818
use crate::ShortString;
1919

2020
use arrow_schema::ArrowError;
@@ -23,6 +23,9 @@ use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
2323
use std::array::TryFromSliceError;
2424
use std::num::TryFromIntError;
2525

26+
// Makes the code a bit more readable
27+
pub(crate) const VARIANT_VALUE_HEADER_BYTES: usize = 1;
28+
2629
#[derive(Debug, Clone, Copy, PartialEq)]
2730
pub enum VariantBasicType {
2831
Primitive = 0,
@@ -262,21 +265,19 @@ pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, A
262265
/// Decodes a Binary from the value section of a variant.
263266
pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
264267
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
265-
let value = slice_from_slice(data, 4..4 + len)?;
266-
Ok(value)
268+
slice_from_slice_at_offset(data, 4, 0..len)
267269
}
268270

269271
/// Decodes a long string from the value section of a variant.
270272
pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
271273
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
272-
let string = string_from_slice(data, 4..4 + len)?;
273-
Ok(string)
274+
string_from_slice(data, 4, 0..len)
274275
}
275276

276277
/// Decodes a short string from the value section of a variant.
277278
pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<ShortString, ArrowError> {
278279
let len = (metadata >> 2) as usize;
279-
let string = string_from_slice(data, 0..len)?;
280+
let string = string_from_slice(data, 0, 0..len)?;
280281
ShortString::try_new(string)
281282
}
282283

@@ -518,10 +519,11 @@ mod tests {
518519

519520
let width = OffsetSizeBytes::Two;
520521

521-
// dictionary_size starts immediately after the header
522+
// dictionary_size starts immediately after the header byte
522523
let dict_size = width.unpack_usize(&buf, 1, 0).unwrap();
523524
assert_eq!(dict_size, 2);
524525

526+
// offset array immediately follows the dictionary size
525527
let first = width.unpack_usize(&buf, 1, 1).unwrap();
526528
assert_eq!(first, 0);
527529

parquet-variant/src/utils.rs

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ use arrow_schema::ArrowError;
2121
use std::fmt::Debug;
2222
use std::slice::SliceIndex;
2323

24+
/// Helper for reporting integer overflow errors in a consistent way.
25+
pub(crate) fn overflow_error(msg: &str) -> ArrowError {
26+
ArrowError::InvalidArgumentError(format!("Integer overflow computing {msg}"))
27+
}
28+
2429
#[inline]
2530
pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone + Debug>(
2631
bytes: &[u8],
@@ -33,17 +38,33 @@ pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone + Debug>(
3338
))
3439
})
3540
}
41+
42+
/// Helper to safely slice bytes with offset calculations.
43+
///
44+
/// Equivalent to `slice_from_slice(bytes, (base_offset + range.start)..(base_offset + range.end))`
45+
/// but using checked addition to prevent integer overflow panics on 32-bit systems.
46+
#[inline]
47+
pub(crate) fn slice_from_slice_at_offset(
48+
bytes: &[u8],
49+
base_offset: usize,
50+
range: Range<usize>,
51+
) -> Result<&[u8], ArrowError> {
52+
let start_byte = base_offset
53+
.checked_add(range.start)
54+
.ok_or_else(|| overflow_error("slice start"))?;
55+
let end_byte = base_offset
56+
.checked_add(range.end)
57+
.ok_or_else(|| overflow_error("slice end"))?;
58+
slice_from_slice(bytes, start_byte..end_byte)
59+
}
60+
3661
pub(crate) fn array_from_slice<const N: usize>(
3762
bytes: &[u8],
3863
offset: usize,
3964
) -> Result<[u8; N], ArrowError> {
40-
let bytes = slice_from_slice(bytes, offset..offset + N)?;
41-
bytes.try_into().map_err(map_try_from_slice_error)
42-
}
43-
44-
/// To be used in `map_err` when unpacking an integer from a slice of bytes.
45-
pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
46-
ArrowError::InvalidArgumentError(e.to_string())
65+
slice_from_slice_at_offset(bytes, offset, 0..N)?
66+
.try_into()
67+
.map_err(|e: TryFromSliceError| ArrowError::InvalidArgumentError(e.to_string()))
4768
}
4869

4970
pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {
@@ -53,9 +74,13 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {
5374
.ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string()))
5475
}
5576

56-
/// Helper to get a &str from a slice based on range, if it's valid or an error otherwise
57-
pub(crate) fn string_from_slice(slice: &[u8], range: Range<usize>) -> Result<&str, ArrowError> {
58-
str::from_utf8(slice_from_slice(slice, range)?)
77+
/// Helper to get a &str from a slice at the given offset and range, or an error if invalid.
78+
pub(crate) fn string_from_slice(
79+
slice: &[u8],
80+
offset: usize,
81+
range: Range<usize>,
82+
) -> Result<&str, ArrowError> {
83+
str::from_utf8(slice_from_slice_at_offset(slice, offset, range)?)
5984
.map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()))
6085
}
6186

parquet-variant/src/variant.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,13 +168,13 @@ impl<'a> TryFrom<&'a str> for ShortString<'a> {
168168
}
169169
}
170170

171-
impl<'a> AsRef<str> for ShortString<'a> {
171+
impl AsRef<str> for ShortString<'_> {
172172
fn as_ref(&self) -> &str {
173173
self.0
174174
}
175175
}
176176

177-
impl<'a> Deref for ShortString<'a> {
177+
impl Deref for ShortString<'_> {
178178
type Target = str;
179179

180180
fn deref(&self) -> &Self::Target {

parquet-variant/src/variant/list.rs

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,16 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717
use crate::decoder::OffsetSizeBytes;
18-
use crate::utils::{first_byte_from_slice, slice_from_slice, validate_fallible_iterator};
18+
use crate::utils::{
19+
first_byte_from_slice, overflow_error, slice_from_slice_at_offset, validate_fallible_iterator,
20+
};
1921
use crate::variant::{Variant, VariantMetadata};
2022

2123
use arrow_schema::ArrowError;
2224

25+
// The value header occupies one byte; use a named constant for readability
26+
const NUM_HEADER_BYTES: usize = 1;
27+
2328
/// A parsed version of the variant array value header byte.
2429
#[derive(Clone, Debug, PartialEq)]
2530
pub(crate) struct VariantListHeader {
@@ -78,25 +83,16 @@ impl<'m, 'v> VariantList<'m, 'v> {
7883
false => OffsetSizeBytes::One,
7984
};
8085

81-
// Skip the header byte to read the num_elements
82-
let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
83-
let first_offset_byte = 1 + num_elements_size as usize;
84-
85-
let overflow =
86-
|| ArrowError::InvalidArgumentError("Variant value_byte_length overflow".into());
87-
88-
// 1. num_elements + 1
89-
let n_offsets = num_elements.checked_add(1).ok_or_else(overflow)?;
90-
91-
// 2. (num_elements + 1) * offset_size
92-
let value_bytes = n_offsets
93-
.checked_mul(header.offset_size as usize)
94-
.ok_or_else(overflow)?;
86+
// Skip the header byte to read the num_elements; the offset array immediately follows
87+
let num_elements = num_elements_size.unpack_usize(value, NUM_HEADER_BYTES, 0)?;
88+
let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize;
9589

96-
// 3. first_offset_byte + ...
97-
let first_value_byte = first_offset_byte
98-
.checked_add(value_bytes)
99-
.ok_or_else(overflow)?;
90+
// (num_elements + 1) * offset_size + first_offset_byte
91+
let first_value_byte = num_elements
92+
.checked_add(1)
93+
.and_then(|n| n.checked_mul(header.offset_size as usize))
94+
.and_then(|n| n.checked_add(first_offset_byte))
95+
.ok_or_else(|| overflow_error("offset of variant list values"))?;
10096

10197
let new_self = Self {
10298
metadata,
@@ -139,9 +135,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
139135
};
140136

141137
// Read the value bytes from the offsets
142-
let variant_value_bytes = slice_from_slice(
138+
let variant_value_bytes = slice_from_slice_at_offset(
143139
self.value,
144-
self.first_value_byte + unpack(index)?..self.first_value_byte + unpack(index + 1)?,
140+
self.first_value_byte,
141+
unpack(index)?..unpack(index + 1)?,
145142
)?;
146143
let variant = Variant::try_new_with_metadata(self.metadata, variant_value_bytes)?;
147144
Ok(variant)

parquet-variant/src/variant/metadata.rs

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717

1818
use crate::decoder::OffsetSizeBytes;
1919
use crate::utils::{
20-
first_byte_from_slice, slice_from_slice, string_from_slice, validate_fallible_iterator,
20+
first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
21+
validate_fallible_iterator,
2122
};
2223

2324
use arrow_schema::ArrowError;
@@ -35,6 +36,9 @@ pub(crate) struct VariantMetadataHeader {
3536
// purposes and to make that visible.
3637
const CORRECT_VERSION_VALUE: u8 = 1;
3738

39+
// The metadata header occupies one byte; use a named constant for readability
40+
const NUM_HEADER_BYTES: usize = 1;
41+
3842
impl VariantMetadataHeader {
3943
/// Tries to construct the variant metadata header, which has the form
4044
///
@@ -102,20 +106,22 @@ impl<'m> VariantMetadata<'m> {
102106
let header_byte = first_byte_from_slice(bytes)?;
103107
let header = VariantMetadataHeader::try_new(header_byte)?;
104108

105-
// Offset 1, index 0 because first element after header is dictionary size
106-
let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?;
109+
// First element after header is dictionary size
110+
let dict_size = header
111+
.offset_size
112+
.unpack_usize(bytes, NUM_HEADER_BYTES, 0)?;
107113

108114
// Calculate the starting offset of the dictionary string bytes.
109115
//
110116
// Value header, dict_size (offset_size bytes), and dict_size+1 offsets
111-
// = 1 + offset_size + (dict_size + 1) * offset_size
112-
// = (dict_size + 2) * offset_size + 1
117+
// = NUM_HEADER_BYTES + offset_size + (dict_size + 1) * offset_size
118+
// = (dict_size + 2) * offset_size + NUM_HEADER_BYTES
113119
let dictionary_key_start_byte = dict_size
114120
.checked_add(2)
115121
.and_then(|n| n.checked_mul(header.offset_size as usize))
116-
.and_then(|n| n.checked_add(1))
117-
.ok_or_else(|| ArrowError::InvalidArgumentError("metadata length overflow".into()))?;
118-
println!("dictionary_key_start_byte: {dictionary_key_start_byte}");
122+
.and_then(|n| n.checked_add(NUM_HEADER_BYTES))
123+
.ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?;
124+
119125
let new_self = Self {
120126
bytes,
121127
header,
@@ -149,16 +155,17 @@ impl<'m> VariantMetadata<'m> {
149155
/// This offset is an index into the dictionary, at the boundary between string `i-1` and string
150156
/// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
151157
fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
152-
// Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
158+
// Skip the header byte and the dictionary_size entry (by offset_index + 1)
153159
let bytes = slice_from_slice(self.bytes, ..self.dictionary_key_start_byte)?;
154-
self.header.offset_size.unpack_usize(bytes, 1, i + 1)
160+
self.header
161+
.offset_size
162+
.unpack_usize(bytes, NUM_HEADER_BYTES, i + 1)
155163
}
156164

157165
/// Gets a dictionary entry by index
158166
pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
159-
let dictionary_keys_bytes = slice_from_slice(self.bytes, self.dictionary_key_start_byte..)?;
160167
let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
161-
string_from_slice(dictionary_keys_bytes, byte_range)
168+
string_from_slice(self.bytes, self.dictionary_key_start_byte, byte_range)
162169
}
163170

164171
/// Get all dictionary entries as an Iterator of strings

parquet-variant/src/variant/object.rs

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,16 @@
1616
// under the License.
1717
use crate::decoder::OffsetSizeBytes;
1818
use crate::utils::{
19-
first_byte_from_slice, slice_from_slice, try_binary_search_range_by, validate_fallible_iterator,
19+
first_byte_from_slice, overflow_error, slice_from_slice, try_binary_search_range_by,
20+
validate_fallible_iterator,
2021
};
2122
use crate::variant::{Variant, VariantMetadata};
2223

2324
use arrow_schema::ArrowError;
2425

26+
// The value header occupies one byte; use a named constant for readability
27+
const NUM_HEADER_BYTES: usize = 1;
28+
2529
/// Header structure for [`VariantObject`]
2630
#[derive(Clone, Debug, PartialEq)]
2731
pub(crate) struct VariantObjectHeader {
@@ -72,36 +76,43 @@ impl<'m, 'v> VariantObject<'m, 'v> {
7276
let header_byte = first_byte_from_slice(value)?;
7377
let header = VariantObjectHeader::try_new(header_byte)?;
7478

75-
// Determine num_elements size based on is_large flag
79+
// Determine num_elements size based on is_large flag and fetch the value
7680
let num_elements_size = if header.is_large {
7781
OffsetSizeBytes::Four
7882
} else {
7983
OffsetSizeBytes::One
8084
};
85+
let num_elements = num_elements_size.unpack_usize(value, NUM_HEADER_BYTES, 0)?;
86+
87+
// Calculate byte offsets for different sections with overflow protection
88+
let field_ids_start_byte = NUM_HEADER_BYTES
89+
.checked_add(num_elements_size as usize)
90+
.ok_or_else(|| overflow_error("offset of variant object field ids"))?;
8191

82-
// Parse num_elements
83-
let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
92+
let field_offsets_start_byte = num_elements
93+
.checked_mul(header.field_id_size as usize)
94+
.and_then(|n| n.checked_add(field_ids_start_byte))
95+
.ok_or_else(|| overflow_error("offset of variant object field offsets"))?;
8496

85-
// Calculate byte offsets for different sections
86-
let field_ids_start_byte = 1 + num_elements_size as usize;
87-
let field_offsets_start_byte =
88-
field_ids_start_byte + num_elements * header.field_id_size as usize;
89-
let values_start_byte =
90-
field_offsets_start_byte + (num_elements + 1) * header.field_offset_size as usize;
97+
let values_start_byte = num_elements
98+
.checked_add(1)
99+
.and_then(|n| n.checked_mul(header.field_offset_size as usize))
100+
.and_then(|n| n.checked_add(field_offsets_start_byte))
101+
.ok_or_else(|| overflow_error("offset of variant object field values"))?;
91102

92103
// Spec says: "The last field_offset points to the byte after the end of the last value"
93104
//
94105
// Use the last offset as a bounds check. The iterator check below doesn't use it -- offsets
95106
// are not monotonic -- so we have to check separately here.
96-
let last_field_offset =
97-
header
98-
.field_offset_size
99-
.unpack_usize(value, field_offsets_start_byte, num_elements)?;
100-
if values_start_byte + last_field_offset > value.len() {
107+
let end_offset = header
108+
.field_offset_size
109+
.unpack_usize(value, field_offsets_start_byte, num_elements)?
110+
.checked_add(values_start_byte)
111+
.ok_or_else(|| overflow_error("end of variant object field values"))?;
112+
if end_offset > value.len() {
101113
return Err(ArrowError::InvalidArgumentError(format!(
102-
"Last field offset value {} at offset {} is outside the value slice of length {}",
103-
last_field_offset,
104-
values_start_byte,
114+
"Last field offset value {} is outside the value slice of length {}",
115+
end_offset,
105116
value.len()
106117
)));
107118
}
@@ -140,7 +151,11 @@ impl<'m, 'v> VariantObject<'m, 'v> {
140151
self.field_offsets_start_byte,
141152
i,
142153
)?;
143-
let value_bytes = slice_from_slice(self.value, self.values_start_byte + start_offset..)?;
154+
let value_start = self
155+
.values_start_byte
156+
.checked_add(start_offset)
157+
.ok_or_else(|| overflow_error("offset of variant object field"))?;
158+
let value_bytes = slice_from_slice(self.value, value_start..)?;
144159
Variant::try_new_with_metadata(self.metadata, value_bytes)
145160
}
146161

0 commit comments

Comments
 (0)