Skip to content

Commit 416490d

Browse files
Improved API and fixed edge case in delta runs (#7)
1 parent f3ba1b8 commit 416490d

File tree

12 files changed

+307
-78
lines changed

12 files changed

+307
-78
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "orc-format"
3-
version = "0.2.0"
3+
version = "0.3.0"
44
license = "MIT/Apache-2.0"
55
description = "Unofficial implementation of Apache ORC spec in safe Rust"
66
homepage = "https://github.com/DataEngineeringLabs/orc-format"

src/lib.md

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,22 @@ use std::fs::File;
1010
use orc_format::{error::Error, read, read::Column};
1111

1212

13-
fn get_stripe(path: &str, column: u32) -> Result<Column, Error> {
13+
fn get_column(path: &str, column: u32) -> Result<Column, Error> {
1414
// open the file, as expected. buffering this is not necessary - we
1515
// are very careful about the number of `read`s we perform.
1616
let mut f = File::open(path).expect("no file found");
1717

1818
// read the file's metadata
1919
let metadata = read::read_metadata(&mut f)?;
20-
// and copy the compression it is using
21-
let compression = metadata.postscript.compression();
2220

2321
// the next step is to identify which stripe we want to read. Let's say it is the first one.
24-
let stripe = &metadata.footer.stripes[0];
22+
let stripe = 0;
2523

2624
// Each stripe has a footer - we need to read it to extract the location of each column in it.
27-
let stripe_footer = read::read_stripe_footer(&mut f, stripe, compression, &mut vec![])?;
25+
let stripe_footer = read::read_stripe_footer(&mut f, &metadata, stripe, &mut vec![])?;
2826

2927
// Finally, we read the column into `Column`
30-
read::read_stripe_column(&mut f, stripe, stripe_footer, compression, column, vec![])
28+
read::read_stripe_column(&mut f, &metadata, stripe, stripe_footer, column, vec![])
3129
}
3230
```
3331

src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,3 @@
33
pub mod error;
44
pub mod proto;
55
pub mod read;
6-
7-
pub use fallible_streaming_iterator;

src/read/stripe.rs renamed to src/read/column.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,9 @@ impl Column {
7171
self.number_of_rows as usize
7272
}
7373

74-
/// Returns the underlying scratch containing a pre-allocated memory region
74+
/// Returns the underlying footer and the pre-allocated memory region
7575
/// containing all (compressed) streams of this column.
76-
pub fn into_inner(self) -> Vec<u8> {
77-
self.data
76+
pub fn into_inner(self) -> (StripeFooter, Vec<u8>) {
77+
(self.footer, self.data)
7878
}
7979
}

src/read/decode/boolean_rle.rs

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,18 @@ pub enum BooleanRun {
1111
Literals([u8; 255]),
1212
}
1313

14-
pub struct BooleanRleRunIter<'a, R: Read> {
15-
reader: &'a mut R,
14+
pub struct BooleanRleRunIter<R: Read> {
15+
reader: R,
1616
}
1717

18-
impl<'a, R: Read> BooleanRleRunIter<'a, R> {
19-
pub fn new(reader: &'a mut R) -> Self {
18+
impl<R: Read> BooleanRleRunIter<R> {
19+
pub fn new(reader: R) -> Self {
2020
Self { reader }
2121
}
22+
23+
pub fn into_inner(self) -> R {
24+
self.reader
25+
}
2226
}
2327

2428
fn read_literals<R: Read>(reader: &mut R, header: i8) -> Result<[u8; 255], Error> {
@@ -33,22 +37,22 @@ fn read_literals<R: Read>(reader: &mut R, header: i8) -> Result<[u8; 255], Error
3337
Ok(literals)
3438
}
3539

36-
impl<'a, R: Read> Iterator for BooleanRleRunIter<'a, R> {
40+
impl<R: Read> Iterator for BooleanRleRunIter<R> {
3741
type Item = Result<BooleanRun, Error>;
3842

3943
#[inline]
4044
fn next(&mut self) -> Option<Self::Item> {
41-
let header = read_u8(self.reader);
45+
let header = read_u8(&mut self.reader);
4246
let header = match header {
4347
Ok(header) => header as i8,
4448
Err(e) => return Some(Err(e.into())),
4549
};
4650
if header < 0 {
47-
Some(read_literals(self.reader, header).map(BooleanRun::Literals))
51+
Some(read_literals(&mut self.reader, header).map(BooleanRun::Literals))
4852
} else {
4953
let length = header as u16 + 3;
5054
// this is not ok - it may require more than one byte
51-
let value = read_u8(self.reader);
55+
let value = read_u8(&mut self.reader);
5256
let value = match value {
5357
Ok(value) => value,
5458
Err(e) => return Some(Err(e.into())),
@@ -58,16 +62,16 @@ impl<'a, R: Read> Iterator for BooleanRleRunIter<'a, R> {
5862
}
5963
}
6064

61-
pub struct BooleanIter<'a, R: Read> {
62-
iter: BooleanRleRunIter<'a, R>,
65+
pub struct BooleanIter<R: Read> {
66+
iter: BooleanRleRunIter<R>,
6367
current: Option<BooleanRun>,
6468
position: u8,
6569
byte_position: usize,
6670
remaining: usize,
6771
}
6872

69-
impl<'a, R: Read> BooleanIter<'a, R> {
70-
pub fn new(reader: &'a mut R, length: usize) -> Self {
73+
impl<'a, R: Read> BooleanIter<R> {
74+
pub fn new(reader: R, length: usize) -> Self {
7175
Self {
7276
iter: BooleanRleRunIter::new(reader),
7377
current: None,
@@ -76,9 +80,13 @@ impl<'a, R: Read> BooleanIter<'a, R> {
7680
remaining: length,
7781
}
7882
}
83+
84+
pub fn into_inner(self) -> R {
85+
self.iter.into_inner()
86+
}
7987
}
8088

81-
impl<'a, R: Read> Iterator for BooleanIter<'a, R> {
89+
impl<R: Read> Iterator for BooleanIter<R> {
8290
type Item = Result<bool, Error>;
8391

8492
#[inline]

src/read/decode/float.rs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,16 @@ impl Float for f64 {
3131
}
3232

3333
/// An iterator over values of a [`Float`] type `T` read from `R`, yielding `Result<T, Error>`
34-
pub struct FloatIter<'a, T: Float, R: std::io::Read> {
35-
reader: &'a mut R,
34+
pub struct FloatIter<T: Float, R: std::io::Read> {
35+
reader: R,
3636
remaining: usize,
3737
phantom: std::marker::PhantomData<T>,
3838
}
3939

40-
impl<'a, T: Float, R: std::io::Read> FloatIter<'a, T, R> {
40+
impl<T: Float, R: std::io::Read> FloatIter<T, R> {
4141
/// Returns a new [`FloatIter`]
4242
#[inline]
43-
pub fn new(reader: &'a mut R, length: usize) -> Self {
43+
pub fn new(reader: R, length: usize) -> Self {
4444
Self {
4545
reader,
4646
remaining: length,
@@ -59,9 +59,14 @@ impl<'a, T: Float, R: std::io::Read> FloatIter<'a, T, R> {
5959
pub fn is_empty(&self) -> bool {
6060
self.len() == 0
6161
}
62+
63+
/// Returns its internal reader
64+
pub fn into_inner(self) -> R {
65+
self.reader
66+
}
6267
}
6368

64-
impl<'a, T: Float, R: std::io::Read> Iterator for FloatIter<'a, T, R> {
69+
impl<T: Float, R: std::io::Read> Iterator for FloatIter<T, R> {
6570
type Item = Result<T, Error>;
6671

6772
#[inline]

src/read/decode/mod.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ mod rle_v2;
66
mod variable_length;
77

88
pub use boolean_rle::{BooleanIter, BooleanRleRunIter, BooleanRun};
9-
pub use float::FloatIter;
10-
pub use rle_v2::{SignedRleV2Iter, SignedRleV2Run, UnsignedRleV2Iter, UnsignedRleV2Run};
9+
pub use float::{Float, FloatIter};
10+
pub use rle_v2::{
11+
SignedRleV2Iter, SignedRleV2Run, SignedRleV2RunIter, UnsignedRleV2Iter, UnsignedRleV2Run,
12+
UnsignedRleV2RunIter,
13+
};
1114
pub use variable_length::Values;
1215

1316
#[inline]

0 commit comments

Comments
 (0)