Skip to content

Commit eb17394

Browse files
Improved docs (#6)
1 parent ddd87f2 commit eb17394

File tree

9 files changed

+99
-9
lines changed

9 files changed

+99
-9
lines changed

src/error.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,22 @@
1+
//! Contains [`Error`]
12
use crate::proto::stream::Kind;
23

4+
/// Possible errors from this crate.
35
#[derive(Debug, Clone)]
46
pub enum Error {
7+
/// Generic error returned when the file is out of spec
58
OutOfSpec,
6-
RleLiteralTooLarge,
9+
/// When a string column contains a value with invalid UTF-8
710
InvalidUtf8,
11+
/// When the user requests a column that does not exist
812
InvalidColumn(u32),
13+
/// When the user requests a type that does not exist for the given column
914
InvalidKind(u32, Kind),
15+
/// When decoding a float fails
1016
DecodeFloat,
17+
/// When decompression fails
1118
Decompression,
19+
/// When decoding the proto files fails
1220
InvalidProto,
1321
}
1422

src/lib.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
Welcome to `orc-format` documentation. Thanks for checking it out!
2+
3+
This Rust crate is a toolkit to read and deserialize ORC to your favourite in-memory format.
4+
5+
Below is an example of how to read a column from ORC into memory:
6+
7+
```rust
8+
use std::fs::File;
9+
10+
use orc_format::{error::Error, read, read::Column};
11+
12+
13+
fn get_stripe(path: &str, column: u32) -> Result<Column, Error> {
14+
// open the file, as expected. buffering this is not necessary - we
15+
// are very careful about the number of `read`s we perform.
16+
let mut f = File::open(path).expect("no file found");
17+
18+
// read the file's metadata
19+
let metadata = read::read_metadata(&mut f)?;
20+
// and copy the compression it is using
21+
let compression = metadata.postscript.compression();
22+
23+
// the next step is to identify which stripe we want to read. Let's say it is the first one.
24+
let stripe = &metadata.footer.stripes[0];
25+
26+
// Each stripe has a footer - we need to read it to extract the location of each column in it.
27+
let stripe_footer = read::read_stripe_footer(&mut f, stripe, compression, &mut vec![])?;
28+
29+
// Finally, we read the column into `Column`
30+
read::read_stripe_column(&mut f, stripe, stripe_footer, compression, column, vec![])
31+
}
32+
```
33+
34+
To deserialize the values of a column, use the iterators in `read::decode`.
35+
For example, the snippet below deserializes the "Present" stream into a `Vec<bool>`.
36+
37+
```rust
38+
use orc_format::{error::Error, proto::stream::Kind, read::decode::BooleanIter, read::Column};
39+
40+
fn deserialize_present(column: &Column, scratch: &mut Vec<u8>) -> Result<Vec<bool>, Error> {
41+
let mut reader = column.get_stream(Kind::Present, std::mem::take(scratch))?;
42+
43+
let mut validity = Vec::with_capacity(column.number_of_rows());
44+
BooleanIter::new(&mut reader, column.number_of_rows()).try_for_each(|item| {
45+
validity.push(item?);
46+
Result::<(), Error>::Ok(())
47+
})?;
48+
49+
*scratch = std::mem::take(&mut reader.into_inner());
50+
51+
Ok(validity)
52+
}
53+
```
54+
55+
Check out the integration tests of the crate to find deserialization of other types such
56+
as floats, integers, strings and dictionaries.

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#![doc = include_str!("lib.md")]
2+
#![forbid(unsafe_code)]
13
pub mod error;
24
pub mod proto;
35
pub mod read;

src/read/decode/float.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ pub struct FloatIter<'a, T: Float, R: std::io::Read> {
3838
}
3939

4040
impl<'a, T: Float, R: std::io::Read> FloatIter<'a, T, R> {
41+
/// Returns a new [`FloatIter`]
4142
#[inline]
4243
pub fn new(reader: &'a mut R, length: usize) -> Self {
4344
Self {
@@ -47,11 +48,13 @@ impl<'a, T: Float, R: std::io::Read> FloatIter<'a, T, R> {
4748
}
4849
}
4950

51+
/// The number of items remaining
5052
#[inline]
5153
pub fn len(&self) -> usize {
5254
self.remaining
5355
}
5456

57+
/// Whether the iterator is empty
5558
#[must_use]
5659
pub fn is_empty(&self) -> bool {
5760
self.len() == 0

src/read/decode/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
//! Contains different iterators that receive a reader ([`std::io::Read`])
2+
//! and return values for each of ORC's physical types (e.g. boolean).
13
mod boolean_rle;
24
mod float;
35
mod rle_v2;
46
mod variable_length;
57

68
pub use boolean_rle::{BooleanIter, BooleanRleRunIter, BooleanRun};
79
pub use float::FloatIter;
8-
pub use rle_v2::IteratorEnum;
910
pub use rle_v2::{SignedRleV2Iter, SignedRleV2Run, UnsignedRleV2Iter, UnsignedRleV2Run};
1011
pub use variable_length::Values;
1112

src/read/decode/rle_v2.rs

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -413,12 +413,6 @@ impl Iterator for SignedDeltaRun {
413413
}
414414
}
415415

416-
pub enum IteratorEnum<I, II, III> {
417-
Direct(I),
418-
Delta(II),
419-
ShortRepeat(III),
420-
}
421-
422416
#[inline]
423417
fn run_encoding(header: u8) -> EncodingTypeV2 {
424418
match (header & 128 == 128, header & 64 == 64) {
@@ -433,13 +427,18 @@ fn run_encoding(header: u8) -> EncodingTypeV2 {
433427
}
434428
}
435429

430+
/// An enum describing one of the RLE v2 runs for unsigned integers
436431
pub enum UnsignedRleV2Run {
432+
/// Direct
437433
Direct(UnsignedDirectRun),
434+
/// Delta
438435
Delta(UnsignedDeltaRun),
436+
/// Short repeat
439437
ShortRepeat(UnsignedShortRepeat),
440438
}
441439

442440
impl UnsignedRleV2Run {
441+
/// Returns a new [`UnsignedRleV2Run`] owning `scratch`.
443442
pub fn try_new<R: Read>(reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
444443
let mut header = [0u8];
445444
reader.read_exact(&mut header)?;
@@ -460,6 +459,7 @@ impl UnsignedRleV2Run {
460459
}
461460
}
462461

462+
/// The number of items remaining
463463
pub fn len(&self) -> usize {
464464
match self {
465465
Self::Direct(run) => run.len(),
@@ -468,19 +468,22 @@ impl UnsignedRleV2Run {
468468
}
469469
}
470470

471+
/// Whether the iterator is empty
471472
#[must_use]
472473
pub fn is_empty(&self) -> bool {
473474
self.len() == 0
474475
}
475476
}
476477

478+
/// A fallible [`Iterator`] of [`UnsignedRleV2Run`].
477479
pub struct UnsignedRleV2Iter<'a, R: Read> {
478480
reader: &'a mut R,
479481
scratch: Vec<u8>,
480482
length: usize,
481483
}
482484

483485
impl<'a, R: Read> UnsignedRleV2Iter<'a, R> {
486+
/// Returns a new [`UnsignedRleV2Iter`].
484487
pub fn new(reader: &'a mut R, length: usize, scratch: Vec<u8>) -> Self {
485488
Self {
486489
reader,
@@ -514,6 +517,7 @@ impl SignedDirectRun {
514517
self.0.len()
515518
}
516519

520+
/// Whether the iterator is empty
517521
#[must_use]
518522
pub fn is_empty(&self) -> bool {
519523
self.len() == 0
@@ -540,10 +544,12 @@ impl SignedShortRepeat {
540544
UnsignedShortRepeat::try_new(header, reader, scratch).map(Self)
541545
}
542546

547+
/// The number of items remaining
543548
pub fn len(&self) -> usize {
544549
self.0.len()
545550
}
546551

552+
/// Whether the iterator is empty
547553
#[must_use]
548554
pub fn is_empty(&self) -> bool {
549555
self.len() == 0
@@ -562,14 +568,19 @@ impl Iterator for SignedShortRepeat {
562568
}
563569
}
564570

571+
/// An enum describing one of the RLE v2 runs for signed integers
565572
#[derive(Debug)]
566573
pub enum SignedRleV2Run {
574+
/// Direct
567575
Direct(SignedDirectRun),
576+
/// Delta
568577
Delta(SignedDeltaRun),
578+
/// Short repeat
569579
ShortRepeat(SignedShortRepeat),
570580
}
571581

572582
impl SignedRleV2Run {
583+
/// Returns a new [`SignedRleV2Run`], moving `scratch` to itself
573584
pub fn try_new<R: Read>(reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
574585
let mut header = [0u8];
575586
reader.read_exact(&mut header)?;
@@ -590,6 +601,7 @@ impl SignedRleV2Run {
590601
}
591602
}
592603

604+
/// The number of items remaining
593605
pub fn len(&self) -> usize {
594606
match self {
595607
Self::Direct(run) => run.len(),
@@ -598,19 +610,22 @@ impl SignedRleV2Run {
598610
}
599611
}
600612

613+
/// Whether the iterator is empty
601614
#[must_use]
602615
pub fn is_empty(&self) -> bool {
603616
self.len() == 0
604617
}
605618
}
606619

620+
/// A fallible [`Iterator`] of [`SignedRleV2Run`].
607621
pub struct SignedRleV2Iter<R: Read> {
608622
reader: R,
609623
scratch: Vec<u8>,
610624
length: usize,
611625
}
612626

613627
impl<R: Read> SignedRleV2Iter<R> {
628+
/// Returns a new [`SignedRleV2Iter`].
614629
pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
615630
Self {
616631
reader,

src/read/decompress/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! Contains [`Decompressor`]
12
use std::io::Read;
23

34
use fallible_streaming_iterator::FallibleStreamingIterator;
@@ -91,13 +92,15 @@ impl<'a> FallibleStreamingIterator for DecompressorIter<'a> {
9192
}
9293
}
9394

95+
/// A [`Read`]er fulfilling the ORC specification of reading compressed data.
9496
pub struct Decompressor<'a> {
9597
decompressor: DecompressorIter<'a>,
9698
offset: usize,
9799
is_first: bool,
98100
}
99101

100102
impl<'a> Decompressor<'a> {
103+
/// Creates a new [`Decompressor`] that will use `scratch` as a temporary region.
101104
pub fn new(stream: &'a [u8], compression: CompressionKind, scratch: Vec<u8>) -> Self {
102105
Self {
103106
decompressor: DecompressorIter::new(stream, compression, scratch),
@@ -106,6 +109,7 @@ impl<'a> Decompressor<'a> {
106109
}
107110
}
108111

112+
/// Returns the internal memory region, so it can be re-used
109113
pub fn into_inner(self) -> Vec<u8> {
110114
self.decompressor.into_inner()
111115
}

src/read/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
//! 2. Read the stripe (or part of it in projection pushdown)
66
//! 3. For each column, select the relevant region of the stripe
77
//! 4. Attach an Iterator to the region
8-
#![forbid(unsafe_code)]
98
109
use std::io::{Read, Seek, SeekFrom};
1110

src/read/stripe.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,12 @@ impl Column {
5454
.ok_or(Error::InvalidKind(self.column, kind))
5555
}
5656

57+
/// Returns the encoding of the column
5758
pub fn encoding(&self) -> &ColumnEncoding {
5859
&self.footer.columns[self.column as usize]
5960
}
6061

62+
/// Returns the number of items in the dictionary, if any
6163
pub fn dictionary_size(&self) -> Option<usize> {
6264
self.footer.columns[self.column as usize]
6365
.dictionary_size

0 commit comments

Comments
 (0)