Skip to content

Commit 499acd9

Browse files
authored
feat: VortexDataset.get_fragments yields one fragment per split (#4386)
Signed-off-by: Daniel King <[email protected]>
1 parent 7e625a7 commit 499acd9

File tree

12 files changed

+204
-65
lines changed

12 files changed

+204
-65
lines changed

vortex-file/src/file.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,19 @@
66
//! The `VortexFile` provides methods for accessing file metadata, creating segment sources for reading
77
//! data from the file, and initiating scans to read the file's contents into memory as Vortex arrays.
88
9+
use std::ops::Range;
910
use std::sync::Arc;
1011

1112
use vortex_array::ArrayRef;
1213
use vortex_array::stats::StatsSet;
13-
use vortex_dtype::{DType, Field, FieldPath, FieldPathSet};
14+
use vortex_dtype::{DType, Field, FieldMask, FieldPath, FieldPathSet};
1415
use vortex_error::VortexResult;
1516
use vortex_expr::pruning::checked_pruning_expr;
1617
use vortex_expr::{ExprRef, Scope};
1718
use vortex_layout::LayoutReader;
1819
use vortex_layout::segments::SegmentSource;
1920
use vortex_metrics::VortexMetrics;
20-
use vortex_scan::ScanBuilder;
21+
use vortex_scan::{ScanBuilder, SplitBy};
2122
use vortex_utils::aliases::hash_map::HashMap;
2223

2324
use crate::footer::Footer;
@@ -133,4 +134,8 @@ impl VortexFile {
133134
.as_constant()
134135
.is_some_and(|result| result.as_bool().value() == Some(true)))
135136
}
137+
138+
pub fn splits(&self) -> VortexResult<Vec<Range<u64>>> {
139+
SplitBy::Layout.splits(self.layout_reader()?.as_ref(), &[FieldMask::All])
140+
}
136141
}

vortex-python/python/vortex/_lib/dataset.pyi

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,27 @@ from .expr import Expr
88

99
class VortexDataset:
1010
def to_array(
11-
self, columns: list[str] | list[int] | None = None, row_filter: Expr | None = None, indices: Array | None = None
11+
self,
12+
columns: list[str] | list[int] | None = None,
13+
row_filter: Expr | None = None,
14+
indices: Array | None = None,
15+
row_range: tuple[int, int] | None = None,
1216
) -> Array: ...
1317
def to_record_batch_reader(
1418
self,
1519
columns: list[str] | list[int] | None = None,
1620
row_filter: Expr | None = None,
1721
indices: Array | None = None,
1822
split_by: int | None = None,
23+
row_range: tuple[int, int] | None = None,
1924
) -> pyarrow.RecordBatchReader: ...
20-
def count_rows(self, row_filter: Expr | None = None, split_by: int | None = None) -> int: ...
25+
def count_rows(
26+
self,
27+
row_filter: Expr | None = None,
28+
split_by: int | None = None,
29+
row_range: tuple[int, int] | None = None,
30+
) -> int: ...
2131
def schema(self) -> pyarrow.Schema: ...
32+
def splits(self) -> list[tuple[int, int]]: ...
2233

2334
def dataset_from_url(url: str) -> VortexDataset: ...

vortex-python/python/vortex/_lib/file.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,6 @@ class VortexFile:
4545
) -> pa.RecordBatchReader: ...
4646
def to_dataset(self) -> VortexDataset: ...
4747
def to_polars(self) -> pl.LazyFrame: ...
48+
def splits(self) -> list[tuple[int, int]]: ...
4849

4950
def open(path: str, *, without_segment_cache: bool = False) -> VortexFile: ...

vortex-python/python/vortex/arrays.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ def __ArrowDtype_type_patched(self: pandas.ArrowDtype):
4141

4242

4343
def empty_arrow_table(schema: pyarrow.Schema) -> pyarrow.Table:
44-
return pyarrow.Table.from_arrays([pyarrow.array([], type=t) for t in schema], schema=schema) # pyright: ignore[reportUnknownVariableType, reportUnknownArgumentType]
44+
def empty_array(f: pyarrow.Field[pyarrow.DataType]) -> pyarrow.Array[pyarrow.Scalar[pyarrow.DataType]]:
45+
return pyarrow.array([], type=f.type)
46+
47+
return pyarrow.Table.from_arrays([empty_array(field) for field in schema], schema=schema) # pyright: ignore[reportUnknownVariableType, reportUnknownArgumentType]
4548

4649

4750
def arrow_table_from_struct_array(

0 commit comments

Comments
 (0)