Skip to content

Commit 97c2d6a

Browse files
authored
fix: cleanup old to_record_batch_reader (#4368)
Confusing and incomplete API; instead we just expose it via the `VortexFile::scan` method --------- Signed-off-by: Andrew Duffy <[email protected]>
1 parent c8b28a1 commit 97c2d6a

File tree

3 files changed

+24
-25
lines changed

3 files changed

+24
-25
lines changed

docs/quickstart/python.rst

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,14 @@ Vortex array:
2828
>>> vtx.nbytes
2929
141024
3030

31-
Compress
32-
--------
33-
34-
Use :func:`~vortex.compress` to compress the Vortex array and check the relative size:
35-
36-
.. doctest::
37-
38-
>>> cvtx = vx.compress(vtx)
39-
>>> cvtx.nbytes
40-
14298
41-
>>> cvtx.nbytes / vtx.nbytes
42-
0.10...
43-
44-
Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
45-
cache and RAM.
46-
4731
Write
4832
-----
4933

5034
Use :func:`~vortex.io.write` to write the Vortex array to disk:
5135

5236
.. doctest::
5337

54-
>>> vortex.io.write(cvtx, "example.vortex")
38+
>>> vortex.io.write(cvtx, "example.vortex") # doctest: +SKIP
5539

5640
Small Vortex files (this one is just 71KiB) currently have substantial overhead relative to their
5741
size. This will be addressed shortly. On files with at least tens of megabytes of data, Vortex is
@@ -71,3 +55,17 @@ Use :func:`~vortex.open` to open and read the Vortex array from disk:
7155
.. doctest::
7256

7357
>>> cvtx = vortex.open("example.vortex").scan().read_all()
58+
59+
60+
Vortex is architected to achieve fast random access, in many cases hundreds of times faster
61+
than what can be achieved with Parquet.
62+
63+
If you have an external index that gives you specific rows to pull out of the Vortex file, you can skip a lot more
64+
IO and decoding and read just the data that is relevant to you:
65+
66+
.. doctest::
67+
68+
>>> vf = vortex.open("example.vortex")
69+
>>> # row indices must be ordered and unique
70+
>>> result = vf.scan(indices=vortex.array([1, 2, 10])).read_all()
71+
>>> assert len(result) == 3

vortex-python/src/dataset.rs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -127,26 +127,20 @@ impl PyVortexDataset {
127127
Ok(PyArrayRef::from(array))
128128
}
129129

130-
#[pyo3(signature = (*, columns = None, row_filter = None, indices = None, split_by = None))]
130+
#[pyo3(signature = (*, columns = None, row_filter = None, split_by = None))]
131131
pub fn to_record_batch_reader(
132132
self_: PyRef<Self>,
133133
columns: Option<Vec<Bound<'_, PyAny>>>,
134134
row_filter: Option<&Bound<'_, PyExpr>>,
135-
indices: Option<PyArrayRef>,
136135
split_by: Option<usize>,
137136
) -> PyResult<PyObject> {
138-
let mut scan = self_
137+
let scan = self_
139138
.vxf
140139
.scan()?
141140
.with_projection(projection_from_python(columns)?)
142141
.with_some_filter(filter_from_python(row_filter))
143142
.with_split_by(split_by.map(SplitBy::RowCount).unwrap_or(SplitBy::Layout));
144143

145-
if let Some(indices) = indices.map(|i| i.inner().clone()) {
146-
let indices = indices.to_primitive()?.into_buffer();
147-
scan = scan.with_row_indices(indices);
148-
}
149-
150144
// TODO(ngates): should we use multi-threaded read or not?
151145
let schema = Arc::new(scan.dtype()?.to_arrow_schema()?);
152146
let reader: Box<dyn RecordBatchReader + Send> =

vortex-python/test/test_file.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,13 @@ def test_scan(vxf: VortexFile):
4545
pass
4646

4747

48+
def test_scan_with_indices(vxf: VortexFile):
49+
total_rows = 0
50+
for rb in vxf.scan(indices=vx.array([1, 10, 1_000, 999_999])):
51+
total_rows += len(rb)
52+
assert total_rows == 4
53+
54+
4855
def test_to_arrow_batch_size(vxf: VortexFile):
4956
assert len(list(vxf.to_arrow(batch_size=1_000_000))) == 1, "batch_size=1_000_000"
5057
assert len(list(vxf.to_arrow(batch_size=1_000))) == 1_000, "batch_size=1_000"

0 commit comments

Comments
 (0)