Skip to content

Commit 75d1440

Browse files
committed
[Python][Dataset] Add filters support to orc.read_table with path/file-like compatibility
1 parent c6ecd4c commit 75d1440

File tree

1 file changed

+81
-6
lines changed

1 file changed

+81
-6
lines changed

python/pyarrow/orc.py

Lines changed: 81 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,26 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18+
"""
19+
Apache ORC file format with predicate pushdown support.
20+
21+
ORC supports stripe-level filtering using column statistics for INT32/INT64 columns.
22+
23+
**Dataset API** (recommended for multiple files)::
24+
25+
>>> import pyarrow.dataset as ds
26+
>>> dataset = ds.dataset('data.orc', format='orc')
27+
>>> table = dataset.to_table(filter=ds.field('value') > 100)
28+
29+
**Convenience API** (single file)::
30+
31+
>>> import pyarrow.dataset as ds
>>> import pyarrow.orc as orc
32+
>>> table = orc.read_table('data.orc', filters=ds.field('value') > 100)
33+
>>> table = orc.read_table('data.orc', filters=[('value', '>', 100)]) # DNF tuples
34+
"""
1835

1936
from numbers import Integral
37+
import os
2038
import warnings
2139

2240
from pyarrow.lib import Table
@@ -297,17 +315,35 @@ def close(self):
297315
self.is_open = False
298316

299317

def read_table(source, columns=None, filesystem=None, filters=None):
    # Public docstring is assigned to ``read_table.__doc__`` below.
    filesystem, path = _resolve_filesystem_and_path(source, filesystem)

    if filters is not None:
        import pyarrow.dataset as ds
        from pyarrow.parquet.core import filters_to_expression

        # filters_to_expression accepts both a pyarrow.compute.Expression
        # and the DNF tuple formats ([tuple, ...] / [[tuple, ...], ...]).
        filter_expr = filters_to_expression(filters)

        # The dataset API requires path-like inputs.  For file-like /
        # NativeFile sources (filesystem is None and path is not path-like),
        # fall back to a direct ORC read plus in-memory filtering.
        if filesystem is None and not isinstance(path, (str, bytes, os.PathLike)):
            # Filter BEFORE projecting so the predicate may reference
            # columns that are not in ``columns`` — this matches the
            # dataset path below, which also filters first and projects
            # afterwards.  (Projecting first would raise when the filter
            # touches a dropped column.)
            table = ORCFile(source).read().filter(filter_expr)
            if columns is not None:
                # ``columns=[]`` yields a zero-column table, preserving
                # the behavior of the unfiltered code path.
                table = table.select(columns)
            return table

        # Path-like input: let the dataset layer do stripe-level pruning
        # where ORC statistics allow it.
        dataset_source = path if filesystem is not None else source
        dataset = ds.dataset(dataset_source, format='orc', filesystem=filesystem)
        return dataset.to_table(columns=columns, filter=filter_expr)

    if filesystem is not None:
        source = filesystem.open_input_file(path)

    if columns is not None and len(columns) == 0:
        # Explicit empty selection: keep row count, drop all columns.
        return ORCFile(source).read().select(columns)
    return ORCFile(source).read(columns=columns)
311347

312348

313349
read_table.__doc__ = """
@@ -330,6 +366,45 @@ def read_table(source, columns=None, filesystem=None):
330366
If nothing passed, will be inferred based on path.
331367
Path will try to be found in the local on-disk filesystem otherwise
332368
it will be parsed as an URI to determine the filesystem.
369+
filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None
370+
Predicate expression to filter rows. Uses ORC stripe-level statistics for
371+
optimization when possible.
372+
373+
Accepts Expression objects or DNF (Disjunctive Normal Form) tuples::
374+
375+
# Expression format
376+
filters=ds.field('id') > 100
377+
378+
# DNF tuples: list of conditions (AND), or list of lists (OR of ANDs)
379+
filters=[('id', '>', 100)] # single condition
380+
filters=[('id', '>', 100), ('id', '<', 200)] # AND
381+
filters=[[('x', '==', 1)], [('x', '==', 2)]] # OR
382+
383+
Supported operators: ==, !=, <, >, <=, >=, in, not in
384+
385+
Note: For path-like inputs, filters are evaluated through the dataset API.
386+
For file-like inputs, read_table falls back to in-memory filtering.
387+
388+
Returns
389+
-------
390+
pyarrow.Table
391+
Content of the file as a Table.
392+
393+
Examples
394+
--------
395+
Read entire file:
396+
397+
>>> import pyarrow.orc as orc
398+
>>> table = orc.read_table('data.orc')
399+
400+
Read with predicate pushdown:
401+
402+
>>> import pyarrow.dataset as ds
403+
>>> table = orc.read_table('data.orc', filters=ds.field('id') > 1000)
404+
405+
Read with column selection and filtering:
406+
407+
>>> table = orc.read_table('data.orc', columns=['id', 'value'], filters=[('id', '>', 1000)])
333408
"""
334409

335410

0 commit comments

Comments
 (0)