1515# specific language governing permissions and limitations
1616# under the License.
1717
18+ """
19+ Apache ORC file format with predicate pushdown support.
20+
21+ ORC supports stripe-level filtering using column statistics for INT32/INT64 columns.
22+
23+ **Dataset API** (recommended for multiple files)::
24+
25+ >>> import pyarrow.dataset as ds
26+ >>> dataset = ds.dataset('data.orc', format='orc')
27+ >>> table = dataset.to_table(filter=ds.field('value') > 100)
28+
29+ **Convenience API** (single file)::
30+
31+ >>> import pyarrow.orc as orc
32+ >>> table = orc.read_table('data.orc', filters=ds.field('value') > 100)
33+ >>> table = orc.read_table('data.orc', filters=[('value', '>', 100)]) # DNF tuples
34+ """
1835
1936from numbers import Integral
37+ import os
2038import warnings
2139
2240from pyarrow .lib import Table
@@ -297,17 +315,35 @@ def close(self):
297315 self .is_open = False
298316
299317
def read_table(source, columns=None, filesystem=None, filters=None):
    # The user-facing docstring is attached right after this definition via
    # ``read_table.__doc__``; only implementation notes are kept in here.
    filesystem, path = _resolve_filesystem_and_path(source, filesystem)

    if filters is None:
        # Plain read: go straight through ORCFile without the dataset layer.
        if filesystem is not None:
            source = filesystem.open_input_file(path)

        if columns is not None and len(columns) == 0:
            # An empty projection is applied after a full read rather than
            # being passed down to the ORC reader.
            return ORCFile(source).read().select(columns)
        return ORCFile(source).read(columns=columns)

    # Imported lazily so that an unfiltered read does not depend on the
    # dataset / parquet modules being importable.
    import pyarrow.dataset as ds
    from pyarrow.parquet.core import filters_to_expression

    # filters_to_expression handles both Expression and DNF tuple formats.
    filter_expr = filters_to_expression(filters)

    # The dataset API requires path-like inputs. For file-like / NativeFile
    # inputs, fall back to a direct ORC read plus in-memory filtering.
    if filesystem is None and not isinstance(
            path, (str, bytes, os.PathLike)):
        # Filter BEFORE projecting: the predicate may reference columns that
        # are not part of ``columns`` (or ``columns`` may be empty). Applying
        # the projection first would drop those columns and make the filter
        # fail to resolve its field references, whereas the dataset branch
        # below accepts the very same call. The price is that every column
        # is read for this fallback.
        filtered = ORCFile(source).read().filter(filter_expr)
        if columns is None:
            return filtered
        return filtered.select(columns)

    dataset_source = path if filesystem is not None else source
    dataset = ds.dataset(dataset_source, format='orc', filesystem=filesystem)
    return dataset.to_table(columns=columns, filter=filter_expr)
311347
312348
313349read_table .__doc__ = """
@@ -330,6 +366,45 @@ def read_table(source, columns=None, filesystem=None):
330366 If nothing passed, will be inferred based on path.
331367 Path will try to be found in the local on-disk filesystem otherwise
332368 it will be parsed as an URI to determine the filesystem.
369+ filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None
370+ Predicate expression to filter rows. Uses ORC stripe-level statistics for
371+ optimization when possible.
372+
373+ Accepts Expression objects or DNF (Disjunctive Normal Form) tuples::
374+
375+ # Expression format
376+ filters=ds.field('id') > 100
377+
378+ # DNF tuples: list of conditions (AND), or list of lists (OR of ANDs)
379+ filters=[('id', '>', 100)] # single condition
380+ filters=[('id', '>', 100), ('id', '<', 200)] # AND
381+ filters=[[('x', '==', 1)], [('x', '==', 2)]] # OR
382+
383+ Supported operators: ==, !=, <, >, <=, >=, in, not in
384+
385+ Note: For path-like inputs, filters are evaluated through the dataset API.
386+ For file-like inputs, read_table falls back to in-memory filtering.
387+
388+ Returns
389+ -------
390+ pyarrow.Table
391+ Content of the file as a Table.
392+
393+ Examples
394+ --------
395+ Read entire file:
396+
397+ >>> import pyarrow.orc as orc
398+ >>> table = orc.read_table('data.orc')
399+
400+ Read with predicate pushdown:
401+
402+ >>> import pyarrow.dataset as ds
403+ >>> table = orc.read_table('data.orc', filters=ds.field('id') > 1000)
404+
405+ Read with column selection and filtering:
406+
407+ >>> table = orc.read_table('data.orc', columns=['id', 'value'], filters=[('id', '>', 1000)])
333408"""
334409
335410
0 commit comments