Commit aa59971
ENH: Implement pandas.read_iceberg
1 parent cf6de58

16 files changed: +332 −1 lines

doc/source/getting_started/install.rst

Lines changed: 2 additions & 1 deletion
@@ -299,7 +299,7 @@ Dependency Minimum Versi
 Other data sources
 ^^^^^^^^^^^^^^^^^^

-Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``
+Installable with ``pip install "pandas[hdf5, parquet, iceberg, feather, spss, excel]"``

 ====================================================== ================== ================ ==========================================================
 Dependency                                             Minimum Version    pip extra        Notes
@@ -309,6 +309,7 @@ Dependency Minimum Version pip ex
 `zlib <https://github.com/madler/zlib>`__                                 hdf5             Compression for HDF5
 `fastparquet <https://github.com/dask/fastparquet>`__  2024.2.0           -                Parquet reading / writing (pyarrow is default)
 `pyarrow <https://github.com/apache/arrow>`__          10.0.1             parquet, feather Parquet, ORC, and feather reading / writing
+`PyIceberg <https://py.iceberg.apache.org/>`__         0.6.1              iceberg          Apache Iceberg reading
 `pyreadstat <https://github.com/Roche/pyreadstat>`__   1.2.6              spss             SPSS files (.sav) reading
 `odfpy <https://github.com/eea/odfpy>`__               1.4.1              excel            Open document format (.odf, .ods, .odt) reading / writing
 ====================================================== ================== ================ ==========================================================

doc/source/reference/io.rst

Lines changed: 6 additions & 0 deletions
@@ -156,6 +156,12 @@ Parquet
     read_parquet
     DataFrame.to_parquet

+Iceberg
+~~~~~~~
+.. autosummary::
+   :toctree: api/
+
+   read_iceberg
+
 ORC
 ~~~
 .. autosummary::

doc/source/user_guide/io.rst

Lines changed: 97 additions & 0 deletions
@@ -29,6 +29,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
 binary,`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__, :ref:`read_hdf<io.hdf5>`, :ref:`to_hdf<io.hdf5>`
 binary,`Feather Format <https://github.com/wesm/feather>`__, :ref:`read_feather<io.feather>`, :ref:`to_feather<io.feather>`
 binary,`Parquet Format <https://parquet.apache.org/>`__, :ref:`read_parquet<io.parquet>`, :ref:`to_parquet<io.parquet>`
+binary,`Apache Iceberg <https://iceberg.apache.org/>`__, :ref:`read_iceberg<io.iceberg>` , NA
 binary,`ORC Format <https://orc.apache.org/>`__, :ref:`read_orc<io.orc>`, :ref:`to_orc<io.orc>`
 binary,`Stata <https://en.wikipedia.org/wiki/Stata>`__, :ref:`read_stata<io.stata_reader>`, :ref:`to_stata<io.stata_writer>`
 binary,`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__, :ref:`read_sas<io.sas_reader>` , NA
@@ -5403,6 +5404,102 @@ The above example creates a partitioned dataset that may look like:
     except OSError:
         pass

+.. _io.iceberg:
+
+Iceberg
+-------
+
+.. versionadded:: 3.0.0
+
+Apache Iceberg is a high-performance open-source format for large analytic tables.
+Iceberg enables the use of SQL tables for big data while making it possible for different
+engines to safely work with the same tables at the same time.
+
+Iceberg supports predicate pushdown and column pruning, which are available to pandas
+users via the ``row_filter`` and ``selected_fields`` parameters of the
+:func:`~pandas.read_iceberg` function. This makes it convenient to extract from a large
+table a subset that fits in memory as a pandas ``DataFrame``.
+
+Internally, pandas uses PyIceberg_ to query Iceberg.
+
+.. _PyIceberg: https://py.iceberg.apache.org/
+
+A simple example loading all data from an Iceberg table ``my_table`` defined in the
+``my_catalog`` catalog:
+
+.. code-block:: python
+
+    df = pd.read_iceberg("my_table", catalog_name="my_catalog")
+
+Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory.
+It is possible to change properties of the catalog definition with the
+``catalog_properties`` parameter:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        catalog_properties={"s3.secret-access-key": "my_secret"},
+    )
+
+It is also possible to fully specify the catalog in ``catalog_properties`` and not provide
+a ``catalog_name``:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_properties={
+            "uri": "http://127.0.0.1:8181",
+            "s3.endpoint": "http://127.0.0.1:9000",
+        },
+    )
+
+To create the ``DataFrame`` with only a subset of the columns:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        selected_fields=["my_column_3", "my_column_7"],
+    )
+
+This executes faster, since the other columns are not read, and it also saves memory,
+since data from the other columns is never loaded into the underlying memory of the
+``DataFrame``.
+
+To fetch only a subset of the rows, use the ``limit`` parameter:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        limit=100,
+    )
+
+This creates a ``DataFrame`` with 100 rows, assuming the table contains at least that
+many rows.
+
+To fetch a subset of the rows based on a condition, use the ``row_filter`` parameter:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        row_filter="distance > 10.0",
+    )
+
+Reading a particular snapshot is also possible by passing its ID to the ``snapshot_id``
+parameter.
+
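+For example, with a snapshot ID previously obtained from the table's metadata (the ID
+below is a placeholder):
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        snapshot_id=8892334215937163086,  # placeholder snapshot ID
+    )
+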
+More information about the Iceberg format can be found on the `Apache Iceberg official
+page <https://iceberg.apache.org/>`__.
+
 .. _io.orc:

 ORC

pandas/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -164,6 +164,7 @@
     read_stata,
     read_sas,
     read_spss,
+    read_iceberg,
 )

 from pandas.io.json._normalize import json_normalize
@@ -319,6 +320,7 @@
     "read_fwf",
     "read_hdf",
     "read_html",
+    "read_iceberg",
     "read_json",
     "read_orc",
     "read_parquet",

pandas/io/api.py

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@
 )
 from pandas.io.feather_format import read_feather
 from pandas.io.html import read_html
+from pandas.io.iceberg import read_iceberg
 from pandas.io.json import read_json
 from pandas.io.orc import read_orc
 from pandas.io.parquet import read_parquet
@@ -47,6 +48,7 @@
     "read_fwf",
     "read_hdf",
     "read_html",
+    "read_iceberg",
     "read_json",
     "read_orc",
     "read_parquet",

pandas/io/iceberg.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+from typing import (
+    Any,
+)
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas import DataFrame
+
+
+def read_iceberg(
+    table_identifier: str,
+    catalog_name: str | None = None,
+    catalog_properties: dict[str, Any] | None = None,
+    row_filter: str | None = None,
+    selected_fields: tuple[str, ...] | None = None,
+    case_sensitive: bool = True,
+    snapshot_id: int | None = None,
+    limit: int | None = None,
+    scan_properties: dict[str, Any] | None = None,
+) -> DataFrame:
+    """
+    Read an Apache Iceberg table into a pandas DataFrame.
+
+    Parameters
+    ----------
+    table_identifier : str
+        Table identifier.
+    catalog_name : str, optional
+        The name of the catalog.
+    catalog_properties : dict of {str: str}, optional
+        The properties that are used next to the catalog configuration.
+    row_filter : str, optional
+        A string that describes the desired rows.
+    selected_fields : tuple of str, optional
+        A tuple of strings representing the column names to return in the output
+        dataframe.
+    case_sensitive : bool, default True
+        If True, column matching is case sensitive.
+    snapshot_id : int, optional
+        Snapshot ID to time travel to. By default the table will be scanned as of the
+        current snapshot ID.
+    limit : int, optional
+        An integer representing the number of rows to return in the scan result.
+        By default all matching rows will be fetched.
+    scan_properties : dict of {str: obj}, optional
+        Additional Table properties as a dictionary of string key value pairs to use
+        for this scan.
+
+    Returns
+    -------
+    DataFrame
+        DataFrame based on the Iceberg table.
+
+    See Also
+    --------
+    read_parquet : Read a Parquet file.
+
+    Examples
+    --------
+    >>> df = pd.read_iceberg(
+    ...     "my_table",
+    ...     catalog_name="my_catalog",
+    ...     catalog_properties={"s3.secret-access-key": "my-secret"},
+    ...     row_filter="trip_distance >= 10.0",
+    ...     selected_fields=("VendorID", "tpep_pickup_datetime"),
+    ... )  # doctest: +SKIP
+    """
+    pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+    pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions")
+
+    if catalog_properties is None:
+        catalog_properties = {}
+    catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
+    table = catalog.load_table(table_identifier)
+    if row_filter is None:
+        # no filter provided, scan all rows
+        row_filter = pyiceberg_expressions.AlwaysTrue()
+    if selected_fields is None:
+        # no column pruning, return all columns
+        selected_fields = ("*",)
+    if scan_properties is None:
+        scan_properties = {}
+    result = table.scan(
+        row_filter=row_filter,
+        selected_fields=selected_fields,
+        case_sensitive=case_sensitive,
+        snapshot_id=snapshot_id,
+        options=scan_properties,
+        limit=limit,
+    )
+    return result.to_pandas()
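
A minimal usage sketch of the new function, assuming a REST catalog reachable at
placeholder endpoints; the table identifier, endpoints, and column names below are
hypothetical and mirror the user-guide examples above:

    import pandas as pd

    df = pd.read_iceberg(
        "default.my_table",  # placeholder table identifier
        catalog_properties={
            "uri": "http://127.0.0.1:8181",  # placeholder REST catalog endpoint
            "s3.endpoint": "http://127.0.0.1:9000",  # placeholder object store
        },
        row_filter="A > 0",  # predicate pushdown
        selected_fields=("A", "B"),  # column pruning
        limit=10,
    )
    print(df.head())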

pandas/tests/api/test_api.py

Lines changed: 1 addition & 0 deletions
@@ -168,6 +168,7 @@ class TestPDApi(Base):
     "read_parquet",
     "read_orc",
     "read_spss",
+    "read_iceberg",
 ]

 # top-level json funcs
(new binary test data file, 20 KB, not shown)

(new Iceberg table metadata file)
@@ -0,0 +1 @@
+{"location":"file:///home/mgarcia/src/pandas/pandas/tests/io/data/iceberg/default.db/simple","table-uuid":"b991389a-a555-4af4-a26a-260eba47eca9","last-updated-ms":1746040267249,"last-column-id":2,"schemas":[{"type":"struct","fields":[{"id":1,"name":"A","type":"long","required":false},{"id":2,"name":"B","type":"string","required":false}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"default-spec-id":0,"last-partition-id":999,"properties":{},"snapshots":[],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"format-version":2,"last-sequence-number":0}
