@@ -17,6 +17,72 @@ def write(
1717 warehouse : str | None = None ,
1818 min_commit_frequency : int | None = 60_000 ,
1919):
20+ """
21+ Writes the stream of changes from ``table`` into `Iceberg <https://iceberg.apache.org/>`_
22+ data storage. The data storage must be defined with the REST catalog URI, the namespace,
23+ and the table name.
24+
25+ If the namespace or the table doesn't exist, it will be created by the connector.
26+ The schema of the new table is inferred from the ``table``'s schema. The output table
27+ must include two additional integer columns: ``time``, representing the computation
28+ minibatch, and ``diff``, indicating the type of change (``1`` for row addition and
29+ ``-1`` for row deletion).
30+
31+ Args:
32+ table: Table to be written.
33+ catalog_uri: URI of the Iceberg REST catalog.
34+ namespace: The name of the namespace containing the target table, given as a
35+ list of its segments (see the example below). If the namespace doesn't
36+ exist, it will be created by the connector.
36+ table_name: The name of the table to be written. If a table with such a name
37+ doesn't exist, it will be created by the connector.
38+ warehouse: Optional, path to the Iceberg storage warehouse.
39+ min_commit_frequency: Specifies the minimum time interval between two data
40+ commits in storage, measured in milliseconds. If set to ``None``, finalized
41+ minibatches will be committed as soon as possible. Keep in mind that each
42+ commit in Iceberg creates a new Parquet file and writes an entry in the
43+ transaction log. Therefore, it is advisable to limit the frequency of commits
44+ to reduce the overhead of processing the resulting table.
45+
46+ Returns:
47+ None
48+
49+ Example:
50+
51+ Consider a table of user data stored locally in a file called ``users.txt`` in CSV format.
52+ The Iceberg output connector provides the capability to place this table into
53+ Iceberg storage, defined by the catalog with URI ``http://localhost:8181``. The target
54+ table is ``users``, located in the ``app`` namespace.
55+
56+ First, the table must be read. To do this, you need to define the schema. For
57+ simplicity, consider that it consists of two fields: the user ID and the name.
58+
59+ The schema definition may look as follows:
60+
61+ >>> import pathway as pw
62+ >>> class InputSchema(pw.Schema):
63+ ... user_id: int
64+ ... name: str
65+
66+ Using this schema, you can read the table from the input file. You need to use the
67+ ``pw.io.csv.read`` connector. Here, you can use the static mode since the text file
68+ with the users doesn't change dynamically.
69+
70+ >>> users = pw.io.csv.read("./users.txt", schema=InputSchema, mode="static")
71+
72+ Once the table is read, you can use ``pw.io.iceberg.write`` to save this table into
73+ Iceberg storage.
74+
75+ >>> pw.io.iceberg.write(
76+ ... users,
77+ ... catalog_uri="http://localhost:8181/",
78+ ... namespace=["app"],
79+ ... table_name="users",
80+ ... )
81+
82+ Don't forget to run your program with ``pw.run`` once you define all necessary
83+ computations. After execution, you will be able to see the users' data in the
84+ Iceberg storage.
85+ """
2086 _check_entitlements ("iceberg" )
2187 data_storage = api .DataStorage (
2288 storage_type = "iceberg" ,
0 commit comments