@@ -14,14 +14,30 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from pathlib import PosixPath
+
 import pytest
 from datafusion import SessionContext
 from pyarrow import Table as pa_table
 
+from pyiceberg.catalog import Catalog
+from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.table import UpsertResult
 from tests.catalog.test_base import InMemoryCatalog, Table
 
-_TEST_NAMESPACE = "test_ns"
+
+@pytest.fixture
+def catalog(tmp_path: PosixPath) -> InMemoryCatalog:
+    catalog = InMemoryCatalog("test.in_memory.catalog", warehouse=tmp_path.absolute().as_posix())
+    catalog.create_namespace("default")
+    return catalog
+
+
+def _drop_table(catalog: Catalog, identifier: str) -> None:
+    try:
+        catalog.drop_table(identifier)
+    except NoSuchTableError:
+        pass
 
 
 def show_iceberg_table(table: Table, ctx: SessionContext) -> None:
@@ -72,7 +88,7 @@ def gen_source_dataset(start_row: int, end_row: int, composite_key: bool, add_du
 
 
 def gen_target_iceberg_table(
-    start_row: int, end_row: int, composite_key: bool, ctx: SessionContext, catalog: InMemoryCatalog, namespace: str
+    start_row: int, end_row: int, composite_key: bool, ctx: SessionContext, catalog: InMemoryCatalog, identifier: str
 ) -> Table:
     additional_columns = ", t.order_id + 1000 as order_line_id" if composite_key else ""
 
@@ -83,7 +99,7 @@ def gen_target_iceberg_table(
         from t
     """).to_arrow_table()
 
-    table = catalog.create_table(f"{_TEST_NAMESPACE}.target", df.schema)
+    table = catalog.create_table(identifier, df.schema)
 
     table.append(df)
 
@@ -95,13 +111,6 @@ def assert_upsert_result(res: UpsertResult, expected_updated: int, expected_inse
     assert res.rows_inserted == expected_inserted, f"rows inserted should be {expected_inserted}, but got {res.rows_inserted}"
 
 
-@pytest.fixture(scope="session")
-def catalog_conn() -> InMemoryCatalog:
-    catalog = InMemoryCatalog("test")
-    catalog.create_namespace(namespace=_TEST_NAMESPACE)
-    yield catalog
-
-
 @pytest.mark.parametrize(
     "join_cols, src_start_row, src_end_row, target_start_row, target_end_row, when_matched_update_all, when_not_matched_insert_all, expected_updated, expected_inserted",
     [
@@ -112,7 +121,7 @@ def catalog_conn() -> InMemoryCatalog:
     ],
 )
 def test_merge_rows(
-    catalog_conn: InMemoryCatalog,
+    catalog: Catalog,
     join_cols: list[str],
     src_start_row: int,
     src_end_row: int,
@@ -123,12 +132,13 @@ def test_merge_rows(
     expected_updated: int,
     expected_inserted: int,
 ) -> None:
-    ctx = SessionContext()
+    identifier = "default.test_merge_rows"
+    _drop_table(catalog, identifier)
 
-    catalog = catalog_conn
+    ctx = SessionContext()
 
     source_df = gen_source_dataset(src_start_row, src_end_row, False, False, ctx)
-    ice_table = gen_target_iceberg_table(target_start_row, target_end_row, False, ctx, catalog, _TEST_NAMESPACE)
+    ice_table = gen_target_iceberg_table(target_start_row, target_end_row, False, ctx, catalog, identifier)
     res = ice_table.upsert(
         df=source_df,
         join_cols=join_cols,
@@ -138,13 +148,13 @@ def test_merge_rows(
 
     assert_upsert_result(res, expected_updated, expected_inserted)
 
-    catalog.drop_table(f"{_TEST_NAMESPACE}.target")
 
-
-def test_merge_scenario_skip_upd_row(catalog_conn: InMemoryCatalog) -> None:
+def test_merge_scenario_skip_upd_row(catalog: Catalog) -> None:
     """
     tests a single insert and update; skips a row that does not need to be updated
     """
+    identifier = "default.test_merge_scenario_skip_upd_row"
+    _drop_table(catalog, identifier)
 
     ctx = SessionContext()
 
@@ -154,8 +164,7 @@ def test_merge_scenario_skip_upd_row(catalog_conn: InMemoryCatalog) -> None:
         select 2 as order_id, date '2021-01-01' as order_date, 'A' as order_type
     """).to_arrow_table()
 
-    catalog = catalog_conn
-    table = catalog.create_table(f"{_TEST_NAMESPACE}.target", df.schema)
+    table = catalog.create_table(identifier, df.schema)
 
     table.append(df)
 
@@ -174,24 +183,24 @@ def test_merge_scenario_skip_upd_row(catalog_conn: InMemoryCatalog) -> None:
 
     assert_upsert_result(res, expected_updated, expected_inserted)
 
-    catalog.drop_table(f"{_TEST_NAMESPACE}.target")
-
 
-def test_merge_scenario_date_as_key(catalog_conn: InMemoryCatalog) -> None:
+def test_merge_scenario_date_as_key(catalog: Catalog) -> None:
     """
     tests a single insert and update; primary key is a date column
     """
 
     ctx = SessionContext()
 
+    identifier = "default.test_merge_scenario_date_as_key"
+    _drop_table(catalog, identifier)
+
     df = ctx.sql("""
         select date '2021-01-01' as order_date, 'A' as order_type
         union all
         select date '2021-01-02' as order_date, 'A' as order_type
     """).to_arrow_table()
 
-    catalog = catalog_conn
-    table = catalog.create_table(f"{_TEST_NAMESPACE}.target", df.schema)
+    table = catalog.create_table(identifier, df.schema)
 
     table.append(df)
 
@@ -210,14 +219,15 @@ def test_merge_scenario_date_as_key(catalog_conn: InMemoryCatalog) -> None:
 
     assert_upsert_result(res, expected_updated, expected_inserted)
 
-    catalog.drop_table(f"{_TEST_NAMESPACE}.target")
-
 
-def test_merge_scenario_string_as_key(catalog_conn: InMemoryCatalog) -> None:
+def test_merge_scenario_string_as_key(catalog: Catalog) -> None:
     """
     tests a single insert and update; primary key is a string column
     """
 
+    identifier = "default.test_merge_scenario_string_as_key"
+    _drop_table(catalog, identifier)
+
     ctx = SessionContext()
 
     df = ctx.sql("""
@@ -226,8 +236,7 @@ def test_merge_scenario_string_as_key(catalog_conn: InMemoryCatalog) -> None:
         select 'def' as order_id, 'A' as order_type
     """).to_arrow_table()
 
-    catalog = catalog_conn
-    table = catalog.create_table(f"{_TEST_NAMESPACE}.target", df.schema)
+    table = catalog.create_table(identifier, df.schema)
 
     table.append(df)
 
@@ -246,18 +255,18 @@ def test_merge_scenario_string_as_key(catalog_conn: InMemoryCatalog) -> None:
 
     assert_upsert_result(res, expected_updated, expected_inserted)
 
-    catalog.drop_table(f"{_TEST_NAMESPACE}.target")
 
-
-def test_merge_scenario_composite_key(catalog_conn: InMemoryCatalog) -> None:
+def test_merge_scenario_composite_key(catalog: Catalog) -> None:
     """
     tests merging 200 rows with a composite key
     """
 
+    identifier = "default.test_merge_scenario_composite_key"
+    _drop_table(catalog, identifier)
+
     ctx = SessionContext()
 
-    catalog = catalog_conn
-    table = gen_target_iceberg_table(1, 200, True, ctx, catalog, _TEST_NAMESPACE)
+    table = gen_target_iceberg_table(1, 200, True, ctx, catalog, identifier)
     source_df = gen_source_dataset(101, 300, True, False, ctx)
 
     res = table.upsert(df=source_df, join_cols=["order_id", "order_line_id"])
@@ -267,43 +276,41 @@ def test_merge_scenario_composite_key(catalog_conn: InMemoryCatalog) -> None:
 
     assert_upsert_result(res, expected_updated, expected_inserted)
 
-    catalog.drop_table(f"{_TEST_NAMESPACE}.target")
 
-
-def test_merge_source_dups(catalog_conn: InMemoryCatalog) -> None:
+def test_merge_source_dups(catalog: Catalog) -> None:
     """
     tests duplicate rows in source
     """
 
+    identifier = "default.test_merge_source_dups"
+    _drop_table(catalog, identifier)
+
     ctx = SessionContext()
 
-    catalog = catalog_conn
-    table = gen_target_iceberg_table(1, 10, False, ctx, catalog, _TEST_NAMESPACE)
+    table = gen_target_iceberg_table(1, 10, False, ctx, catalog, identifier)
     source_df = gen_source_dataset(5, 15, False, True, ctx)
 
     with pytest.raises(Exception, match="Duplicate rows found in source dataset based on the key columns. No upsert executed"):
         table.upsert(df=source_df, join_cols=["order_id"])
 
-    catalog.drop_table(f"{_TEST_NAMESPACE}.target")
-
 
-def test_key_cols_misaligned(catalog_conn: InMemoryCatalog) -> None:
+def test_key_cols_misaligned(catalog: Catalog) -> None:
     """
     tests join columns missing from one of the tables
     """
 
+    identifier = "default.test_key_cols_misaligned"
+    _drop_table(catalog, identifier)
+
     ctx = SessionContext()
 
     df = ctx.sql("select 1 as order_id, date '2021-01-01' as order_date, 'A' as order_type").to_arrow_table()
 
-    catalog = catalog_conn
-    table = catalog.create_table(f"{_TEST_NAMESPACE}.target", df.schema)
+    table = catalog.create_table(identifier, df.schema)
 
     table.append(df)
 
     df_src = ctx.sql("select 1 as item_id, date '2021-05-01' as order_date, 'B' as order_type").to_arrow_table()
 
     with pytest.raises(Exception, match=r"""Field ".*" does not exist in schema"""):
         table.upsert(df=df_src, join_cols=["order_id"])
-
-    catalog.drop_table(f"{_TEST_NAMESPACE}.target")
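
A standalone sketch of the upsert flow these tests exercise, building the source with pyarrow directly rather than datafusion SQL. It assumes pyiceberg's Table.upsert and InMemoryCatalog behave as used in the diff above; InMemoryCatalog ships with pyiceberg's own test suite (tests/catalog/test_base.py), so this only runs inside that repo, and the table name and data here are hypothetical, not part of the change.

    import tempfile

    import pyarrow as pa

    from tests.catalog.test_base import InMemoryCatalog

    # Fresh in-memory catalog backed by a throwaway warehouse directory,
    # mirroring what the function-scoped `catalog` fixture above provides.
    catalog = InMemoryCatalog("sketch.catalog", warehouse=tempfile.mkdtemp())
    catalog.create_namespace("default")

    # Seed the target table with two rows.
    target = pa.table({"order_id": [1, 2], "order_type": ["A", "A"]})
    table = catalog.create_table("default.sketch_upsert", target.schema)
    table.append(target)

    # The source changes order_id 2 and introduces order_id 3, so the
    # returned UpsertResult should count one update and one insert.
    source = pa.table({"order_id": [2, 3], "order_type": ["B", "B"]})
    res = table.upsert(df=source, join_cols=["order_id"])
    assert res.rows_updated == 1 and res.rows_inserted == 1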
0 commit comments