66import pytest
77
88from data_rentgen .consumer .extractors .generic import GenericExtractor
9- from data_rentgen .dto import DatasetDTO , InputDTO , LocationDTO , OperationDTO , OutputDTO , OutputTypeDTO , SchemaDTO
9+ from data_rentgen .dto import (
10+ DatasetDTO ,
11+ InputDTO ,
12+ LocationDTO ,
13+ OperationDTO ,
14+ OutputDTO ,
15+ OutputTypeDTO ,
16+ SchemaDTO ,
17+ SQLQueryDTO ,
18+ )
1019from data_rentgen .openlineage .dataset import (
1120 OpenLineageDataset ,
1221 OpenLineageInputDataset ,
@@ -199,7 +208,7 @@ def test_extractors_extract_input_for_long_operations():
199208 (None , None , None ),
200209 ],
201210)
202- def test_extractors_extract_output_batch (
211+ def test_extractors_extract_output_batch_with_lifecycle (
203212 lifecycle_state_change : OpenLineageDatasetLifecycleStateChange ,
204213 expected_type : OutputTypeDTO ,
205214 row_count : int | None ,
@@ -247,6 +256,58 @@ def test_extractors_extract_output_batch(
247256 )
248257
249258
@pytest.mark.parametrize(
    ["sql_query", "expected_type"],
    [
        ("CREATE TABLE AS SELECT * FROM mytable", OutputTypeDTO.CREATE),
        ("INSERT INTO mytable SELECT * FROM mytable", OutputTypeDTO.APPEND),
        ("UPDATE mytable SET a=1", OutputTypeDTO.UPDATE),
        ("DELETE FROM mytable", OutputTypeDTO.DELETE),
        ("COPY mytable FROM '...'", OutputTypeDTO.APPEND),
        ("ALTER TABLE mytable RENAME TO mytable_new", OutputTypeDTO.RENAME),
        ("ALTER TABLE mytable DROP COLUMN a", OutputTypeDTO.ALTER),
        ("TRUNCATE TABLE mytable", OutputTypeDTO.TRUNCATE),
        ("TRUNCATE TABLE mytable DROP STORAGE", OutputTypeDTO.TRUNCATE),
        ("ALTER TABLE mytable TRUNCATE PARTITION (a=1, b=2)", OutputTypeDTO.TRUNCATE),
        ("DROP TABLE mytable", OutputTypeDTO.DROP),
        ("DROP TABLE mytable PURGE", OutputTypeDTO.DROP),
        ("ALTER TABLE mytable DROP PARTITION (a=1, b=2)", OutputTypeDTO.DROP),
        ("MERGE INTO mytable", OutputTypeDTO.MERGE),
        ("CALL myproc()", OutputTypeDTO.UNKNOWN),
    ],
)
def test_extractors_extract_output_batch_with_sql(
    sql_query: str,
    expected_type: OutputTypeDTO,
):
    """Check the output type reported for an operation that carries a SQL query.

    Each parametrized case pairs a SQL statement with the ``OutputTypeDTO``
    that ``GenericExtractor.extract_output`` is expected to put into the
    resulting ``OutputDTO``. The output dataset itself is built with only a
    namespace and a name.
    """
    # Operation and event share the same timestamp.
    event_time = datetime(2024, 7, 5, 9, 6, 29, 462000, tzinfo=timezone.utc)

    operation = Mock(spec=OperationDTO)
    operation.sql_query = SQLQueryDTO(query=sql_query)
    operation.created_at = event_time

    event = Mock(spec=OpenLineageRunEvent)
    event.eventTime = event_time

    output = OpenLineageOutputDataset(
        namespace="hdfs://test-hadoop:9820",
        name="/user/hive/warehouse/mydb.db/mytable",
    )

    expected_location = LocationDTO(
        type="hdfs",
        name="test-hadoop:9820",
        addresses={"hdfs://test-hadoop:9820"},
    )
    expected_dataset = DatasetDTO(
        name="/user/hive/warehouse/mydb.db/mytable",
        location=expected_location,
    )
    expected_output = OutputDTO(
        created_at=event_time,
        type=expected_type,
        operation=operation,
        dataset=expected_dataset,
    )

    result = GenericExtractor().extract_output(operation, output, event)

    # The extractor returns a pair; the second element is expected to be an empty list.
    assert result == (expected_output, [])
309+
310+
250311def test_extractors_extract_output_for_long_running_operations ():
251312 output = OpenLineageOutputDataset (
252313 namespace = "hdfs://test-hadoop:9820" ,
@@ -255,6 +316,7 @@ def test_extractors_extract_output_for_long_running_operations():
255316
256317 # operation is streaming and created long time ago
257318 operation = Mock (spec = OperationDTO )
319+ operation .sql_query = None
258320 operation .created_at = datetime (2024 , 7 , 5 , tzinfo = timezone .utc )
259321
260322 event = Mock (spec = OpenLineageRunEvent )
@@ -264,7 +326,7 @@ def test_extractors_extract_output_for_long_running_operations():
264326 OutputDTO (
265327 # count only whole hours since operation was created
266328 created_at = operation .created_at + timedelta (hours = 9 ),
267- type = OutputTypeDTO .APPEND ,
329+ type = OutputTypeDTO .UNKNOWN ,
268330 operation = operation ,
269331 dataset = DatasetDTO (
270332 name = "/user/hive/warehouse/mydb.db/mytable" ,
0 commit comments