Skip to content

Commit 84d25c3

Browse files
committed
[DOP-26403] Set output type based on SQL query type
1 parent 39e58e4 commit 84d25c3

File tree

10 files changed

+190
-24
lines changed

10 files changed

+190
-24
lines changed

data_rentgen/consumer/extractors/generic/io.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33
from __future__ import annotations
44

5+
import re
56
from abc import ABC, abstractmethod
67
from datetime import datetime, timedelta
78

@@ -23,6 +24,25 @@
2324
from data_rentgen.openlineage.dataset_facets import OpenLineageSchemaField
2425
from data_rentgen.openlineage.run_event import OpenLineageRunEvent
2526

27+
SQL_QUERY_SYNTAX = re.compile(
28+
r"\b(?P<query_type>MERGE|INSERT|UPDATE|DELETE|CREATE|RENAME|TRUNCATE|DROP(?!\sCOLUMN)|COPY)\s",
29+
re.IGNORECASE | re.DOTALL,
30+
)
31+
# ALTER has the lowest priority: it is checked only when no other query-type keyword matches
32+
ALTER_SYNTAX = re.compile(r"\bALTER\s", re.IGNORECASE | re.DOTALL)
33+
QUERY_TYPE_TO_OUTPUT_TYPE = {
34+
"MERGE": OutputTypeDTO.MERGE,
35+
"INSERT": OutputTypeDTO.APPEND,
36+
"UPDATE": OutputTypeDTO.UPDATE,
37+
"DELETE": OutputTypeDTO.DELETE,
38+
"CREATE": OutputTypeDTO.CREATE,
39+
"RENAME": OutputTypeDTO.RENAME,
40+
"ALTER": OutputTypeDTO.ALTER,
41+
"TRUNCATE": OutputTypeDTO.TRUNCATE,
42+
"DROP": OutputTypeDTO.DROP,
43+
"COPY": OutputTypeDTO.APPEND,
44+
}
45+
2646
METASTORE = DatasetSymlinkTypeDTO.METASTORE
2747
WAREHOUSE = DatasetSymlinkTypeDTO.WAREHOUSE
2848

@@ -121,7 +141,7 @@ def extract_output(
121141
created_at=created_at,
122142
operation=operation,
123143
dataset=resolved_dataset_dto,
124-
type=self._extract_output_type(operation, dataset),
144+
type=self._extract_output_type(operation, dataset) or OutputTypeDTO.UNKNOWN,
125145
schema=self.extract_schema(dataset),
126146
)
127147
if dataset.outputFacets.outputStatistics:
@@ -135,10 +155,21 @@ def _extract_output_type(
135155
self,
136156
operation: OperationDTO,
137157
dataset: OpenLineageOutputDataset,
138-
) -> OutputTypeDTO:
158+
) -> OutputTypeDTO | None:
139159
if dataset.facets.lifecycleStateChange:
140160
return OutputTypeDTO[dataset.facets.lifecycleStateChange.lifecycleStateChange]
141-
return OutputTypeDTO.APPEND
161+
if operation.sql_query:
162+
return self._extract_output_type_from_sql(operation.sql_query.query)
163+
return None
164+
165+
def _extract_output_type_from_sql(self, sql: str) -> OutputTypeDTO | None:
166+
found = SQL_QUERY_SYNTAX.search(sql)
167+
if found:
168+
return QUERY_TYPE_TO_OUTPUT_TYPE[found.group("query_type")]
169+
found = ALTER_SYNTAX.search(sql)
170+
if found:
171+
return OutputTypeDTO.ALTER
172+
return None
142173

143174
def _schema_field_to_json(self, field: OpenLineageSchemaField):
144175
result: dict = {

data_rentgen/consumer/extractors/impl/dbt.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
from __future__ import annotations
44

55
from data_rentgen.consumer.extractors.generic import GenericExtractor
6-
from data_rentgen.dto import DatasetDTO, OperationDTO, RunDTO
7-
from data_rentgen.openlineage.dataset import OpenLineageDataset
6+
from data_rentgen.dto import DatasetDTO, OperationDTO, OutputTypeDTO, RunDTO
7+
from data_rentgen.openlineage.dataset import OpenLineageDataset, OpenLineageOutputDataset
88
from data_rentgen.openlineage.dataset_facets import (
99
OpenLineageColumnLineageDatasetFacetFieldRef,
1010
OpenLineageSymlinkIdentifier,
@@ -41,9 +41,18 @@ def extract_operation(self, event: OpenLineageRunEvent) -> OperationDTO:
4141

4242
def _extract_dataset_ref(
4343
self,
44-
dataset_ref: OpenLineageDataset | OpenLineageColumnLineageDatasetFacetFieldRef | OpenLineageSymlinkIdentifier,
44+
dataset: OpenLineageDataset | OpenLineageColumnLineageDatasetFacetFieldRef | OpenLineageSymlinkIdentifier,
4545
) -> DatasetDTO:
46-
dataset = super()._extract_dataset_ref(dataset_ref)
46+
dataset_dto = super()._extract_dataset_ref(dataset)
4747
# https://github.com/OpenLineage/OpenLineage/pull/3707
48-
dataset.name = dataset.name.replace("None.", "")
49-
return dataset
48+
dataset_dto.name = dataset.name.replace("None.", "")
49+
return dataset_dto
50+
51+
def _extract_output_type(
52+
self,
53+
operation: OperationDTO,
54+
dataset: OpenLineageOutputDataset,
55+
) -> OutputTypeDTO | None:
56+
# by default, model is not materialized, and is either VIEW or INSERT INTO
57+
result = super()._extract_output_type(operation, dataset)
58+
return result or OutputTypeDTO.APPEND

data_rentgen/consumer/extractors/impl/flink.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
from __future__ import annotations
44

55
from data_rentgen.consumer.extractors.generic import GenericExtractor
6-
from data_rentgen.dto import DatasetDTO, DatasetSymlinkDTO, RunDTO
7-
from data_rentgen.openlineage.dataset import OpenLineageDataset
6+
from data_rentgen.dto import DatasetDTO, DatasetSymlinkDTO, OperationDTO, OutputTypeDTO, RunDTO
7+
from data_rentgen.openlineage.dataset import OpenLineageDataset, OpenLineageOutputDataset
88
from data_rentgen.openlineage.dataset_facets import (
99
OpenLineageSymlinkIdentifier,
1010
OpenLineageSymlinkType,
@@ -59,3 +59,12 @@ def _extract_dataset_and_symlinks(
5959
if not (identifier.namespace.startswith("kafka://") and identifier.type == OpenLineageSymlinkType.TABLE)
6060
]
6161
return super()._extract_dataset_and_symlinks(dataset, symlink_identifiers)
62+
63+
def _extract_output_type(
64+
self,
65+
operation: OperationDTO,
66+
dataset: OpenLineageOutputDataset,
67+
) -> OutputTypeDTO | None:
68+
# In most real cases, Flink writes to Kafka with APPEND
69+
result = super()._extract_output_type(operation, dataset)
70+
return result or OutputTypeDTO.APPEND

data_rentgen/consumer/extractors/impl/spark.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,16 +77,16 @@ def extract_operation(self, event: OpenLineageRunEvent) -> OperationDTO:
7777

7878
def _extract_dataset_ref(
7979
self,
80-
dataset_ref: OpenLineageDataset | OpenLineageColumnLineageDatasetFacetFieldRef | OpenLineageSymlinkIdentifier,
80+
dataset: OpenLineageDataset | OpenLineageColumnLineageDatasetFacetFieldRef | OpenLineageSymlinkIdentifier,
8181
) -> DatasetDTO:
82-
dataset = super()._extract_dataset_ref(dataset_ref)
82+
dataset_dto = super()._extract_dataset_ref(dataset)
8383

8484
# convert /some/long/path/with=partition/another=abc to /some/long/path
85-
if "=" in dataset.name and "/" in dataset.name:
86-
name_with_partitions = PARTITION_PATH_PATTERN.match(dataset.name)
85+
if "=" in dataset_dto.name and "/" in dataset_dto.name:
86+
name_with_partitions = PARTITION_PATH_PATTERN.match(dataset_dto.name)
8787
if name_with_partitions:
88-
dataset.name = name_with_partitions.group(1)
89-
return dataset
88+
dataset_dto.name = name_with_partitions.group(1)
89+
return dataset_dto
9090

9191
def _extract_dataset_and_symlinks(
9292
self,
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# SPDX-FileCopyrightText: 2024-2025 MTS PJSC
2+
# SPDX-License-Identifier: Apache-2.0
3+
"""Increase output.type to int32
4+
5+
Revision ID: fc001835e473
6+
Revises: 85592fce8fb0
7+
Create Date: 2025-09-15 18:53:32.392011
8+
9+
"""
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision = "fc001835e473"
16+
down_revision = "85592fce8fb0"
17+
branch_labels = None
18+
depends_on = None
19+
20+
21+
def upgrade() -> None:
22+
op.alter_column(
23+
"output",
24+
"type",
25+
existing_type=sa.SMALLINT(),
26+
type_=sa.INTEGER(),
27+
existing_nullable=False,
28+
)
29+
30+
31+
def downgrade() -> None:
32+
op.alter_column(
33+
"output",
34+
"type",
35+
existing_type=sa.INTEGER(),
36+
type_=sa.SMALLINT(),
37+
existing_nullable=False,
38+
)

data_rentgen/db/models/output.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from uuid import UUID
99

1010
from sqlalchemy import UUID as SQL_UUID
11-
from sqlalchemy import BigInteger, DateTime, PrimaryKeyConstraint, SmallInteger
11+
from sqlalchemy import BigInteger, DateTime, Integer, PrimaryKeyConstraint
1212
from sqlalchemy.orm import Mapped, mapped_column, relationship
1313
from sqlalchemy_utils import ChoiceType
1414

@@ -21,6 +21,8 @@
2121

2222

2323
class OutputType(IntFlag):
24+
UNKNOWN = 0
25+
2426
APPEND = 1
2527

2628
CREATE = 2
@@ -32,6 +34,10 @@ class OutputType(IntFlag):
3234
DROP = 32
3335
TRUNCATE = 64
3436

37+
DELETE = 128
38+
UPDATE = 256
39+
MERGE = 512
40+
3541

3642
# no foreign keys to avoid scanning all the partitions
3743
class Output(Base):
@@ -102,9 +108,9 @@ class Output(Base):
102108
)
103109

104110
type: Mapped[OutputType] = mapped_column(
105-
ChoiceType(OutputType, impl=SmallInteger()),
111+
ChoiceType(OutputType, impl=Integer()),
106112
nullable=False,
107-
default=OutputType.APPEND,
113+
default=OutputType.UNKNOWN,
108114
doc="Type of the output, e.g. READ, CREATE, APPEND",
109115
)
110116

data_rentgen/dto/output.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616

1717
class OutputTypeDTO(IntFlag):
18+
UNKNOWN = 0
19+
1820
APPEND = 1
1921

2022
CREATE = 2
@@ -26,13 +28,17 @@ class OutputTypeDTO(IntFlag):
2628
DROP = 32
2729
TRUNCATE = 64
2830

31+
DELETE = 128
32+
UPDATE = 256
33+
MERGE = 512
34+
2935

3036
@dataclass(slots=True)
3137
class OutputDTO:
3238
created_at: datetime
3339
operation: OperationDTO
3440
dataset: DatasetDTO
35-
type: OutputTypeDTO
41+
type: OutputTypeDTO = OutputTypeDTO.UNKNOWN
3642
schema: SchemaDTO | None = None
3743
num_rows: int | None = None
3844
num_bytes: int | None = None

data_rentgen/server/schemas/v1/lineage.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,10 @@ class OutputTypeV1(IntFlag):
136136
DROP = 32
137137
TRUNCATE = 64
138138

139+
DELETE = 128
140+
UPDATE = 256
141+
MERGE = 512
142+
139143
def __str__(self) -> str:
140144
return f"{self.name}"
141145

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Set ``output.type`` based on executed SQL query, e.g. ``INSERT``, ``UPDATE``, ``DELETE``, and so on.

tests/test_consumer/test_extractors/test_extractors_interaction.py

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,16 @@
66
import pytest
77

88
from data_rentgen.consumer.extractors.generic import GenericExtractor
9-
from data_rentgen.dto import DatasetDTO, InputDTO, LocationDTO, OperationDTO, OutputDTO, OutputTypeDTO, SchemaDTO
9+
from data_rentgen.dto import (
10+
DatasetDTO,
11+
InputDTO,
12+
LocationDTO,
13+
OperationDTO,
14+
OutputDTO,
15+
OutputTypeDTO,
16+
SchemaDTO,
17+
SQLQueryDTO,
18+
)
1019
from data_rentgen.openlineage.dataset import (
1120
OpenLineageDataset,
1221
OpenLineageInputDataset,
@@ -199,7 +208,7 @@ def test_extractors_extract_input_for_long_operations():
199208
(None, None, None),
200209
],
201210
)
202-
def test_extractors_extract_output_batch(
211+
def test_extractors_extract_output_batch_with_lifecycle(
203212
lifecycle_state_change: OpenLineageDatasetLifecycleStateChange,
204213
expected_type: OutputTypeDTO,
205214
row_count: int | None,
@@ -247,6 +256,58 @@ def test_extractors_extract_output_batch(
247256
)
248257

249258

259+
@pytest.mark.parametrize(
260+
["sql_query", "expected_type"],
261+
[
262+
("CREATE TABLE AS SELECT * FROM mytable", OutputTypeDTO.CREATE),
263+
("INSERT INTO mytable SELECT * FROM mytable", OutputTypeDTO.APPEND),
264+
("UPDATE mytable SET a=1", OutputTypeDTO.UPDATE),
265+
("DELETE FROM mytable", OutputTypeDTO.DELETE),
266+
("COPY mytable FROM '...'", OutputTypeDTO.APPEND),
267+
("ALTER TABLE mytable RENAME TO mytable_new", OutputTypeDTO.RENAME),
268+
("ALTER TABLE mytable DROP COLUMN a", OutputTypeDTO.ALTER),
269+
("TRUNCATE TABLE mytable", OutputTypeDTO.TRUNCATE),
270+
("TRUNCATE TABLE mytable DROP STORAGE", OutputTypeDTO.TRUNCATE),
271+
("ALTER TABLE mytable TRUNCATE PARTITION (a=1, b=2)", OutputTypeDTO.TRUNCATE),
272+
("DROP TABLE mytable", OutputTypeDTO.DROP),
273+
("DROP TABLE mytable PURGE", OutputTypeDTO.DROP),
274+
("ALTER TABLE mytable DROP PARTITION (a=1, b=2)", OutputTypeDTO.DROP),
275+
("MERGE INTO mytable", OutputTypeDTO.MERGE),
276+
("CALL myproc()", OutputTypeDTO.UNKNOWN),
277+
],
278+
)
279+
def test_extractors_extract_output_batch_with_sql(
280+
sql_query: str,
281+
expected_type: OutputTypeDTO,
282+
):
283+
output = OpenLineageOutputDataset(
284+
namespace="hdfs://test-hadoop:9820",
285+
name="/user/hive/warehouse/mydb.db/mytable",
286+
)
287+
operation = Mock(spec=OperationDTO)
288+
operation.sql_query = SQLQueryDTO(query=sql_query)
289+
290+
event = Mock(spec=OpenLineageRunEvent)
291+
operation.created_at = event.eventTime = datetime(2024, 7, 5, 9, 6, 29, 462000, tzinfo=timezone.utc)
292+
293+
assert GenericExtractor().extract_output(operation, output, event) == (
294+
OutputDTO(
295+
created_at=operation.created_at,
296+
type=expected_type,
297+
operation=operation,
298+
dataset=DatasetDTO(
299+
name="/user/hive/warehouse/mydb.db/mytable",
300+
location=LocationDTO(
301+
type="hdfs",
302+
name="test-hadoop:9820",
303+
addresses={"hdfs://test-hadoop:9820"},
304+
),
305+
),
306+
),
307+
[],
308+
)
309+
310+
250311
def test_extractors_extract_output_for_long_running_operations():
251312
output = OpenLineageOutputDataset(
252313
namespace="hdfs://test-hadoop:9820",
@@ -255,6 +316,7 @@ def test_extractors_extract_output_for_long_running_operations():
255316

256317
# operation is streaming and created long time ago
257318
operation = Mock(spec=OperationDTO)
319+
operation.sql_query = None
258320
operation.created_at = datetime(2024, 7, 5, tzinfo=timezone.utc)
259321

260322
event = Mock(spec=OpenLineageRunEvent)
@@ -264,7 +326,7 @@ def test_extractors_extract_output_for_long_running_operations():
264326
OutputDTO(
265327
# count only whole hours since operation was created
266328
created_at=operation.created_at + timedelta(hours=9),
267-
type=OutputTypeDTO.APPEND,
329+
type=OutputTypeDTO.UNKNOWN,
268330
operation=operation,
269331
dataset=DatasetDTO(
270332
name="/user/hive/warehouse/mydb.db/mytable",

0 commit comments

Comments (0)