Merge branch 'develop'

dolfinus · dolfinus · commit 04d73bb58080 · 2025-11-21T18:45:37.000+03:00
diff --git a/.env.docker b/.env.docker
@@ -37,6 +37,7 @@ DATA_RENTGEN__KAFKA__COMPRESSION=zstd
 DATA_RENTGEN__UI__API_BROWSER_URL=http://localhost:8000
 
 # Session
+DATA_RENTGEN__SERVER__SESSION__ENABLED=True
 DATA_RENTGEN__SERVER__SESSION__SECRET_KEY=session_secret_key
 
 # Keycloak Auth
@@ -54,8 +55,8 @@ DATA_RENTGEN__AUTH__PROVIDER=data_rentgen.server.providers.auth.dummy_provider.D
 DATA_RENTGEN__AUTH__ACCESS_TOKEN__SECRET_KEY=access_key_secret
 
 # Personal Tokens
-export DATA_RENTGEN__AUTH__PERSONAL_TOKENS__ENABLED=True
-export DATA_RENTGEN__AUTH__PERSONAL_TOKENS__SECRET_KEY=pat_secret
+DATA_RENTGEN__AUTH__PERSONAL_TOKENS__ENABLED=True
+DATA_RENTGEN__AUTH__PERSONAL_TOKENS__SECRET_KEY=pat_secret
 
 # Cors
 DATA_RENTGEN__SERVER__CORS__ENABLED=True
diff --git a/.env.local b/.env.local
@@ -16,6 +16,7 @@ export DATA_RENTGEN__SERVER__DEBUG=True
 export DATA_RENTGEN__UI__API_BROWSER_URL=http://localhost:8000
 
 # Session
+export DATA_RENTGEN__SERVER__SESSION__ENABLED=True
 export DATA_RENTGEN__SERVER__SESSION__SECRET_KEY=session_secret_key
 
 # Keycloak Auth
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -51,7 +51,7 @@ repos:
           - --no-extra-eol
 
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.21.0
+    rev: v3.21.1
     hooks:
       - id: pyupgrade
         args: [--py37-plus, --keep-runtime-typing]
@@ -62,7 +62,7 @@ repos:
       - id: add-trailing-comma
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.2
+    rev: v0.14.5
     hooks:
       - id: ruff-check
         args: [--fix]
@@ -88,7 +88,7 @@ repos:
           - tomli
 
   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.9.5
+    rev: 0.9.9
     hooks:
       - id: uv-lock
 
diff --git a/README.rst b/README.rst
@@ -48,15 +48,15 @@ Goals
 -----
 
 * Collect lineage events produced by OpenLineage clients & integrations.
-* Store operation-grained events for better detalization (instead of job grained `Marquez <https://marquezproject.ai/>`_).
+* Store operation-grained events for a more detailed lineage.
 * Provide API for fetching both job/run ↔ dataset lineage and dataset ↔ dataset lineage.
 
 Features
 --------
 
 * Support consuming large amounts of lineage events, use Apache Kafka as event buffer.
 * Store data in tables partitioned by event timestamp, to speed up lineage graph resolution.
-* Lineage graph is build with user-specified time boundaries (unlike Marquez where lineage is build only for last job run).
+* Lineage graph is built with user-specified time boundaries.
 * Lineage graph can be build with different granularity. e.g. merge all individual Spark commands into Spark applicationId or Spark applicationName.
 * Column-level lineage support.
 * Authentication support.
@@ -71,7 +71,7 @@ Limitations
 -----------
 
 * OpenLineage have integrations with Trino, Debezium and some other lineage sources. DataRentgen support may be added later.
-* Unlike Marquez, DataRentgen parses only limited set of facets send by OpenLineage, and doesn't store custom facets. This can be changed in future.
+* DataRentgen parses only a limited set of OpenLineage facets, and doesn't store custom facets. This may change in the future.
 
 .. documentation
 
diff --git a/data_rentgen/VERSION b/data_rentgen/VERSION
@@ -1 +1 @@
-0.4.2
+0.4.3
diff --git a/data_rentgen/consumer/extractors/generic/operation.py b/data_rentgen/consumer/extractors/generic/operation.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import codecs
+import re
 from abc import ABC, abstractmethod
 from textwrap import dedent
 
@@ -17,6 +19,13 @@
     OpenLineageRunEventType,
 )
 
+# ASCII control characters (see https://www.ascii-code.com/), except \n which is left intact
+ASCII_UNPRINTABLE = re.compile(r"[\x00-\x09\x0b-\x1f\x7f]", re.UNICODE)
+
+
+def encode_char(char: re.Match[str]) -> str:
+    return codecs.encode(char.group(0), "unicode-escape").decode("utf-8")
+
 
 class OperationExtractorMixin(ABC):
     @abstractmethod
@@ -74,5 +83,7 @@ def _enrich_operation_status(self, operation: OperationDTO, event: OpenLineageRu
     def _extract_sql_query(self, event: OpenLineageRunEvent) -> SQLQueryDTO | None:
         if event.job.facets.sql:
             query = dedent(event.job.facets.sql.query).strip()
+            # https://stackoverflow.com/questions/56237415/removing-encoding-utf8-0x00-chars-from-pandas-dataframe-for-psycopg2-cursor
+            query = ASCII_UNPRINTABLE.sub(encode_char, query)
             return SQLQueryDTO(query=query)
         return None
diff --git a/data_rentgen/consumer/subscribers.py b/data_rentgen/consumer/subscribers.py
@@ -102,7 +102,6 @@ async def report_malformed(
         await publisher.publish(
             message.value,
             key=message.key,
-            partition=message.partition,
             timestamp_ms=message.timestamp,
             headers=headers or None,
             reply_to=message_id,
diff --git a/data_rentgen/db/migrations/versions/2025-11-21_8e8891273099_truncate_inputs_with_too_much_size_in_.py b/data_rentgen/db/migrations/versions/2025-11-21_8e8891273099_truncate_inputs_with_too_much_size_in_.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: 2024-2025 MTS PJSC
+# SPDX-License-Identifier: Apache-2.0
+"""Truncate inputs with too much size in bytes
+
+Revision ID: 8e8891273099
+Revises: 102502e85b2d
+Create Date: 2025-11-21 18:28:52.279644
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "8e8891273099"
+down_revision = "102502e85b2d"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.execute(sa.text("UPDATE input SET num_bytes = NULL WHERE num_bytes >= 9223372036854775807"))
+
+
+def downgrade() -> None:
+    pass
diff --git a/data_rentgen/openlineage/dataset_facets/input_statistics.py b/data_rentgen/openlineage/dataset_facets/input_statistics.py
@@ -1,18 +1,32 @@
 # SPDX-FileCopyrightText: 2024-2025 MTS PJSC
 # SPDX-License-Identifier: Apache-2.0
 
-from pydantic import Field
+from pydantic import Field, NonNegativeInt, field_validator
 
 from data_rentgen.openlineage.dataset_facets.base import (
     OpenLineageInputDatasetFacet,
 )
 
+MAX_LONG = 2**63 - 1
+
 
 class OpenLineageInputStatisticsInputDatasetFacet(OpenLineageInputDatasetFacet):
     """Dataset facet describing Input statistics.
     See [InputStatisticsInputDatasetFacet](https://github.com/OpenLineage/OpenLineage/blob/main/spec/facets/InputStatisticsInputDatasetFacet.json).
     """
 
-    rows: int | None = Field(default=None, alias="rowCount", examples=[1_000_000])
-    bytes: int | None = Field(default=None, alias="size", examples=[2**30])
-    files: int | None = Field(default=None, alias="fileCount", examples=[0])
+    # Zero is a valid value (see ``examples=[0]`` for ``files``), so only negative numbers are rejected.
+    rows: NonNegativeInt | None = Field(default=None, alias="rowCount", examples=[1_000_000])
+    bytes: NonNegativeInt | None = Field(default=None, alias="size", examples=[2**30])
+    files: NonNegativeInt | None = Field(default=None, alias="fileCount", examples=[0])
+
+    @field_validator("bytes", "rows", "files", mode="after")
+    @classmethod
+    def value_must_be_sane(cls, value: int | None) -> int | None:
+        """Return ``None`` instead of values which do not fit into signed 64-bit integer range."""
+        if value is not None and value >= MAX_LONG:
+            # Such values are a Spark quirk rather than real statistics, see:
+            # https://github.com/apache/spark/blob/v3.5.7/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L2565
+            # https://github.com/apache/spark/blob/v3.5.7/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala#L209
+            return None
+        return value
diff --git a/data_rentgen/server/middlewares/session.py b/data_rentgen/server/middlewares/session.py
@@ -8,9 +8,11 @@
 
 def apply_session_middleware(app: FastAPI, settings: SessionSettings) -> FastAPI:
     """Add SessionMiddleware middleware to the application."""
+    if not settings.enabled:
+        return app
 
-    settings_dict = settings.model_dump(exclude={"secret_key"})
-    settings_dict["secret_key"] = settings.secret_key.get_secret_value()
+    settings_dict = settings.model_dump(exclude={"secret_key", "enabled"})
+    settings_dict["secret_key"] = settings.secret_key.get_secret_value()  # type: ignore[union-attr]
 
     app.add_middleware(SessionMiddleware, **settings_dict)
     return app
diff --git a/data_rentgen/server/settings/session.py b/data_rentgen/server/settings/session.py
@@ -4,14 +4,16 @@
 
 import textwrap
 
-from pydantic import BaseModel, ConfigDict, Field, SecretStr
+from pydantic import BaseModel, ConfigDict, Field, SecretStr, ValidationInfo, field_validator
 
 DEFAULT_MAX_AGE = 1_209_600
 
 
 class SessionSettings(BaseModel):
     """Session Middleware Settings.
 
+    Required for :ref:`auth-server-keycloak`.
+
     See `SessionMiddleware <https://www.starlette.io/middleware/#sessionmiddleware>`_ documentation.
 
     .. note::
@@ -24,6 +26,7 @@ class SessionSettings(BaseModel):
 
     .. code-block:: bash
 
+        DATA_RENTGEN__SERVER__SESSION__ENABLED=True
         DATA_RENTGEN__SERVER__SESSION__SECRET_KEY=secret
         DATA_RENTGEN__SERVER__SESSION__SESSION_COOKIE=custom_cookie_name
         DATA_RENTGEN__SERVER__SESSION__MAX_AGE=None  # cookie will last as long as the browser session
@@ -33,7 +36,14 @@ class SessionSettings(BaseModel):
 
     """
 
-    secret_key: SecretStr = Field(
+    enabled: bool = Field(
+        default=False,
+        description="Set to ``True`` to enable SessionMiddleware",
+    )
+    secret_key: SecretStr | None = Field(
+        default=None,
+        # Field validators are skipped for default values unless this is set, so a missing key would go unnoticed.
+        validate_default=True,
         description=textwrap.dedent(
             """
             Secret key for encrypting cookies.
@@ -66,3 +76,12 @@ class SessionSettings(BaseModel):
     )
 
     model_config = ConfigDict(extra="allow")
+
+    @field_validator("secret_key", mode="after")
+    @classmethod
+    def _validate_secret_key(cls, value: SecretStr | None, info: ValidationInfo) -> SecretStr | None:
+        """Raise ``ValueError`` if session is enabled but ``secret_key`` is missing or empty."""
+        if not value and info.data.get("enabled"):
+            msg = "secret_key is required"
+            raise ValueError(msg)
+        return value
diff --git a/docs/changelog/0.4.1.rst b/docs/changelog/0.4.1.rst
@@ -1,5 +1,5 @@
 0.4.1 (2025-10-08)
-=================
+==================
 
 Features
 --------
diff --git a/docs/changelog/0.4.2.rst b/docs/changelog/0.4.2.rst
@@ -1,5 +1,5 @@
 0.4.2 (2025-10-29)
-=================
+==================
 
 Bug fixes
 ---------
diff --git a/docs/changelog/0.4.3.rst b/docs/changelog/0.4.3.rst
@@ -0,0 +1,14 @@
+0.4.3 (2025-11-21)
+==================
+
+Features
+---------
+
+- ``server.session.enabled`` is now ``False`` by default. Session middleware is required only by ``KeycloakAuthProvider``, which is not used by default.
+
+Bug Fixes
+---------
+
+- Escape unprintable ASCII symbols in SQL queries before storing them in Postgres. Previously, saving queries containing the ``\x00`` symbol led to exceptions.
+- Kafka topic with malformed messages doesn't have to use the same number of partitions as input topics.
+- Ignore input statistics from OpenLineage events which claim to read 8 exabytes of data; this is actually a Spark quirk.
diff --git a/docs/changelog/next_release/+.feature.rst b/docs/changelog/next_release/+.feature.rst
@@ -0,0 +1 @@
+Allow disabling ``SessionMiddleware``, as it is only required by ``KeycloakAuthProvider``.
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,7 +31,7 @@ keywords = ["Lineage", "FastAPI", "REST", "FastStream"]
 requires-python = ">=3.12"
 dependencies = [
   "pydantic~=2.12.3",
-  "pydantic-settings~=2.11.0",
+  "pydantic-settings~=2.12.0",
   "typing-extensions~=4.15.0",
   "alembic~=1.17.1",
   "sqlalchemy~=2.0.41",
@@ -64,9 +64,9 @@ exclude = ["docs", "tests"]
 
 [project.optional-dependencies]
 server = [
-  "fastapi~=0.119.1",
-  "starlette~=0.48.0",
-  "uvicorn~=0.37.0",
+  "fastapi~=0.121.3",
+  "starlette~=0.49.3",
+  "uvicorn~=0.38.0",
   "starlette-exporter~=0.23.0",
   "asgi-correlation-id~=4.3.4",
   "pyjwt~=2.10.1 ",
@@ -79,9 +79,9 @@ consumer = [
   "cramjam~=2.11.0",
 ]
 http2kafka = [
-  "fastapi~=0.119.1",
-  "starlette~=0.48.0",
-  "uvicorn~=0.37.0",
+  "fastapi~=0.121.3",
+  "starlette~=0.49.3",
+  "uvicorn~=0.38.0",
   "starlette-exporter~=0.23.0",
   "asgi-correlation-id~=4.3.4",
   "faststream[kafka,cli]~=0.6.0rc2",
@@ -94,16 +94,16 @@ gssapi = [
   "gssapi~=1.10.0",
 ]
 seed = [
-  "faker~=37.11.0"
+  "faker~=38.2.0"
 ]
 
 [dependency-groups]
 test =[
-  "pytest~=8.4.1",
+  "pytest~=9.0.1",
   "httpx~=0.28.1",
-  "pytest-asyncio~=1.2.0",
+  "pytest-asyncio~=1.3.0",
   "pytest-randomly~=4.0.0",
-  "pytest-deadfixtures~=2.2.1",
+  "pytest-deadfixtures~=3.0.0",
   "pytest-rerunfailures~=16.1",
   "coverage~=7.11.0",
   "psycopg2-binary~=2.9.10",
@@ -113,7 +113,7 @@ test =[
   "gevent~=25.9.1",
 ]
 dev = [
-  "pre-commit~=4.3.0",
+  "pre-commit~=4.4.0",
   "mypy~=1.18.2",
   "sqlalchemy[mypy]~=2.0.41",
   "types-pyyaml~=6.0.12",
diff --git a/tests/test_consumer/test_extractors/test_extractors_operation_spark.py b/tests/test_consumer/test_extractors/test_extractors_operation_spark.py
diff --git a/uv.lock b/uv.lock