|
1 | 1 | # SPDX-FileCopyrightText: 2024-2025 MTS PJSC |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
4 | | -from pydantic import Field |
| 4 | +from pydantic import Field, PositiveInt, field_validator |
5 | 5 |
|
6 | 6 | from data_rentgen.openlineage.dataset_facets.base import ( |
7 | 7 | OpenLineageInputDatasetFacet, |
8 | 8 | ) |
9 | 9 |
|
# Largest signed 64-bit integer (2**63 - 1). The statistics validator in this
# module treats any value at or above this bound as "unknown" rather than real.
MAX_LONG = (1 << 63) - 1
| 11 | + |
10 | 12 |
|
class OpenLineageInputStatisticsInputDatasetFacet(OpenLineageInputDatasetFacet):
    """Dataset facet describing Input statistics.
    See [InputStatisticsInputDatasetFacet](https://github.com/OpenLineage/OpenLineage/blob/main/spec/facets/InputStatisticsInputDatasetFacet.json).

    All three statistics are optional. Negative values are rejected, zero is
    accepted (e.g. an empty input with no files), and values at or above
    ``MAX_LONG`` are replaced with ``None`` by :meth:`value_must_be_sane`.
    """

    # ``ge=0`` instead of ``PositiveInt``: PositiveInt requires a value > 0 and
    # would fail validation for a legitimate zero (see the ``fileCount`` example).
    rows: int | None = Field(default=None, ge=0, alias="rowCount", examples=[1_000_000])
    bytes: int | None = Field(default=None, ge=0, alias="size", examples=[2**30])
    files: int | None = Field(default=None, ge=0, alias="fileCount", examples=[0])

    @field_validator("bytes", "rows", "files", mode="after")
    @classmethod
    def value_must_be_sane(cls, value: int | None) -> int | None:
        """Drop placeholder statistics that are too large to be real.

        Args:
            value: Already-validated statistic, or ``None`` if it was not set.

        Returns:
            ``None`` when ``value`` is at or above ``MAX_LONG``; otherwise
            ``value`` unchanged (including ``0`` and ``None``).
        """
        if value is not None and value >= MAX_LONG:
            # NOTE(review): per the links below, Spark appears to use Long.MaxValue
            # as the default size when the real one is unknown - confirm.
            # https://github.com/apache/spark/blob/v3.5.7/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L2565
            # https://github.com/apache/spark/blob/v3.5.7/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala#L209
            return None
        return value
0 commit comments