From d1c66d91823f921b7f4365f6ab68c6f47162a170 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Wed, 10 Dec 2025 13:29:14 -0500 Subject: [PATCH 1/2] Fix test --- docs/dqx/docs/reference/benchmarks.mdx | 8 + docs/dqx/docs/reference/quality_checks.mdx | 159 ++++++++- src/databricks/labs/dqx/geo/check_funcs.py | 339 +++++++++++++++++++- tests/integration/test_apply_checks.py | 48 +++ tests/integration/test_profiler_workflow.py | 2 +- tests/integration/test_row_checks_geo.py | 330 +++++++++++++++++++ tests/perf/.benchmarks/baseline.json | 280 ++++++++++++++++ tests/perf/test_apply_checks.py | 120 +++++++ tests/resources/all_row_geo_checks.yaml | 64 ++++ 9 files changed, 1346 insertions(+), 4 deletions(-) diff --git a/docs/dqx/docs/reference/benchmarks.mdx b/docs/dqx/docs/reference/benchmarks.mdx index 653d48a90..64ad6b221 100644 --- a/docs/dqx/docs/reference/benchmarks.mdx +++ b/docs/dqx/docs/reference/benchmarks.mdx @@ -66,6 +66,10 @@ sidebar_position: 13 | test_benchmark_is_aggr_not_equal | 0.296462 | 0.296800 | 0.275119 | 0.312035 | 0.013498 | 0.013448 | 0.291054 | 0.304502 | 5 | 0 | 2 | 3.37 | | test_benchmark_is_aggr_not_greater_than | 0.307771 | 0.315185 | 0.277924 | 0.316280 | 0.016705 | 0.010701 | 0.304974 | 0.315675 | 5 | 1 | 1 | 3.25 | | test_benchmark_is_aggr_not_less_than | 0.296828 | 0.303167 | 0.276023 | 0.314350 | 0.018030 | 0.033665 | 0.278439 | 0.312105 | 5 | 0 | 1 | 3.37 | +| test_benchmark_is_area_equal_to | 0.209381 | 0.207647 | 0.205255 | 0.216179 | 0.004471 | 0.006593 | 0.206066 | 0.212659 | 5 | 0 | 1 | 4.78 | +| test_benchmark_is_area_not_equal_to | 0.208875 | 0.207436 | 0.203626 | 0.217694 | 0.005257 | 0.004513 | 0.206265 | 0.210778 | 5 | 1 | 1 | 4.79 | +| test_benchmark_is_area_not_greater_than | 0.177230 | 0.179352 | 0.161536 | 0.190875 | 0.010356 | 0.013261 | 0.169503 | 0.182763 | 6 | 0 | 2 | 5.64 | +| test_benchmark_is_area_not_less_than | 0.171868 | 0.166867 | 0.161877 | 0.204155 | 0.015957 | 0.003194 | 0.164123 | 0.167316 | 6 | 1 | 1 | 5.82 | | test_benchmark_is_data_fresh | 0.279160 | 0.235545 | 0.231767 | 0.430390 | 0.085563 | 0.072198 | 0.233457 | 0.305655 | 5 | 1 | 1 | 3.58 | | test_benchmark_is_data_fresh_per_time_window | 0.259995 | 0.246444 | 0.242483 | 0.291510 | 0.022543 | 0.037112 | 0.243019 | 0.280132 | 5 | 0 | 1 | 3.85 | | test_benchmark_is_equal_to | 0.241270 | 0.241646 | 0.226919 | 0.248632 | 0.008799 | 0.010992 | 0.237380 | 0.248371 | 5 | 0 | 1 | 4.14 | @@ -129,6 +133,10 @@ sidebar_position: 13 | test_benchmark_is_null_or_empty[col6] | 0.271883 | 0.288303 | 0.233084 | 0.291877 | 0.025875 | 0.038427 | 0.251213 | 0.289639 | 5 | 0 | 1 | 3.68 | | test_benchmark_is_null_or_empty[col7] | 0.255479 | 0.255281 | 0.230887 | 0.289014 | 0.022139 | 0.029264 | 0.238686 | 0.267950 | 5 | 0 | 2 | 3.91 | | test_benchmark_is_null_or_empty[col8] | 0.219256 | 0.217609 | 0.214083 | 0.226223 | 0.005137 | 0.008602 | 0.215124 | 0.223726 | 5 | 0 | 2 | 4.56 | +| test_benchmark_is_num_points_equal_to | 0.213472 | 0.208326 | 0.200840 | 0.228556 | 0.011595 | 0.018574 | 0.205502 | 0.224076 | 5 | 0 | 2 | 4.68 | +| test_benchmark_is_num_points_not_equal_to | 0.211439 | 0.212084 | 0.200625 | 0.223375 | 0.008900 | 0.013585 | 0.204124 | 0.217709 | 5 | 0 | 2 | 4.73 | +| test_benchmark_is_num_points_not_greater_than | 0.162069 | 0.161908 | 0.149400 | 0.178192 | 0.010833 | 0.014197 | 0.154168 | 0.168365 | 5 | 0 | 2 | 6.17 | +| test_benchmark_is_num_points_not_less_than | 0.159204 | 0.157405 | 0.151457 | 0.175503 | 0.008775 | 0.008935 | 0.152260 | 0.161195 | 6 | 1 | 1 | 6.28 | | 
test_benchmark_is_ogc_valid | 0.220708 | 0.223267 | 0.206378 | 0.235649 | 0.011210 | 0.015703 | 0.211743 | 0.227446 | 5 | 0 | 2 | 4.53 | | test_benchmark_is_older_than_col2_for_n_days | 0.235241 | 0.230978 | 0.224354 | 0.254865 | 0.011884 | 0.013734 | 0.227788 | 0.241522 | 5 | 0 | 1 | 4.25 | | test_benchmark_is_older_than_n_days | 0.246935 | 0.248889 | 0.234393 | 0.253353 | 0.007733 | 0.010372 | 0.242547 | 0.252920 | 5 | 0 | 1 | 4.05 | diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index 8131ebbce..ef2d2c408 100644 --- a/docs/dqx/docs/reference/quality_checks.mdx +++ b/docs/dqx/docs/reference/quality_checks.mdx @@ -69,7 +69,15 @@ You can also define your own custom checks in Python (see [Creating custom check | `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check | | `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | - +| `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | +| `is_area_not_less_than` | Checks if the areas of values in a geometry column are not less than a specified limit. By default, the 2D Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to transform the input values and compute areas with specific units of measure. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `value`: value to use in the condition as number, column name or sql expression; `srid`: optional integer SRID to use for computing the area of the geometry or geography value (default `3857`); `geodesic`: whether to use the 2D geodesic area (default `False`) | +| `is_area_not_greater_than` | Checks if the areas of values in a geometry column are not greater than a specified limit. By default, the 2D Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to transform the input values and compute areas with specific units of measure. This function requires Databricks serverless compute or runtime >= 17.1. 
| `column`: column to check (can be a string column name or a column expression); `value`: value to use in the condition as a number, column name or SQL expression; `srid`: optional integer SRID to use for computing the area of the geometry or geography value (default `3857`); `geodesic`: whether to use the 2D geodesic area (default `False`) |
+| `is_area_equal_to` | Checks if the areas of values in a geometry or geography column are equal to a specified value. By default, the 2D Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to transform the input values and compute areas with specific units of measure. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `value`: value to use in the condition as a number, column name or SQL expression; `srid`: optional integer SRID to use for computing the area of the geometry or geography value (default `3857`); `geodesic`: whether to use the 2D geodesic area (default `False`) |
+| `is_area_not_equal_to` | Checks if the areas of values in a geometry column are not equal to a specified value. By default, the 2D Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to transform the input values and compute areas with specific units of measure. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `value`: value to use in the condition as a number, column name or SQL expression; `srid`: optional integer SRID to use for computing the area of the geometry or geography value (default `3857`); `geodesic`: whether to use the 2D geodesic area (default `False`) |
+| `is_num_points_not_less_than` | Checks whether the values in the input column are geometries with number of coordinate pairs not less than the specified limit. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `value`: number of points value to compare against (can be a number, column name, or SQL expression) |
+| `is_num_points_not_greater_than` | Checks whether the values in the input column are geometries with number of coordinate pairs not greater than the specified limit. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `value`: number of points value to compare against (can be a number, column name, or SQL expression) |
+| `is_num_points_equal_to` | Checks whether the values in the input column are geometries with number of coordinate pairs equal to the specified limit. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `value`: number of points value to compare against (can be a number, column name, or SQL expression) |
+| `is_num_points_not_equal_to` | Checks whether the values in the input column are geometries with number of coordinate pairs not equal to the specified limit. This function requires Databricks serverless compute or runtime >= 17.1.
| `column`: column to check (can be a string column name or a column expression); `value`: number of points value to compare against (can be a number, column name, or SQL expression) | @@ -610,6 +618,80 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen column: polygon_geom min_value: 0.0 max_value: 10.0 + +# is_area_not_less_than check (geometry) +- criticality: error + check: + function: is_area_not_less_than + arguments: + column: polygon_geom + value: 100.0 + +# is_area_not_less_than check (geography with geodesic area) +- criticality: error + check: + function: is_area_not_less_than + arguments: + column: geography_geom + value: 1000000.0 # 1 million square meters + geodesic: true + +# is_area_not_greater_than check (geometry with SRID) +- criticality: error + check: + function: is_area_not_greater_than + arguments: + column: polygon_geom + value: 0.1 + srid: 3857 + +# is_area_equal_to check +- criticality: error + check: + function: is_area_equal_to + arguments: + column: polygon_geom + value: 1.0 + +# is_area_not_equal_to check +- criticality: error + check: + function: is_area_not_equal_to + arguments: + column: polygon_geom + value: 0.0 + +# is_num_points_not_less_than check +- criticality: error + check: + function: is_num_points_not_less_than + arguments: + column: polygon_geom + value: 10 + +# is_num_points_not_greater_than check +- criticality: error + check: + function: is_num_points_not_greater_than + arguments: + column: polygon_geom + value: 3 + +# is_num_points_equal_to check +- criticality: error + check: + function: is_num_points_equal_to + arguments: + column: polygon_geom + value: 4 + +# is_num_points_not_equal_to check +- criticality: error + check: + function: is_num_points_not_equal_to + arguments: + column: polygon_geom + value: 1 ``` @@ -1072,6 +1154,78 @@ checks = [ check_func_kwargs={"min_value": 0.0, "max_value": 10.0} ), + # is_area_not_less_than check (geometry) + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_not_less_than, + column="polygon_geom", + check_func_kwargs={"value": 100.0} + ), + + # is_area_not_less_than check (geography with geodesic area) + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_not_less_than, + column="geography_geom", + check_func_kwargs={"value": 1000000.0, "geodesic": True} + ), + + # is_area_not_greater_than check (geometry with SRID) + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_not_greater_than, + column="polygon_geom", + check_func_kwargs={"value": 0.1, "srid": 3857} + ), + + # is_area_equal_to check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_equal_to, + column="polygon_geom", + check_func_kwargs={"value": 1.0} + ), + + # is_area_not_equal_to check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_not_equal_to, + column="polygon_geom", + check_func_kwargs={"value": 0.0} + ), + + # is_num_points_not_less_than check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_num_points_not_less_than, + column="polygon_geom", + check_func_kwargs={"value": 10} + ), + + # is_num_points_not_greater_than check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_num_points_not_greater_than, + column="polygon_geom", + check_func_kwargs={"value": 3} + ), + + # is_num_points_equal_to check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_num_points_equal_to, + column="polygon_geom", + check_func_kwargs={"value": 4} + ), + + # 
is_num_points_not_equal_to check
+    DQRowRule(
+        criticality="error",
+        check_func=geo_check_funcs.is_num_points_not_equal_to,
+        column="polygon_geom",
+        check_func_kwargs={"value": 1}
+    ),
+
     # sql_expression check
     DQRowRule(
         criticality="error",
@@ -1602,6 +1756,7 @@ Complex data types are supported as well.
       group_by:
         - col3
       limit: 200
+
 # foreign_key check using reference DataFrame
 - criticality: error
   check:
@@ -2407,7 +2562,7 @@ Using non-curated aggregate functions is supported with the following limitation

 ## Creating Custom Row-level Checks

-### Using SQL ExpressionS
+### Using SQL Expressions

 You can define custom checks using SQL Expression rule (`sql_expression`).

diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py
index 612f7a717..5160e7d24 100644
--- a/src/databricks/labs/dqx/geo/check_funcs.py
+++ b/src/databricks/labs/dqx/geo/check_funcs.py
@@ -1,7 +1,11 @@
+from collections.abc import Callable
+import operator as py_operator
+
 from pyspark.sql import Column
 import pyspark.sql.functions as F
+
 from databricks.labs.dqx.rule import register_rule
-from databricks.labs.dqx.check_funcs import make_condition, _get_normalized_column_and_expr
+from databricks.labs.dqx.check_funcs import make_condition, _get_normalized_column_and_expr, _get_limit_expr

 POINT_TYPE = "ST_Point"
 LINESTRING_TYPE = "ST_LineString"
@@ -10,6 +14,7 @@ MULTILINESTRING_TYPE = "ST_MultiLineString"
 MULTIPOLYGON_TYPE = "ST_MultiPolygon"
 GEOMETRYCOLLECTION_TYPE = "ST_GeometryCollection"
+DEFAULT_SRID = 4326


 @register_rule("row")
@@ -448,3 +453,335 @@ def has_y_coordinate_between(column: str | Column, min_value: float, max_value:
         F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)),
         f"{col_str_norm}_has_y_coordinates_outside_range",
     )
+
+
+@register_rule("row")
+def is_area_equal_to(
+    column: str | Column, value: int | float | str | Column, srid: int | None = 3857, geodesic: bool = False
+) -> Column:
+    """
+    Checks if the areas of values in a geometry or geography column are equal to a specified value. By default, the 2D
+    Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to
+    transform the input values and compute areas with specific units of measure.
+
+    Args:
+        column: Column to check; can be a string column name or a column expression
+        value: Value to use in the condition as a number, column name or SQL expression
+        srid: Optional integer SRID to use for computing the area of the geometry or geography value (default `3857`).
+            If an SRID is provided, the input value is transformed and the area is calculated using the units of
+            measure of the specified coordinate reference system (e.g. meters squared for `srid=3857`).
+        geodesic: Whether to use the 2D geodesic area (default `False`).
+
+    Returns:
+        Column object indicating whether the areas of the geometries in the input column are equal to the provided
+        value
+
+    Note:
+        This function requires Databricks serverless compute or runtime 17.1 or above.
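+
+    Example (illustrative sketch; assumes a DataFrame `df` with a WKT geometry column `geom`):
+        >>> df.select(is_area_equal_to("geom", 1.0, srid=4326).alias("check"))
+        # rows whose computed area differs from 1.0 get a failure message; matching rows yield null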
+ """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_area", + spatial_quantity_label="area", + spatial_quantity_name="area", + compare_op=py_operator.ne, + compare_op_label="not equal to", + compare_op_name="not_equal_to", + srid=srid, + geodesic=geodesic, + ) + + +@register_rule("row") +def is_area_not_equal_to( + column: str | Column, value: int | float | str | Column, srid: int | None = 3857, geodesic: bool = False +) -> Column: + """ + Checks if the areas of values in a geometry column are not equal to a specified value. By default, the 2D + Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to + transform the input values and compute areas with specific units of measure. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + srid: Optional integer SRID to use for computing the area of the geometry or geography value (default `None`). + If an SRID is provided, the input value is translated and area is calculated using the units of measure of + the specified coordinate reference system (e.g. meters squared for `srid=3857`). + geodesic: Whether to use the 2D geodesic area (default `False`). + + Returns: + Column object indicating whether the area the geometries in the input column are not equal to the provided value + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. + """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_area", + spatial_quantity_label="area", + spatial_quantity_name="area", + compare_op=py_operator.eq, + compare_op_label="equal to", + compare_op_name="equal_to", + srid=srid, + geodesic=geodesic, + ) + + +@register_rule("row") +def is_area_not_greater_than( + column: str | Column, value: int | float | str | Column, srid: int | None = 3857, geodesic: bool = False +) -> Column: + """ + Checks if the areas of values in a geometry column are not greater than a specified limit. By default, the 2D + Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to + transform the input values and compute areas with specific units of measure. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + srid: Optional integer SRID to use for computing the area of the geometry or geography value (default `None`). + If an SRID is provided, the input value is translated and area is calculated using the units of measure of + the specified coordinate reference system (e.g. meters squared for `srid=3857`). + geodesic: Whether to use the 2D geodesic area (default `False`). + + Returns: + Column object indicating whether the area the geometries in the input column is greater than the provided value + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. 
+ """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_area", + spatial_quantity_label="area", + spatial_quantity_name="area", + compare_op=py_operator.gt, + compare_op_label="greater than", + compare_op_name="greater_than", + srid=srid, + geodesic=geodesic, + ) + + +@register_rule("row") +def is_area_not_less_than( + column: str | Column, value: int | float | str | Column, srid: int | None = 3857, geodesic: bool = False +) -> Column: + """ + Checks if the areas of values in a geometry column are not less than a specified limit. By default, the 2D + Cartesian area in WGS84 (Pseudo-Mercator) with units of meters squared is used. An SRID can be specified to + transform the input values and compute areas with specific units of measure. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + srid: Optional integer SRID to use for computing the area of the geometry or geography value (default `None`). + If an SRID is provided, the input value is translated and area is calculated using the units of measure of + the specified coordinate reference system (e.g. meters squared for `srid=3857`). + geodesic: Whether to use the 2D geodesic area (default `False`). + + Returns: + Column object indicating whether the area the geometries in the input column is less than the provided value + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. + """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_area", + spatial_quantity_label="area", + spatial_quantity_name="area", + compare_op=py_operator.lt, + compare_op_label="less than", + compare_op_name="less_than", + srid=srid, + geodesic=geodesic, + ) + + +@register_rule("row") +def is_num_points_equal_to(column: str | Column, value: int | float | str | Column) -> Column: + """ + Checks if the number of coordinate pairs in values of a geometry column is equal to a specified value. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + + Returns: + Column object indicating whether the number of coordinate pairs in the geometries of the input column is + equal to the provided value + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. + """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_npoints", + spatial_quantity_label="number of coordinates", + spatial_quantity_name="num_points", + compare_op=py_operator.ne, + compare_op_label="not equal to", + compare_op_name="not_equal_to", + ) + + +@register_rule("row") +def is_num_points_not_equal_to(column: str | Column, value: int | float | str | Column) -> Column: + """ + Checks if the number of coordinate pairs in values of a geometry column is not equal to a specified value. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + + Returns: + Column object indicating whether the number of coordinate pairs in the geometries of the input column is not + equal to the provided value + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. 
+ """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_npoints", + spatial_quantity_label="number of coordinates", + spatial_quantity_name="num_points", + compare_op=py_operator.eq, + compare_op_label="equal to", + compare_op_name="equal_to", + ) + + +@register_rule("row") +def is_num_points_not_greater_than(column: str | Column, value: int | float | str | Column) -> Column: + """ + Checks if the number of coordinate pairs in the values of a geometry column is not greater than a specified limit. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + + Returns: + Column object indicating whether the number of coordinate pairs in the geometries of the input column is + greater than the provided value + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. + """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_npoints", + spatial_quantity_label="number of coordinates", + spatial_quantity_name="num_points", + compare_op=py_operator.gt, + compare_op_label="greater than", + compare_op_name="greater_than", + ) + + +@register_rule("row") +def is_num_points_not_less_than(column: str | Column, value: int | float | str | Column) -> Column: + """ + Checks if the number of coordinate pairs in values of a geometry column is not less than a specified limit. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + + Returns: + Column object indicating whether the number of coordinate pairs in the geometries of the input column is + less than the provided value + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. + """ + return _compare_sql_function_result( + column, + value, + spatial_function="st_npoints", + spatial_quantity_label="number of coordinates", + spatial_quantity_name="num_points", + compare_op=py_operator.lt, + compare_op_label="less than", + compare_op_name="less_than", + ) + + +def _compare_sql_function_result( + column: str | Column, + value: int | float | str | Column, + spatial_function: str, + spatial_quantity_label: str, + spatial_quantity_name: str, + compare_op: Callable[[Column, Column], Column], + compare_op_label: str, + compare_op_name: str, + srid: int | None = None, + geodesic: bool = False, +) -> Column: + """ + Compares the results from applying a spatial SQL function (e.g. `st_area`) on a geometry column against a limit + using the specified comparison operator. + + Args: + column: Column to check; can be a string column name or a column expression + value: Value to use in the condition as number, column name or sql expression + spatial_function: Spatial SQL function as a string (e.g. `st_npoints`) + spatial_quantity_label: Spatial quantity label (e.g. `number of coordinates` ) + spatial_quantity_name: Spatial quantity identifier (e.g. `num_points`) + compare_op: Comparison operator (e.g., `operator.gt`, `operator.lt`). + compare_op_label: Human-readable label for the comparison (e.g., 'greater than'). + compare_op_name: Name identifier for the comparison (e.g., 'greater_than'). + srid: Optional integer SRID for computing measurements on the converted geometry or geography value (default `None`). + geodesic: Whether to convert the input column to a geography type for computing geodesic distances. 
+
+    Returns:
+        Column object indicating whether the result of applying the spatial function to the geometries in the input
+        column matches the comparison condition against the provided limit
+
+    Note:
+        This function requires Databricks serverless compute or runtime 17.1 or above.
+    """
+    col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
+    value_expr = _get_limit_expr(value)
+    # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in
+    # Databricks SQL, due to the use of `try_to_geometry`/`try_to_geography` and spatial measurement
+    # functions such as `st_area` and `st_npoints`.
+    if geodesic:
+        spatial_conversion_expr = f"try_to_geography({col_str_norm})"
+        spatial_data_type = "geography"
+    elif srid:
+        spatial_conversion_expr = f"st_transform(st_setsrid(try_to_geometry({col_str_norm}), {DEFAULT_SRID}), {srid})"
+        spatial_data_type = "geometry"
+    else:
+        spatial_conversion_expr = f"try_to_geometry({col_str_norm})"
+        spatial_data_type = "geometry"
+
+    is_valid_cond = F.expr(f"{spatial_conversion_expr} IS NULL")
+    is_valid_message = F.concat_ws(
+        "",
+        F.lit("value `"),
+        col_expr.cast("string"),
+        F.lit(f"` in column `{col_expr_str}` is not a valid {spatial_data_type}"),
+    )
+    compare_cond = compare_op(F.expr(f"{spatial_function}({spatial_conversion_expr})"), value_expr)
+    compare_message = F.concat_ws(
+        "",
+        F.lit("value `"),
+        col_expr.cast("string"),
+        F.lit(f"` in column `{col_expr_str}` has {spatial_quantity_label} {compare_op_label} value: "),
+        value_expr.cast("string"),
+    )
+    condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(is_valid_cond | compare_cond)
+
+    return make_condition(
+        condition,
+        F.when(is_valid_cond, is_valid_message).otherwise(compare_message),
+        f"{col_str_norm}_{spatial_quantity_name}_{compare_op_name}_limit",
+    )
diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py
index 764e7120d..0c09edb8d 100644
--- a/tests/integration/test_apply_checks.py
+++ b/tests/integration/test_apply_checks.py
@@ -6623,6 +6623,54 @@ def test_apply_checks_all_geo_checks_using_classes(skip_if_runtime_not_geo_compa
             column=F.col("polygon_geom"),
             check_func_kwargs={"min_value": 0.0, "max_value": 10.0},
         ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_area_not_less_than,
+            column=F.col("polygon_geom"),
+            check_func_kwargs={"value": 0.0},
+        ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_area_not_greater_than,
+            column=F.col("point_geom"),
+            check_func_kwargs={"value": 1.0},
+        ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_area_equal_to,
+            column=F.col("point_geom"),
+            check_func_kwargs={"value": 0.0},
+        ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_area_not_equal_to,
+            column=F.col("polygon_geom"),
+            check_func_kwargs={"value": 0.0},
+        ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_num_points_not_less_than,
+            column=F.col("polygon_geom"),
+            check_func_kwargs={"value": 2},
+        ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_num_points_not_greater_than,
+            column=F.col("point_geom"),
+            check_func_kwargs={"value": 2},
+        ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_num_points_equal_to,
+            column=F.col("point_geom"),
+            check_func_kwargs={"value": 1},
+        ),
+        DQRowRule(
+            criticality="error",
+            check_func=geo_check_funcs.is_num_points_not_equal_to,
+            column=F.col("polygon_geom"),
+            check_func_kwargs={"value": 2},
+        ),
     ]

     dq_engine = DQEngine(ws)
diff --git a/tests/integration/test_profiler_workflow.py b/tests/integration/test_profiler_workflow.py
index 13fa9149a..65059eb00 100644 --- a/tests/integration/test_profiler_workflow.py +++ b/tests/integration/test_profiler_workflow.py @@ -440,7 +440,7 @@ def test_profiler_workflow_with_ai_rules_generation_with_custom_funcs(ws, spark, break expected_ai_generated_check = { - 'check': {'arguments': {'column': 'name', 'suffix': "'c'"}, 'function': 'not_ends_with_suffix'}, + 'check': {'arguments': {'column': 'name', 'suffix': 'c'}, 'function': 'not_ends_with_suffix'}, 'criticality': 'error', } diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index a12ebcebe..1bdcaec1b 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -1,5 +1,13 @@ from chispa.dataframe_comparer import assert_df_equality # type: ignore from databricks.labs.dqx.geo.check_funcs import ( + is_area_equal_to, + is_area_not_equal_to, + is_area_not_less_than, + is_area_not_greater_than, + is_num_points_equal_to, + is_num_points_not_equal_to, + is_num_points_not_less_than, + is_num_points_not_greater_than, has_dimension, has_x_coordinate_between, has_y_coordinate_between, @@ -398,3 +406,325 @@ def test_has_y_coordinate_between(skip_if_runtime_not_geo_compatible, spark): ) assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_area_equal_to(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom, geog FROM VALUES + ('POINT(0 0)', 'POINT(0 0)'), + ('POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))', 'POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))'), + ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))', 'POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))'), + ('invalid-geometry', 'invalid-geography'), + (null, null) + AS data(geom, geog) + """ + ) + + actual = test_df.select( + is_area_equal_to("geom", 0.0).alias("basic_geometry"), + is_area_equal_to("geom", 1.0, srid=4326).alias("geometry_srid"), + is_area_equal_to("geog", 0.0, geodesic=True).alias("geography_geodesic"), + ) + + checked_schema = "basic_geometry: string, geometry_srid: string, geography_geodesic: string" + expected = spark.createDataFrame( + [ + [None, "value `POINT(0 0)` in column `geom` has area not equal to value: 1.0", None], + [ + "value `POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))` in column `geom` has area not equal to value: 0.0", + "value `POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))` in column `geom` has area not equal to value: 1.0", + "value `POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))` in column `geog` has area not equal to value: 0.0", + ], + [ + "value `POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))` in column `geom` has area not equal to value: 0.0", + None, # area should = 1 + "value `POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))` in column `geog` has area not equal to value: 0.0", + ], + [ + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geography` in column `geog` is not a valid geography", + ], + [None, None, None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_area_not_equal_to(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom, geog FROM VALUES + ('POINT(0 0)', 'POINT(0 0)'), + ('POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))', 'POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))'), + ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))', 'POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))'), + 
('invalid-geometry', 'invalid-geography'), + (null, null) + AS data(geom, geog) + """ + ) + + actual = test_df.select( + is_area_not_equal_to("geom", 0.0).alias("basic_geometry"), + is_area_not_equal_to("geom", 1.0, srid=4326).alias("geometry_srid"), + is_area_not_equal_to("geog", 0.0, geodesic=True).alias("geography_geodesic"), + ) + + checked_schema = "basic_geometry: string, geometry_srid: string, geography_geodesic: string" + expected = spark.createDataFrame( + [ + [ + "value `POINT(0 0)` in column `geom` has area equal to value: 0.0", + None, + "value `POINT(0 0)` in column `geog` has area equal to value: 0.0", + ], + [ + None, + None, + None, + ], + [ + None, + "value `POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))` in column `geom` has area equal to value: 1.0", + None, + ], + [ + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geography` in column `geog` is not a valid geography", + ], + [None, None, None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_area_not_greater_than(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom, geog FROM VALUES + ('POINT(0 0)', 'POINT(0 0)'), + ('POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))', 'POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))'), + ('POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))', 'POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))'), + ('invalid-geometry', 'invalid-geography'), + (null, null) + AS data(geom, geog) + """ + ) + + actual = test_df.select( + is_area_not_greater_than("geom", 20000.0).alias("basic_geometry"), + is_area_not_greater_than("geom", 1.0, srid=4326).alias("geometry_srid"), + is_area_not_greater_than("geog", 1000.0, geodesic=True).alias("geography_geodesic"), + ) + + checked_schema = "basic_geometry: string, geometry_srid: string, geography_geodesic: string" + expected = spark.createDataFrame( + [ + [None, None, None], + [ + None, + None, + "value `POLYGON((0 0, 0.001 0, 0.001 0.001, 0 0.001, 0 0))` in column `geog` has area greater than value: 1000.0", + ], + [ + "value `POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))` in column `geom` has area greater than value: 20000.0", + None, + "value `POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))` in column `geog` has area greater than value: 1000.0", + ], + [ + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geography` in column `geog` is not a valid geography", + ], + [None, None, None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_area_not_less_than(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom, geog FROM VALUES + ('POINT(0 0)', 'POINT(0 0)'), + ('POLYGON((0 0, 0.0001 0, 0.0001 0.0001, 0 0.0001, 0 0))', 'POLYGON((0 0, 0.0001 0, 0.0001 0.0001, 0 0.0001, 0 0))'), + ('POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))', 'POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))'), + ('invalid-geometry', 'invalid-geography'), + (null, null) + AS data(geom, geog) + """ + ) + + actual = test_df.select( + is_area_not_less_than("geom", 20000.0).alias("basic_geometry"), + is_area_not_less_than("geom", 1.0, srid=4326).alias("geometry_srid"), + is_area_not_less_than("geog", 20000.0, geodesic=True).alias("geography_geodesic"), + ) + + checked_schema = "basic_geometry: string, geometry_srid: 
string, geography_geodesic: string" + expected = spark.createDataFrame( + [ + [ + "value `POINT(0 0)` in column `geom` has area less than value: 20000.0", + "value `POINT(0 0)` in column `geom` has area less than value: 1.0", + "value `POINT(0 0)` in column `geog` has area less than value: 20000.0", + ], + [ + "value `POLYGON((0 0, 0.0001 0, 0.0001 0.0001, 0 0.0001, 0 0))` in column `geom` has area less than value: 20000.0", + "value `POLYGON((0 0, 0.0001 0, 0.0001 0.0001, 0 0.0001, 0 0))` in column `geom` has area less than value: 1.0", + "value `POLYGON((0 0, 0.0001 0, 0.0001 0.0001, 0 0.0001, 0 0))` in column `geog` has area less than value: 20000.0", + ], + [ + None, + "value `POLYGON((0 0, 0.01 0, 0.01 0.01, 0 0.01, 0 0))` in column `geom` has area less than value: 1.0", + None, + ], + [ + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geometry` in column `geom` is not a valid geometry", + "value `invalid-geography` in column `geog` is not a valid geography", + ], + [None, None, None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_num_points_equal_to(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom FROM VALUES + ('POINT(0 0)'), -- 1 point + ('LINESTRING(0 0, 1 1)'), -- 2 points + ('LINESTRING(0 0, 1 1, 2 2)'), -- 3 points + ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'), -- 5 points (including closing point) + ('invalid-geometry'), -- Invalid geometry + (null) -- Null geometry + AS data(geom) + """ + ) + + actual = test_df.select(is_num_points_equal_to("geom", 5)) + + checked_schema = "geom_num_points_not_equal_to_limit: string" + expected = spark.createDataFrame( + [ + ["value `POINT(0 0)` in column `geom` has number of coordinates not equal to value: 5"], + ["value `LINESTRING(0 0, 1 1)` in column `geom` has number of coordinates not equal to value: 5"], + ["value `LINESTRING(0 0, 1 1, 2 2)` in column `geom` has number of coordinates not equal to value: 5"], + [None], + ["value `invalid-geometry` in column `geom` is not a valid geometry"], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_num_points_not_equal_to(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom FROM VALUES + ('POINT(0 0)'), -- 1 point + ('LINESTRING(0 0, 1 1)'), -- 2 points + ('LINESTRING(0 0, 1 1, 2 2)'), -- 3 points + ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'), -- 5 points (including closing point) + ('invalid-geometry'), -- Invalid geometry + (null) -- Null geometry + AS data(geom) + """ + ) + + actual = test_df.select(is_num_points_not_equal_to("geom", 1)) + + checked_schema = "geom_num_points_equal_to_limit: string" + expected = spark.createDataFrame( + [ + ["value `POINT(0 0)` in column `geom` has number of coordinates equal to value: 1"], + [None], + [None], + [None], + ["value `invalid-geometry` in column `geom` is not a valid geometry"], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_num_points_not_greater_than(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom FROM VALUES + ('POINT(0 0)'), -- 1 point + ('LINESTRING(0 0, 1 1)'), -- 2 points + ('LINESTRING(0 0, 1 1, 2 2)'), -- 3 points + ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'), -- 5 points (including closing point) + ('invalid-geometry'), -- Invalid geometry + (null) -- Null geometry + AS data(geom) + """ + ) + + actual = 
test_df.select(is_num_points_not_greater_than("geom", 3)) + + checked_schema = "geom_num_points_greater_than_limit: string" + expected = spark.createDataFrame( + [ + [None], + [None], + [None], + [ + "value `POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))` in column `geom` has number of coordinates greater than value: 3" + ], + ["value `invalid-geometry` in column `geom` is not a valid geometry"], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_num_points_not_less_than(skip_if_runtime_not_geo_compatible, spark): + test_df = spark.sql( + """ + SELECT geom FROM VALUES + ('POINT(0 0)'), -- 1 point + ('LINESTRING(0 0, 1 1)'), -- 2 points + ('LINESTRING(0 0, 1 1, 2 2)'), -- 3 points + ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'), -- 5 points (including closing point) + ('invalid-geometry'), -- Invalid geometry + (null) -- Null geometry + AS data(geom) + """ + ) + + actual = test_df.select(is_num_points_not_less_than("geom", 3)) + + checked_schema = "geom_num_points_less_than_limit: string" + expected = spark.createDataFrame( + [ + ["value `POINT(0 0)` in column `geom` has number of coordinates less than value: 3"], + ["value `LINESTRING(0 0, 1 1)` in column `geom` has number of coordinates less than value: 3"], + [None], + [None], + ["value `invalid-geometry` in column `geom` is not a valid geometry"], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) diff --git a/tests/perf/.benchmarks/baseline.json b/tests/perf/.benchmarks/baseline.json index 61d4c963f..afd7ad299 100644 --- a/tests/perf/.benchmarks/baseline.json +++ b/tests/perf/.benchmarks/baseline.json @@ -2001,6 +2001,146 @@ "iterations": 1 } }, + { + "group": null, + "name": "test_benchmark_is_area_equal_to", + "fullname": "tests/perf/test_apply_checks.py::test_benchmark_is_area_equal_to", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.20525457300027483, + "max": 0.21617943899991587, + "mean": 0.2093807285999901, + "stddev": 0.004470728750156033, + "rounds": 5, + "median": 0.20764684200003103, + "iqr": 0.00659282475010059, + "q1": 0.20606638499987184, + "q3": 0.21265920974997243, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 0.20525457300027483, + "hd15iqr": 0.21617943899991587, + "ops": 4.775988729652588, + "total": 1.0469036429999505, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_benchmark_is_area_not_equal_to", + "fullname": "tests/perf/test_apply_checks.py::test_benchmark_is_area_not_equal_to", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.20362593599975298, + "max": 0.21769380999967325, + "mean": 0.20887473519987906, + "stddev": 0.005257130828880793, + "rounds": 5, + "median": 0.2074357989999953, + "iqr": 0.004513020249987676, + "q1": 0.20626525724992462, + "q3": 0.2107782774999123, + "iqr_outliers": 1, + "stddev_outliers": 1, + "outliers": "1;1", + "ld15iqr": 0.20362593599975298, + "hd15iqr": 0.21769380999967325, + "ops": 4.787558433254588, + "total": 1.0443736759993953, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_benchmark_is_area_not_greater_than", + "fullname": 
"tests/perf/test_apply_checks.py::test_benchmark_is_area_not_greater_than", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.16153616899964618, + "max": 0.19087516699983098, + "mean": 0.17723032849986944, + "stddev": 0.010356076836936949, + "rounds": 6, + "median": 0.17935232600007112, + "iqr": 0.013260618999993312, + "q1": 0.169502681999802, + "q3": 0.18276330099979532, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 0.16153616899964618, + "hd15iqr": 0.19087516699983098, + "ops": 5.642375142360223, + "total": 1.0633819709992167, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_benchmark_is_area_not_less_than", + "fullname": "tests/perf/test_apply_checks.py::test_benchmark_is_area_not_less_than", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.1618769249998877, + "max": 0.20415533100003813, + "mean": 0.17186761033334128, + "stddev": 0.01595667793818366, + "rounds": 6, + "median": 0.16686740499994812, + "iqr": 0.0031935699998939526, + "q1": 0.16412251300016578, + "q3": 0.16731608300005973, + "iqr_outliers": 1, + "stddev_outliers": 1, + "outliers": "1;1", + "ld15iqr": 0.1618769249998877, + "hd15iqr": 0.20415533100003813, + "ops": 5.818431978314451, + "total": 1.0312056620000476, + "iterations": 1 + } + }, { "group": null, "name": "test_benchmark_is_data_fresh", @@ -4290,6 +4430,146 @@ "iterations": 1 } }, + { + "group": null, + "name": "test_benchmark_is_num_points_equal_to", + "fullname": "tests/perf/test_apply_checks.py::test_benchmark_is_num_points_equal_to", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.20084046800002398, + "max": 0.22855648599988854, + "mean": 0.21347243539985356, + "stddev": 0.011594577586789999, + "rounds": 5, + "median": 0.20832619199973124, + "iqr": 0.018573520249901776, + "q1": 0.2055024957498972, + "q3": 0.22407601599979898, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 0.20084046800002398, + "hd15iqr": 0.22855648599988854, + "ops": 4.684445549735298, + "total": 1.0673621769992678, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_benchmark_is_num_points_not_equal_to", + "fullname": "tests/perf/test_apply_checks.py::test_benchmark_is_num_points_not_equal_to", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.20062481700006174, + "max": 0.2233745620001173, + "mean": 0.21143883220001952, + "stddev": 0.008899789581330971, + "rounds": 5, + "median": 0.21208434699974532, + "iqr": 0.013585061500407392, + "q1": 0.20412380474988367, + "q3": 0.21770886625029107, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 0.20062481700006174, + "hd15iqr": 0.2233745620001173, + "ops": 4.72950020388879, + "total": 1.0571941610000977, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_benchmark_is_num_points_not_greater_than", + "fullname": 
"tests/perf/test_apply_checks.py::test_benchmark_is_num_points_not_greater_than", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.14940012700026273, + "max": 0.17819232800002283, + "mean": 0.1620693742002004, + "stddev": 0.010833428460170698, + "rounds": 5, + "median": 0.16190785100025096, + "iqr": 0.014197435000028236, + "q1": 0.15416780125019613, + "q3": 0.16836523625022437, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 0.14940012700026273, + "hd15iqr": 0.17819232800002283, + "ops": 6.170197206812954, + "total": 0.810346871001002, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_benchmark_is_num_points_not_less_than", + "fullname": "tests/perf/test_apply_checks.py::test_benchmark_is_num_points_not_less_than", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.15145704199994725, + "max": 0.17550262499980818, + "mean": 0.15920399733317936, + "stddev": 0.008775469818521377, + "rounds": 6, + "median": 0.15740465699991546, + "iqr": 0.008935045000271202, + "q1": 0.1522599789996093, + "q3": 0.1611950239998805, + "iqr_outliers": 1, + "stddev_outliers": 1, + "outliers": "1;1", + "ld15iqr": 0.15145704199994725, + "hd15iqr": 0.17550262499980818, + "ops": 6.281249320060836, + "total": 0.9552239839990762, + "iterations": 1 + } + }, { "group": null, "name": "test_benchmark_is_ogc_valid", diff --git a/tests/perf/test_apply_checks.py b/tests/perf/test_apply_checks.py index 4785d8f3f..1b64cf15d 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -1592,6 +1592,126 @@ def test_benchmark_has_y_coordinate_between(skip_if_runtime_not_geo_compatible, assert actual_count == EXPECTED_ROWS +def test_benchmark_is_area_not_greater_than(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_not_greater_than, + column="polygon_geom", + check_func_kwargs={"value": 1.0}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_area_not_less_than(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_not_less_than, + column="polygon_geom", + check_func_kwargs={"value": 1.0}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_area_equal_to(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_equal_to, + column="polygon_geom", + check_func_kwargs={"value": 1.0}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + 
+ +def test_benchmark_is_area_not_equal_to(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_area_not_equal_to, + column="polygon_geom", + check_func_kwargs={"value": 1.0}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_num_points_not_greater_than(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_num_points_not_greater_than, + column="polygon_geom", + check_func_kwargs={"value": 1}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_num_points_not_less_than(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_num_points_not_less_than, + column="polygon_geom", + check_func_kwargs={"value": 1}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_num_points_equal_to(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_num_points_equal_to, + column="polygon_geom", + check_func_kwargs={"value": 1}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_num_points_not_equal_to(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_num_points_not_equal_to, + column="polygon_geom", + check_func_kwargs={"value": 1}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + @pytest.mark.benchmark(group="test_benchmark_has_valid_schema") def test_benchmark_has_valid_schema(benchmark, ws, generated_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) diff --git a/tests/resources/all_row_geo_checks.yaml b/tests/resources/all_row_geo_checks.yaml index 8cc66ed7c..b2512dfa7 100644 --- a/tests/resources/all_row_geo_checks.yaml +++ b/tests/resources/all_row_geo_checks.yaml @@ -117,3 +117,67 @@ column: polygon_geom min_value: 0.0 max_value: 10.0 + +# is_area_not_less_than check +- criticality: error + check: + function: is_area_not_less_than + arguments: + column: polygon_geom + value: 0.0 + +# is_area_not_greater_than check +- criticality: error + check: + function: is_area_not_greater_than + arguments: + column: point_geom + value: 1.0 + +# is_area_equal_to check +- criticality: error + check: + function: is_area_equal_to + arguments: + column: point_geom + value: 0.0 + +# is_area_not_equal_to check +- criticality: 
error + check: + function: is_area_not_equal_to + arguments: + column: polygon_geom + value: 0.0 + +# is_num_points_not_less_than check +- criticality: error + check: + function: is_num_points_not_less_than + arguments: + column: polygon_geom + value: 2 + +# is_num_points_not_greater_than check +- criticality: error + check: + function: is_num_points_not_greater_than + arguments: + column: point_geom + value: 2 + +# is_num_points_equal_to check +- criticality: error + check: + function: is_num_points_equal_to + arguments: + column: point_geom + value: 1 + +# is_num_points_not_equal_to check +- criticality: error + check: + function: is_num_points_not_equal_to + arguments: + column: polygon_geom + value: 2 From 9a0c0cef923f865d7fa92985b014ed572e9341ea Mon Sep 17 00:00:00 2001 From: mwojtyczka Date: Thu, 11 Dec 2025 10:31:00 +0000 Subject: [PATCH 2/2] Add pytest-benchmark performance baseline --- docs/dqx/docs/reference/benchmarks.mdx | 1 + tests/perf/.benchmarks/baseline.json | 35 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/docs/dqx/docs/reference/benchmarks.mdx b/docs/dqx/docs/reference/benchmarks.mdx index 64ad6b221..59a93274c 100644 --- a/docs/dqx/docs/reference/benchmarks.mdx +++ b/docs/dqx/docs/reference/benchmarks.mdx @@ -125,6 +125,7 @@ sidebar_position: 13 | test_benchmark_is_not_null_and_is_in_list[col2] | 0.242585 | 0.239179 | 0.235409 | 0.257352 | 0.008682 | 0.009090 | 0.237418 | 0.246509 | 5 | 0 | 1 | 4.12 | | test_benchmark_is_not_null_and_is_in_list[col3] | 0.266090 | 0.254002 | 0.242063 | 0.319918 | 0.030873 | 0.025710 | 0.250317 | 0.276027 | 5 | 1 | 1 | 3.76 | | test_benchmark_is_not_null_and_not_empty_array | 0.242006 | 0.245388 | 0.219974 | 0.255484 | 0.013192 | 0.011067 | 0.237343 | 0.248410 | 5 | 1 | 2 | 4.13 | +| test_benchmark_is_not_null_island | 0.229706 | 0.229359 | 0.219877 | 0.241720 | 0.009478 | 0.016914 | 0.220834 | 0.237748 | 5 | 0 | 2 | 4.35 | | test_benchmark_is_null_or_empty[col1] | 0.245738 | 0.249997 | 0.230094 | 0.262357 | 0.014764 | 0.027013 | 0.230391 | 0.257404 | 5 | 0 | 3 | 4.07 | | test_benchmark_is_null_or_empty[col2] | 0.243129 | 0.241425 | 0.233387 | 0.253421 | 0.008940 | 0.016398 | 0.235432 | 0.251830 | 5 | 0 | 2 | 4.11 | | test_benchmark_is_null_or_empty[col3] | 0.243480 | 0.236580 | 0.222777 | 0.278160 | 0.022199 | 0.031183 | 0.226982 | 0.258165 | 5 | 0 | 1 | 4.11 | diff --git a/tests/perf/.benchmarks/baseline.json b/tests/perf/.benchmarks/baseline.json index afd7ad299..7f5dea1b2 100644 --- a/tests/perf/.benchmarks/baseline.json +++ b/tests/perf/.benchmarks/baseline.json @@ -4134,6 +4134,41 @@ "iterations": 1 } }, + { + "group": null, + "name": "test_benchmark_is_not_null_island", + "fullname": "tests/perf/test_apply_checks.py::test_benchmark_is_not_null_island", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 5, + "max_time": 1.0, + "min_time": 0.000005, + "warmup": false + }, + "stats": { + "min": 0.21987680999973236, + "max": 0.2417196389997116, + "mean": 0.2297064277998288, + "stddev": 0.009478270362841804, + "rounds": 5, + "median": 0.2293586999994659, + "iqr": 0.016914152249455583, + "q1": 0.22083385125029054, + "q3": 0.23774800349974612, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 0.21987680999973236, + "hd15iqr": 0.2417196389997116, + "ops": 4.353382748485479, + "total": 1.148532138999144, + "iterations": 1 + } + }, { "group": "test_benchmark_is_null_or_empty col1", "name": 
"test_benchmark_is_null_or_empty[col1]",