From d8cff029db0ea5f79f1d041800db4bdf0b98917c Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Tue, 14 Oct 2025 09:57:04 +0200 Subject: [PATCH 1/4] add null island check --- docs/dqx/docs/reference/quality_checks.mdx | 17 ++++++++++++- src/databricks/labs/dqx/geo/check_funcs.py | 29 ++++++++++++++++++++++ tests/integration/test_apply_checks.py | 11 ++++++++ tests/integration/test_row_checks_geo.py | 24 ++++++++++++++++++ tests/perf/test_apply_checks.py | 12 +++++++++ tests/resources/all_row_geo_checks.yaml | 7 ++++++ 6 files changed, 99 insertions(+), 1 deletion(-) diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index 5020e6b3d..f9d5e698c 100644 --- a/docs/dqx/docs/reference/quality_checks.mdx +++ b/docs/dqx/docs/reference/quality_checks.mdx @@ -61,6 +61,7 @@ You can also define your own custom checks (see [Creating custom checks](#creati | `is_geometrycollection` | Checks whether the values in the input column are geometrycollection geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | | `is_ogc_valid` | Checks whether the values in the input column are valid geometries in the OGC sense. I.e a bowtie polygon is invalid because it has a self intersection. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | | `is_non_empty_geometry` | Checks whether the values in the input column are non-empty geometries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_not_null_island` | Checks whether the values in the input column are null island geometries (POINT(0 0)). This function requires Databricks serverless compute or runtime >= 17.1. 
| `column`: column to check (can be a string column name or a column expression) | | `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check | | `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | @@ -578,7 +579,14 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen function: is_non_empty_geometry arguments: column: point_geom - + +# is_not_null_island check +- criticality: error + check: + function: is_not_null_island + arguments: + column: point_geom + # has_dimension check - criticality: error check: @@ -1042,6 +1050,13 @@ checks = [ column="point_geom" ), + # is_not_null_island check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column="point_geom" + ), + # has_dimension check DQRowRule( criticality="error", diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 612f7a717..354dd0626 100644 --- 
a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -357,6 +357,35 @@ def is_non_empty_geometry(column: str | Column) -> Column: ) +@register_rule("row") +def is_not_null_island(column: str | Column) -> Column: + """Checks whether the values in the input column are NULL island geometries (POINT(0 0)). + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are NULL island geometries + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry`, `st_geometrytype`, `st_x`, and `st_y` functions. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + is_point_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) = '{POINT_TYPE}'") + is_zero_zero = F.expr(f"st_x(try_to_geometry({col_str_norm})) = 0.0 AND st_y(try_to_geometry({col_str_norm})) = 0.0") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(~geom_cond & is_point_cond & is_zero_zero) + condition_str = f"column `{col_expr_str}` contains a null island" + + return make_condition( + condition, + F.lit(condition_str), + f"{col_str_norm}_contains_null_island", + ) + + @register_rule("row") def has_dimension(column: str | Column, dimension: int) -> Column: """Checks whether the geometries/geographies in the input column have a given dimension. 
diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 8ee1fe1f3..2e918859e 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -5819,6 +5819,17 @@ def test_apply_checks_all_geo_checks_using_classes(skip_if_runtime_not_geo_compa check_func=geo_check_funcs.is_non_empty_geometry, column=F.col("point_geom"), ), + # is_not_null_island check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column="point_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column=F.col("point_geom"), + ), # has_dimension check DQRowRule( criticality="error", diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index a12ebcebe..0cb016c68 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -13,6 +13,7 @@ is_multilinestring, is_multipoint, is_multipolygon, + is_not_null_island, is_point, is_polygon, is_ogc_valid, @@ -333,6 +334,29 @@ def test_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, spark): assert_df_equality(actual, expected, ignore_nullable=True) +def test_is_not_null_island(skip_if_runtime_not_geo_compatible, spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POINT(1 1)"], ["POINT(0 0)"], ["LINESTRING(0 0, 1 1)"], ["nonsense"], [None]], + input_schema, + ) + + actual = test_df.select(is_not_null_island("geom")) + + checked_schema = "geom_contains_null_island: string" + expected = spark.createDataFrame( + [ + [None], + ["column `geom` contains a null island"], + [None], + [None], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + def test_has_dimension(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( diff --git a/tests/perf/test_apply_checks.py 
b/tests/perf/test_apply_checks.py index 1f223d476..526384067 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -1509,6 +1509,18 @@ def test_benchmark_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, ben actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS +def test_benchmark_is_not_null_island(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column="point_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS def test_benchmark_has_dimension(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) diff --git a/tests/resources/all_row_geo_checks.yaml b/tests/resources/all_row_geo_checks.yaml index 8cc66ed7c..3d88016e7 100644 --- a/tests/resources/all_row_geo_checks.yaml +++ b/tests/resources/all_row_geo_checks.yaml @@ -92,6 +92,13 @@ arguments: column: point_geom +# is_not_null_island check +- criticality: error + check: + function: is_not_null_island + arguments: + column: point_geom + # has_dimension check - criticality: error check: From c7daeddf0d2da37046eaac4a95ab36e55e1bf933 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Wed, 10 Dec 2025 17:18:34 -0500 Subject: [PATCH 2/4] Check POINTZ and POINTZM geometries --- docs/dqx/docs/reference/quality_checks.mdx | 2 +- src/databricks/labs/dqx/geo/check_funcs.py | 14 +++++++--- tests/integration/test_row_checks_geo.py | 30 +++++++++++++++------- tests/perf/test_apply_checks.py | 2 ++ 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index 80afe478d..195c0e3bf 
100644
--- a/docs/dqx/docs/reference/quality_checks.mdx
+++ b/docs/dqx/docs/reference/quality_checks.mdx
@@ -66,7 +66,7 @@ You can also define your own custom checks in Python (see [Creating custom check
 | `is_geometrycollection` | Checks whether the values in the input column are geometrycollection geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
 | `is_ogc_valid` | Checks whether the values in the input column are valid geometries in the OGC sense. I.e a bowtie polygon is invalid because it has a self intersection. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
 | `is_non_empty_geometry` | Checks whether the values in the input column are non-empty geometries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
-| `is_not_null_island` | Checks whether the values in the input column are null island geometries (POINT(0 0)). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
+| `is_not_null_island` | Checks whether the values in the input column are not NULL island geometries (e.g. POINT(0 0), POINTZ(0 0 0), or POINTZM(0 0 0 0)). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
 | `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). This function requires Databricks serverless compute or runtime >= 17.1.
| `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check | | `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 354dd0626..1d4132a33 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -359,7 +359,8 @@ def is_non_empty_geometry(column: str | Column) -> Column: @register_rule("row") def is_not_null_island(column: str | Column) -> Column: - """Checks whether the values in the input column are NULL island geometries (POINT(0 0)). + """Checks whether the values in the input column are NULL island geometries (e.g. POINT(0 0), POINTZ(0 0 0), or + POINTZM(0 0 0 0)). Args: column: column to check; can be a string column name or a column expression @@ -374,9 +375,16 @@ def is_not_null_island(column: str | Column) -> Column: # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry`, `st_geometrytype`, `st_x`, and `st_y` functions. 
geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + is_point_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) = '{POINT_TYPE}'") - is_zero_zero = F.expr(f"st_x(try_to_geometry({col_str_norm})) = 0.0 AND st_y(try_to_geometry({col_str_norm})) = 0.0") - condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(~geom_cond & is_point_cond & is_zero_zero) + null_xy_cond = F.expr( + f"st_x(try_to_geometry({col_str_norm})) = 0.0 AND st_y(try_to_geometry({col_str_norm})) = 0.0" + ) + null_z_cond = F.expr(f"coalesce(st_z(try_to_geometry({col_str_norm})), -1) = 0.0") + null_m_cond = F.expr(f"coalesce(st_m(try_to_geometry({col_str_norm})), -1) = 0.0") + is_point_null_island = is_point_cond & null_xy_cond & null_z_cond & null_m_cond + + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(~geom_cond & is_point_cond & is_point_null_island) condition_str = f"column `{col_expr_str}` contains a null island" return make_condition( diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 0cb016c68..b90932f64 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -335,22 +335,34 @@ def test_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, spark): def test_is_not_null_island(skip_if_runtime_not_geo_compatible, spark): - input_schema = "geom: string" + input_schema = "geom: string, geomz: string, geomzm: string" test_df = spark.createDataFrame( - [["POINT(1 1)"], ["POINT(0 0)"], ["LINESTRING(0 0, 1 1)"], ["nonsense"], [None]], + [ + ["POINT(1 1)", "POINTZ(1 1 1)", "POINTZM(1 1 1 1)"], + ["POINT(0 0)", "POINTZ(0 0 0)", "POINTZM(0 0 0 0)"], + ["LINESTRING(0 0, 1 1)", "LINESTRING(0 0, 1 1)", "LINESTRING(0 0, 1 1)"], + ["nonsense", "nonsense", "nonsense"], + [None, None, None], + ], input_schema, ) - actual = test_df.select(is_not_null_island("geom")) + actual = test_df.select(is_not_null_island("geom"), is_not_null_island("geomz"), 
is_not_null_island("geomzm")) - checked_schema = "geom_contains_null_island: string" + checked_schema = ( + "geom_contains_null_island: string, geomz_contains_null_island: string, geomzm_contains_null_island: string" + ) expected = spark.createDataFrame( [ - [None], - ["column `geom` contains a null island"], - [None], - [None], - [None], + [None, None, None], + [ + "column `geom` contains a null island", + "column `geomz` contains a null island", + "column `geomzm` contains a null island", + ], + [None, None, None], + [None, None, None], + [None, None, None], ], checked_schema, ) diff --git a/tests/perf/test_apply_checks.py b/tests/perf/test_apply_checks.py index a8f0133b9..a9e4e8a28 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -1546,6 +1546,7 @@ def test_benchmark_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, ben actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + def test_benchmark_is_not_null_island(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ @@ -1559,6 +1560,7 @@ def test_benchmark_is_not_null_island(skip_if_runtime_not_geo_compatible, benchm actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + def test_benchmark_has_dimension(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ From d9e3e7dc5a4fc3c5b6280276e08568c98f809b01 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Wed, 10 Dec 2025 17:28:41 -0500 Subject: [PATCH 3/4] Refactor --- src/databricks/labs/dqx/geo/check_funcs.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 1d4132a33..fc8dbe3ae 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ 
b/src/databricks/labs/dqx/geo/check_funcs.py
@@ -374,16 +374,15 @@ def is_not_null_island(column: str | Column) -> Column:
     col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
     # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in
     # Databricks SQL, due to the use of the `try_to_geometry`, `st_geometrytype`, `st_x`, and `st_y` functions.
-    geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL")
+    try_geom_expr = f"try_to_geometry({col_str_norm})"
+    geom_cond = F.expr(f"{try_geom_expr} IS NULL")
 
-    is_point_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) = '{POINT_TYPE}'")
-    null_xy_cond = F.expr(
-        f"st_x(try_to_geometry({col_str_norm})) = 0.0 AND st_y(try_to_geometry({col_str_norm})) = 0.0"
-    )
-    null_z_cond = F.expr(f"coalesce(st_z(try_to_geometry({col_str_norm})), -1) = 0.0")
-    null_m_cond = F.expr(f"coalesce(st_m(try_to_geometry({col_str_norm})), -1) = 0.0")
-    is_point_null_island = is_point_cond & null_xy_cond & null_z_cond & null_m_cond
+    is_point_cond = F.expr(f"st_geometrytype({try_geom_expr}) = '{POINT_TYPE}'")
+    null_xy_cond = F.expr(f"st_x({try_geom_expr}) = 0.0 AND st_y({try_geom_expr}) = 0.0")
+    null_z_cond = F.expr(f"coalesce(st_z({try_geom_expr}), 0.0) = 0.0")
+    null_m_cond = F.expr(f"coalesce(st_m({try_geom_expr}), 0.0) = 0.0")
+    is_point_null_island = is_point_cond & null_xy_cond & null_z_cond & null_m_cond
 
     condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(~geom_cond & is_point_cond & is_point_null_island)
     condition_str = f"column `{col_expr_str}` contains a null island"

From 391a21a12e3749deb7dd9e7c3d438dcfcc17bd2b Mon Sep 17 00:00:00 2001
From: Marcin Wojtyczka
Date: Thu, 11 Dec 2025 09:53:40 +0100
Subject: [PATCH 4/4] fixed demo

---
 demos/dqx_manufacturing_demo.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/demos/dqx_manufacturing_demo.py b/demos/dqx_manufacturing_demo.py
index 393f1a79d..1b255b698 100644
---
a/demos/dqx_manufacturing_demo.py +++ b/demos/dqx_manufacturing_demo.py @@ -274,7 +274,7 @@ "SEN-001", "MCH-001", "temperature", - 735, + 735.0, datetime.strptime("2025-04-29 14:32:00", "%Y-%m-%d %H:%M:%S"), datetime.strptime("2025-04-01", "%Y-%m-%d").date(), 80, @@ -328,7 +328,7 @@ "SEN-002", "MCH-001", "temperature", - 735, + 735.0, datetime.strptime("2026-04-29 14:32:00", "%Y-%m-%d %H:%M:%S"), datetime.strptime("2025-04-01", "%Y-%m-%d").date(), 80, @@ -383,7 +383,7 @@ "SEN004", "MCH-001", "temperature", - 724, + 724.0, datetime.strptime("2025-04-28 14:32:00", "%Y-%m-%d %H:%M:%S"), datetime.strptime("2025-04-30", "%Y-%m-%d").date(), 85, @@ -397,7 +397,7 @@ "SEN004", "MCH-001", "temperature", - 724, + 724.0, datetime.strptime("2025-04-28 14:32:00", "%Y-%m-%d %H:%M:%S"), datetime.strptime("2025-04-30", "%Y-%m-%d").date(), 85,