equality test can now compare columns with different names

james-johnston-thumbtack · james-johnston-thumbtack · commit 0d60f9a5003e · 2025-03-25T02:03:21.000-07:00
The compare_columns now optionally accepts a list of two column names.
This allows the column names to be different between base model and
other model when comparing two columns.

The body of the macro is also refactored to make it more DRY, and reduce
convoluting the different steps and capabilities.

1.  First, we gather a set of numeric column names that we need to
    round.  This is referenced later.

2.  Next, we build two lists of column names that we want to compare:
    one list per model.  If compare_columns was given, we build the
    lists from that.  Otherwise, read column names from the first model,
    filter the excluded column names, and use the result for both
    models, as we did before this commit.

3.  Now that we have lists of column names, we can build comma-separated
    lists for each model.  At this point, we do the number rounding
    expression if the column name is found in the set from step 1.

Note this refactoring also cleans up some weird inconsistencies that
resulted from duplication of logic.  For example, suppose a model's
column name was uppercase, and your database is case-sensitive.  If you
did not specify a "precision" argument, then you need to provide an
upper-case "compare_columns" argument.  However, if you _did_ specify a
"precision" argument, then the "compare-columns" argument was _NOT_
case-sensitive.  It's pretty unexpected to have the case sensitivity of
one argument be dependent on a seemingly-unrelated argument.
diff --git a/README.md b/README.md
@@ -145,6 +145,18 @@ models:
           compare_model: ref('other_table_name')
           exclude_columns:
             - third_column
+
+  # if the columns to be compared have different names, you can match them up like this
+  - name: model_name_different_names
+    tests:
+      - dbt_utils.equality:
+          compare_model: ref('other_table_name')
+          compare_columns:
+            - first_column
+            # This will compare `model_name_different_names.second_column_in_model`
+            # and `other_table_name.second_column_in_other_table`
+            - [second_column_in_model, second_column_in_other_table]
+          precision: 4
 ```
 
 ### expression_is_true ([source](macros/generic_tests/expression_is_true.sql))
diff --git a/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv b/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv
@@ -0,0 +1,4 @@
+col_a,tbl_a_col_b,tbl_a_col_float
+1,1,1.100005
+1,2,1.200005
+2,3,1.300005
diff --git a/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv b/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv
@@ -0,0 +1,4 @@
+tbl_b_col_b,col_a,tbl_b_col_float
+1,1,1.100006
+2,1,1.200007
+3,2,1.300008
diff --git a/integration_tests/models/generic_tests/schema.yml b/integration_tests/models/generic_tests/schema.yml
@@ -164,6 +164,28 @@ seeds:
           exclude_columns:
             - col_c
 
+  - name: data_test_equality_different_column_names_a
+    data_tests:
+      - dbt_utils.equality:
+          compare_model: ref('data_test_equality_different_column_names_b')
+          compare_columns:
+            - col_a
+            - [tbl_a_col_b, tbl_b_col_b]
+      - dbt_utils.equality:
+          compare_model: ref('data_test_equality_different_column_names_b')
+          compare_columns:
+            - col_a
+            - [tbl_a_col_float, tbl_b_col_float]
+          precision: 4
+      - dbt_utils.equality:
+          compare_model: ref('data_test_equality_different_column_names_b')
+          compare_columns:
+            - col_a
+            - [tbl_a_col_float, tbl_b_col_float]
+          precision: 8
+          error_if: "<1" #sneaky way to ensure that the test is returning failing rows
+          warn_if: "<0"
+
   - name: data_test_equality_floats_a
     data_tests:
       # test precision only
diff --git a/macros/generic_tests/equality.sql b/macros/generic_tests/equality.sql
@@ -63,42 +63,15 @@
     {%- endif -%}
 
     {% if compare_columns_set != compare_model_columns_set %}
-        {{ exceptions.raise_compiler_error(compare_model ~" has less columns than " ~ model ~ ", please ensure they have the same columns or use the `compare_columns` or `exclude_columns` arguments to subset them.") }}
+        {{ exceptions.raise_compiler_error(compare_model ~" has different columns than " ~ model ~ ", please ensure they have the same columns or use the `compare_columns` or `exclude_columns` arguments to subset them.") }}
     {% endif %}
 
 
 {% endif %}
 
-{%- if not precision -%}
-    {%- if not compare_columns -%}
-        {# 
-            You cannot get the columns in an ephemeral model (due to not existing in the information schema),
-            so if the user does not provide an explicit list of columns we must error in the case it is ephemeral
-        #}
-        {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%}
-        {%- set compare_columns = adapter.get_columns_in_relation(model)-%}
-
-        {%- if exclude_columns -%}
-            {#-- Lower case ignore columns for easier comparison --#}
-            {%- set exclude_columns = exclude_columns | map("lower") | list %}
-
-            {# Filter out the excluded columns #}
-            {%- set include_columns = [] %}
-            {%- for column in compare_columns -%}
-                {%- if column.name | lower not in exclude_columns -%}
-                    {% do include_columns.append(column) %}
-                {%- endif %}
-            {%- endfor %}
-
-            {%- set compare_columns = include_columns | map(attribute='quoted') %}
-        {%- else -%} {# Compare columns provided #}
-            {%- set compare_columns = compare_columns | map(attribute='quoted') %}
-        {%- endif -%}
-    {%- endif -%}
-
-    {% set compare_cols_csv = compare_columns | join(', ') %}
-
-{% else %} {# Precision required #}
+{# If testing with precision, then find out which columns in the main input model are numeric #}
+{%- set numeric_columns = {} -%}
+{%- if precision -%}
     {#-
         If rounding is required, we need to get the types, so it cannot be ephemeral even if they provide column names
     -#}
@@ -107,23 +80,82 @@
 
     {% set columns_list = [] %}
     {%- for col in columns -%}
-        {%- if (
-                (col.name|lower in compare_columns|map('lower') or not compare_columns) and
-                (col.name|lower not in exclude_columns|map('lower') or not exclude_columns)
-                ) -%}
-            {# Databricks double type is not picked up by any number type checks in dbt #}
-            {%- if col.is_float() or col.is_numeric() or col.data_type == 'double' -%}
-                {# Cast is required due to postgres not having round for a double precision number #}
-                {%- do columns_list.append('round(cast(' ~ col.quoted ~ ' as ' ~ dbt.type_numeric() ~ '),' ~ precision ~ ') as ' ~ col.quoted) -%}
-            {%- else -%} {# Non-numeric type #}
-                {%- do columns_list.append(col.quoted) -%}
-            {%- endif -%}
-        {% endif %}
+        {# Databricks double type is not picked up by any number type checks in dbt #}
+        {%- if col.is_float() or col.is_numeric() or col.data_type == 'double' -%}
+            {#- Lower case the column name for easier case-insensitive comparison -#}
+            {%- do numeric_columns.update({col.name|lower: true}) -%}
+            {#- Also include the quoted version, since we may see it as well. -#}
+            {%- do numeric_columns.update({col.quoted|lower: true}) -%}
+        {%- endif -%}
     {%- endfor -%}
+{%- endif -%}
 
-    {% set compare_cols_csv = columns_list | join(', ') %}
+{# If compare_columns is provided, sort any given arrays into lists of columns for each model #}
+{%- if compare_columns -%}
+    {%- set compare_columns__model = [] %}
+    {%- set compare_columns__compare_model = [] %}
+
+    {%- for column in compare_columns -%}
+        {%- if column is string -%}
+            {# A simple string was given.  Assume the same column name in both models. #}
+            {%- do compare_columns__model.append(column) -%}
+            {%- do compare_columns__compare_model.append(column) -%}
+        {%- elif column is iterable and column | length == 2 -%}
+            {%- do compare_columns__model.append(column[0]) -%}
+            {%- do compare_columns__compare_model.append(column[1]) -%}
+        {%- else -%}
+            {{ exceptions.raise_compiler_error("compare_columns must be a string or a list of 2 strings") }}
+        {%- endif -%}
+    {%- endfor -%}
+{%- else -%}
+    {#
+        You cannot get the columns in an ephemeral model (due to not existing in the information schema),
+        so if the user does not provide an explicit list of columns we must error in the case it is ephemeral
+    #}
+    {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%}
+    {%- set model_columns = adapter.get_columns_in_relation(model)-%}
 
-{% endif %}
+    {%- if exclude_columns -%}
+        {#-- Lower case ignore columns for easier comparison --#}
+        {%- set exclude_columns = exclude_columns | map("lower") | list %}
+    {%- endif -%}
+
+    {# Filter out the excluded columns #}
+    {%- set include_columns = [] %}
+    {%- for column in model_columns -%}
+        {%- if (not exclude_columns) or (column.name | lower not in exclude_columns) -%}
+            {% do include_columns.append(column) %}
+        {%- endif %}
+    {%- endfor %}
+
+    {# Assume same column names in the comparison model, since no alternates were given using compare_columns. #}
+    {%- set compare_columns__model = include_columns | map(attribute='quoted') | list %}
+    {%- set compare_columns__compare_model = compare_columns__model %}
+{%- endif -%}
+
+{# Build comma-delimited lists of column names for each input model.  Round numeric types as needed. #}
+{%- set compare_columns_csv = [] -%}
+{%- set numeric_column_indexes_in_first_model = [] -%}
+{%- for this_model_compare_columns in [compare_columns__model, compare_columns__compare_model] -%}
+    {%- set columns_list = [] %}
+    {%- set is_first_model = loop.first -%}
+
+    {%- for this_compare_column in this_model_compare_columns -%}
+        {# NOTE: We assume any numeric columns in the first model are also numeric in the second model #}
+        {%- if (is_first_model and this_compare_column|lower in numeric_columns) or (loop.index0 in numeric_column_indexes_in_first_model) -%}
+            {# Cast is required due to postgres not having round for a double precision number #}
+            {%- do columns_list.append('round(cast(' ~ this_compare_column ~ ' as ' ~ dbt.type_numeric() ~ '),' ~ precision ~ ') as ' ~ this_compare_column) -%}
+
+            {%- if is_first_model -%}
+                {%- do numeric_column_indexes_in_first_model.append(loop.index0) -%}
+            {%- endif -%}
+        {%- else -%} {# Non-numeric type #}
+            {%- do columns_list.append(this_compare_column) -%}
+        {%- endif -%}
+    {%- endfor -%}
+
+    {%- do compare_columns_csv.append(columns_list | join(', ')) -%}
+{%- endfor -%}
 
 with a as (
 
@@ -139,17 +171,17 @@ b as (
 
 a_minus_b as (
 
-    select {{compare_cols_csv}} from a
+    select {{compare_columns_csv[0]}} from a
     {{ dbt.except() }}
-    select {{compare_cols_csv}} from b
+    select {{compare_columns_csv[1]}} from b
 
 ),
 
 b_minus_a as (
 
-    select {{compare_cols_csv}} from b
+    select {{compare_columns_csv[1]}} from b
     {{ dbt.except() }}
-    select {{compare_cols_csv}} from a
+    select {{compare_columns_csv[0]}} from a
 
 ),