-
Notifications
You must be signed in to change notification settings - Fork 3
Add masked table column relational unprotect rewrite #417
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
83aaa52
3930b3d
4a02e2a
32e678e
957f010
2596a00
f2e46c4
86036b7
f896f14
c1315e3
48b892b
040d725
16de5a3
5241003
2a27c99
c88d2f0
442621e
e5b8ab8
ffbe3fe
d352175
aa2ee68
f09d0e7
76a16c2
be00e58
fa2d869
2189fb3
0f6e59f
98a9c4c
bf2b075
a883759
5d273c3
2d69928
bc09e3f
ab08ce4
a36fb2b
df477e7
cccbe19
600492a
82e9691
2a24514
aea501f
a733022
a5c3b9c
bdde458
4a58775
66f2193
630c7cc
f4c318f
478b2f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
""" | ||
Special operators containing logic to mask or unmask data based on a masked | ||
table column's metadata. | ||
""" | ||
|
||
__all__ = ["MaskedExpressionFunctionOperator"] | ||
|
||
|
||
from pydough.metadata.properties import MaskedTableColumnMetadata | ||
from pydough.pydough_operators.type_inference import ( | ||
ConstantType, | ||
ExpressionTypeDeducer, | ||
RequireNumArgs, | ||
TypeVerifier, | ||
) | ||
from pydough.types import PyDoughType | ||
|
||
from .expression_function_operators import ExpressionFunctionOperator | ||
|
||
|
||
class MaskedExpressionFunctionOperator(ExpressionFunctionOperator):
    """
    A special expression function operator that masks or unmasks data based on
    a masked table column's metadata. The operator stores the metadata for the
    column and can represent either a masking or an unmasking operation,
    depending on the `is_unmask` flag.
    """

    def __init__(
        self,
        masking_metadata: MaskedTableColumnMetadata,
        is_unmask: bool,
    ):
        # All masking/unmasking operations are unary (the operator is always
        # invoked on a single column operand), so a fixed-arity verifier
        # requiring exactly one argument suffices.
        arity_check: TypeVerifier = RequireNumArgs(1)

        # The output type is fully determined by the metadata: unmasking
        # yields the unprotected type, masking yields the stored (protected)
        # type, so a constant deducer is enough.
        if is_unmask:
            result_type: PyDoughType = masking_metadata.unprotected_data_type
        else:
            result_type = masking_metadata.data_type
        type_deducer: ExpressionTypeDeducer = ConstantType(result_type)

        super().__init__(
            "UNMASK" if is_unmask else "MASK", False, arity_check, type_deducer, False
        )
        self._masking_metadata: MaskedTableColumnMetadata = masking_metadata
        self._is_unmask: bool = is_unmask

    @property
    def masking_metadata(self) -> MaskedTableColumnMetadata:
        """
        The metadata for the masked column.
        """
        return self._masking_metadata

    @property
    def is_unmask(self) -> bool:
        """
        Whether this operator is unprotecting (True) or protecting (False).
        """
        return self._is_unmask

    @property
    def format_string(self) -> str:
        """
        The format string used by this operator to either mask or unmask the
        operand.
        """
        if self.is_unmask:
            return self.masking_metadata.unprotect_protocol
        return self.masking_metadata.protect_protocol

    def to_string(self, arg_strings: list[str]) -> str:
        # Wrap each argument in brackets before injecting it into the
        # mask/unmask format string.
        prefix: str = "UNMASK" if self.is_unmask else "MASK"
        bracketed: list[str] = [f"[{text}]" for text in arg_strings]
        return f"{prefix}::({self.format_string.format(*bracketed)})"

    def equals(self, other: object) -> bool:
        # Two operators are equal only if they share the column metadata, the
        # mask/unmask direction, and the base-operator state.
        if not isinstance(other, MaskedExpressionFunctionOperator):
            return False
        return (
            self.masking_metadata == other.masking_metadata
            and self.is_unmask == other.is_unmask
            and super().equals(other)
        )
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,12 +172,24 @@ def convert_call_to_sqlglot( | |
return sqlglot_expressions.Anonymous( | ||
this=operator.sql_function_alias, expressions=args | ||
) | ||
if isinstance(operator, pydop.SqlMacroExpressionFunctionOperator): | ||
if isinstance( | ||
operator, | ||
( | ||
pydop.MaskedExpressionFunctionOperator, | ||
pydop.SqlMacroExpressionFunctionOperator, | ||
), | ||
): | ||
Comment on lines
+175
to
+181
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just extending the logic we already use for UDFs with macro text, but now with the new |
||
# For user defined operators that are a macro for SQL text, convert | ||
# the arguments to SQL text strings then inject them into the macro | ||
# as a format string, then re-parse it. | ||
# as a format string, then re-parse it. The same idea works for the | ||
# masking/unmasking operators. | ||
arg_strings: list[str] = [arg.sql() for arg in args] | ||
combined_string: str = operator.macro_text.format(*arg_strings) | ||
fmt_string: str | ||
if isinstance(operator, pydop.MaskedExpressionFunctionOperator): | ||
fmt_string = operator.format_string | ||
else: | ||
fmt_string = operator.macro_text | ||
combined_string: str = fmt_string.format(*arg_strings) | ||
return parse_one(combined_string) | ||
match operator: | ||
case pydop.NOT: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,14 +37,14 @@ CREATE TABLE TRANSACTIONS ( | |
); | ||
|
||
INSERT INTO CUSTOMERS (c_key, c_fname, c_lname, c_phone, c_email, c_addr, c_birthday) | ||
SELECT * | ||
-- 42 - column1, -- ARITHMETIC SHIFT: 42 | ||
-- UPPER(column2), -- UPPERCASE | ||
-- UPPER(column3), -- UPPERCASE | ||
-- REPLACE(REPLACE(REPLACE(column4, '0', '*'), '9', '0'), '*', '9'), -- DIGIT SWITCH: 0 <-> 9 | ||
-- SUBSTRING(column5, 2) || SUBSTRING(column5, 1, 1), -- FIRST CHAR TRANSPOSE | ||
-- SUBSTRING(column6, 2) || SUBSTRING(column6, 1, 1), -- FIRST CHAR TRANSPOSE | ||
-- DATE(column7, '-472 days') -- DAY SHIFT: 472 | ||
SELECT | ||
42 - column1, -- ARITHMETIC SHIFT: 42 | ||
UPPER(column2), -- UPPERCASE | ||
UPPER(column3), -- UPPERCASE | ||
REPLACE(REPLACE(REPLACE(column4, '0', '*'), '9', '0'), '*', '9'), -- DIGIT SWITCH: 0 <-> 9 | ||
SUBSTRING(column5, 2) || SUBSTRING(column5, 1, 1), -- FIRST CHAR TRANSPOSE | ||
SUBSTRING(column6, 2) || SUBSTRING(column6, 1, 1), -- FIRST CHAR TRANSPOSE | ||
DATE(column7, '-472 days') -- DAY SHIFT: 472 | ||
Comment on lines
-40
to
+47
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Forgot to restore encryption in the original PR, but it's fine because the E2E tests were being skipped until now |
||
FROM ( | ||
VALUES | ||
(1, 'alice', 'johnson', '555-123-4567', '[email protected]', '123 Maple St;Portland;OR;97205', '1985-04-12'), | ||
|
@@ -81,13 +81,13 @@ INSERT INTO BRANCHES (b_key, b_name, b_addr) VALUES | |
; | ||
|
||
INSERT INTO ACCOUNTS (a_key, a_custkey, a_branchkey, a_balance, a_type, a_open_ts) | ||
SELECT * | ||
-- CAST(CAST(column1 as TEXT) || CAST(column1 as TEXT) AS INTEGER), | ||
-- column2, | ||
-- column3, | ||
-- column4 * column4, -- GEOMETRIC SHIFT | ||
-- SUBSTRING(column5, 2) || SUBSTRING(column5, 1, 1), -- FIRST CHAR TRANSPOSE | ||
-- DATETIME(column6, '-123456789 seconds') -- SECOND SHIFT: 123456789 | ||
SELECT | ||
CAST(CAST(column1 as TEXT) || CAST(column1 as TEXT) AS INTEGER), | ||
column2, | ||
column3, | ||
column4 * column4, -- GEOMETRIC SHIFT | ||
SUBSTRING(column5, 2) || SUBSTRING(column5, 1, 1), -- FIRST CHAR TRANSPOSE | ||
DATETIME(column6, '-123456789 seconds') -- SECOND SHIFT: 123456789 | ||
FROM ( | ||
VALUES | ||
-- Customer 1 (alice johnson, OR) - 3 accounts | ||
|
@@ -189,12 +189,11 @@ VALUES | |
|
||
INSERT INTO TRANSACTIONS (t_key, t_sourceaccount, t_destaccount, t_amount, t_ts) | ||
SELECT | ||
* | ||
-- column1, | ||
-- column2, | ||
-- column3, | ||
-- 1025.67 - column4, -- ARITHMETIC SHIFT: 1025.67 | ||
-- DATETIME(column5, '-54321 seconds') -- SECOND SHIFT: 54321 | ||
column1, | ||
column2, | ||
column3, | ||
1025.67 - column4, -- ARITHMETIC SHIFT: 1025.67 | ||
DATETIME(column5, '-54321 seconds') -- SECOND SHIFT: 54321 | ||
FROM ( | ||
VALUES | ||
(1, 41, 8, 2753.92, '2019-11-11 18:00:52'), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -624,12 +624,15 @@ def test_pipeline_until_relational_cryptbank( | |
masked_graphs: graph_fetcher, | ||
get_plan_test_filename: Callable[[str], str], | ||
update_tests: bool, | ||
enable_mask_rewrites: str, | ||
) -> None: | ||
""" | ||
Tests the conversion of the PyDough queries on the custom cryptbank dataset | ||
into relational plans. | ||
""" | ||
file_path: str = get_plan_test_filename(cryptbank_pipeline_test_data.test_name) | ||
file_path: str = get_plan_test_filename( | ||
f"{cryptbank_pipeline_test_data.test_name}_{enable_mask_rewrites}" | ||
) | ||
cryptbank_pipeline_test_data.run_relational_test( | ||
masked_graphs, file_path, update_tests | ||
) | ||
|
@@ -641,13 +644,15 @@ def test_pipeline_until_sql_cryptbank( | |
sqlite_tpch_db_context: DatabaseContext, | ||
get_sql_test_filename: Callable[[str, DatabaseDialect], str], | ||
update_tests: bool, | ||
enable_mask_rewrites: str, | ||
): | ||
""" | ||
Tests the conversion of the PyDough queries on the custom cryptbank dataset | ||
into SQL text. | ||
""" | ||
file_path: str = get_sql_test_filename( | ||
cryptbank_pipeline_test_data.test_name, sqlite_tpch_db_context.dialect | ||
f"{cryptbank_pipeline_test_data.test_name}_{enable_mask_rewrites}", | ||
sqlite_tpch_db_context.dialect, | ||
) | ||
cryptbank_pipeline_test_data.run_sql_test( | ||
masked_graphs, | ||
|
@@ -657,14 +662,12 @@ def test_pipeline_until_sql_cryptbank( | |
) | ||
|
||
|
||
# @pytest.mark.skip( | ||
# reason="Skipping until masked table column relational handling is implemented" | ||
# ) | ||
Comment on lines
-660
to
-662
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Forgot to restore the skip marker previously, but this was fine because the data wasn't being encrypted before. Now we are making it so it is encrypted (see |
||
@pytest.mark.execute | ||
def test_pipeline_e2e_cryptbank( | ||
cryptbank_pipeline_test_data: PyDoughPandasTest, | ||
masked_graphs: graph_fetcher, | ||
sqlite_cryptbank_connection: DatabaseContext, | ||
enable_mask_rewrites: str, | ||
): | ||
""" | ||
Test executing the custom queries with the custom cryptbank dataset | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1226,5 +1226,4 @@ | |
} | ||
] | ||
} | ||
|
||
] |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
ROOT(columns=[('n', ROUND(avg_unmask_t_amount, 2:numeric))], orderings=[]) | ||
AGGREGATE(keys={}, aggregations={'avg_unmask_t_amount': AVG(UNMASK::((1025.67 - ([t_amount]))))}) | ||
FILTER(condition=MONTH(UNMASK::(DATETIME([t_ts], '+54321 seconds'))) == 6:numeric & YEAR(UNMASK::(DATETIME([t_ts], '+54321 seconds'))) == 2022:numeric, columns={'t_amount': t_amount}) | ||
SCAN(table=CRBNK.TRANSACTIONS, columns={'t_amount': t_amount, 't_ts': t_ts}) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
ROOT(columns=[('n', ROUND(avg_unmask_t_amount, 2:numeric))], orderings=[]) | ||
AGGREGATE(keys={}, aggregations={'avg_unmask_t_amount': AVG(UNMASK::((1025.67 - ([t_amount]))))}) | ||
FILTER(condition=MONTH(UNMASK::(DATETIME([t_ts], '+54321 seconds'))) == 6:numeric & YEAR(UNMASK::(DATETIME([t_ts], '+54321 seconds'))) == 2022:numeric, columns={'t_amount': t_amount}) | ||
SCAN(table=CRBNK.TRANSACTIONS, columns={'t_amount': t_amount, 't_ts': t_ts}) |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
ROOT(columns=[('account_type', unmask_a_type), ('n', n_rows), ('avg_bal', ROUND(avg_unmask_a_balance, 2:numeric))], orderings=[]) | ||
AGGREGATE(keys={'unmask_a_type': UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))}, aggregations={'avg_unmask_a_balance': AVG(UNMASK::(SQRT([a_balance]))), 'n_rows': COUNT()}) | ||
SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_type': a_type}) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
ROOT(columns=[('account_type', unmask_a_type), ('n', n_rows), ('avg_bal', ROUND(avg_unmask_a_balance, 2:numeric))], orderings=[]) | ||
AGGREGATE(keys={'unmask_a_type': UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))}, aggregations={'avg_unmask_a_balance': AVG(UNMASK::(SQRT([a_balance]))), 'n_rows': COUNT()}) | ||
SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_type': a_type}) |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
ROOT(columns=[('account_type', UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))), ('balance', UNMASK::(SQRT([a_balance]))), ('name', JOIN_STRINGS(' ':string, UNMASK::(LOWER([c_fname])), UNMASK::(LOWER([c_lname]))))], orderings=[]) | ||
FILTER(condition=RANKING(args=[], partition=[UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))], order=[(UNMASK::(SQRT([a_balance]))):desc_first], allow_ties=False) == 1:numeric, columns={'a_balance': a_balance, 'a_type': a_type, 'c_fname': c_fname, 'c_lname': c_lname}) | ||
JOIN(condition=t0.a_custkey == UNMASK::((42 - ([t1.c_key]))), type=INNER, cardinality=SINGULAR_ACCESS, columns={'a_balance': t0.a_balance, 'a_type': t0.a_type, 'c_fname': t1.c_fname, 'c_lname': t1.c_lname}) | ||
SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_custkey': a_custkey, 'a_type': a_type}) | ||
SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname, 'c_key': c_key, 'c_lname': c_lname}) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
ROOT(columns=[('account_type', UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))), ('balance', UNMASK::(SQRT([a_balance]))), ('name', JOIN_STRINGS(' ':string, UNMASK::(LOWER([c_fname])), UNMASK::(LOWER([c_lname]))))], orderings=[]) | ||
FILTER(condition=RANKING(args=[], partition=[UNMASK::(SUBSTRING([a_type], -1) || SUBSTRING([a_type], 1, LENGTH([a_type]) - 1))], order=[(UNMASK::(SQRT([a_balance]))):desc_first], allow_ties=False) == 1:numeric, columns={'a_balance': a_balance, 'a_type': a_type, 'c_fname': c_fname, 'c_lname': c_lname}) | ||
JOIN(condition=t0.a_custkey == UNMASK::((42 - ([t1.c_key]))), type=INNER, cardinality=SINGULAR_ACCESS, columns={'a_balance': t0.a_balance, 'a_type': t0.a_type, 'c_fname': t1.c_fname, 'c_lname': t1.c_lname}) | ||
SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_custkey': a_custkey, 'a_type': a_type}) | ||
SCAN(table=CRBNK.CUSTOMERS, columns={'c_fname': c_fname, 'c_key': c_key, 'c_lname': c_lname}) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
ROOT(columns=[('branch_key', b_key), ('pct_total_wealth', ROUND(DEFAULT_TO(sum_a_balance, 0:numeric) / RELSUM(args=[DEFAULT_TO(sum_a_balance, 0:numeric)], partition=[], order=[]), 2:numeric))], orderings=[]) | ||
JOIN(condition=t0.b_key == t1.a_branchkey, type=INNER, cardinality=SINGULAR_ACCESS, columns={'b_key': t0.b_key, 'sum_a_balance': t1.sum_a_balance}) | ||
ROOT(columns=[('branch_key', b_key), ('pct_total_wealth', ROUND(DEFAULT_TO(sum_unmask_a_balance, 0:numeric) / RELSUM(args=[DEFAULT_TO(sum_unmask_a_balance, 0:numeric)], partition=[], order=[]), 2:numeric))], orderings=[]) | ||
JOIN(condition=t0.b_key == t1.a_branchkey, type=INNER, cardinality=SINGULAR_ACCESS, columns={'b_key': t0.b_key, 'sum_unmask_a_balance': t1.sum_unmask_a_balance}) | ||
SCAN(table=CRBNK.BRANCHES, columns={'b_key': b_key}) | ||
AGGREGATE(keys={'a_branchkey': a_branchkey}, aggregations={'sum_a_balance': SUM(a_balance)}) | ||
AGGREGATE(keys={'a_branchkey': a_branchkey}, aggregations={'sum_unmask_a_balance': SUM(UNMASK::(SQRT([a_balance])))}) | ||
SCAN(table=CRBNK.ACCOUNTS, columns={'a_balance': a_balance, 'a_branchkey': a_branchkey}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are these asserts required? We are here because
is_masked_column(hybrid_expr) == true
. Am I missing something?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, they are required for
mypy
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ohh, you are right.
Would something like
def is_masked_column(self, expr: HybridExpr) -> TypeGuard[HybridColumnExpr]:
for the return type be OK with mypy, so we don't need the extra assert code just for that? I mean, it is OK since both asserts will always be true... I was just thinking there should be a way to do that for mypy.