Feat!: ensure JSON_FORMAT type is JSON when targeting Presto (#4968)

georgesittas · web-flow · commit 400ea54d3a9c · 2025-04-11T16:12:52.000+03:00
diff --git a/sqlglot/dialects/bigquery.py b/sqlglot/dialects/bigquery.py
@@ -800,7 +800,7 @@ def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
             if unnest_expr:
                 from sqlglot.optimizer.annotate_types import annotate_types
 
-                unnest_expr = annotate_types(unnest_expr)
+                unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)
 
                 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
                 # in contrast to other dialects such as DuckDB which flattens only the array by default
@@ -1227,7 +1227,7 @@ def bracket_sql(self, expression: exp.Bracket) -> str:
                 if arg.type is None:
                     from sqlglot.optimizer.annotate_types import annotate_types
 
-                    arg = annotate_types(arg)
+                    arg = annotate_types(arg, dialect=self.dialect)
 
                 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                     # BQ doesn't support bracket syntax with string values for structs
diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py
@@ -1303,7 +1303,9 @@ def no_timestamp_sql(self: Generator, expression: exp.Timestamp) -> str:
     if not zone:
         from sqlglot.optimizer.annotate_types import annotate_types
 
-        target_type = annotate_types(expression).type or exp.DataType.Type.TIMESTAMP
+        target_type = (
+            annotate_types(expression, dialect=self.dialect).type or exp.DataType.Type.TIMESTAMP
+        )
         return self.sql(exp.cast(expression.this, target_type))
     if zone.name.lower() in TIMEZONES:
         return self.sql(
@@ -1870,7 +1872,7 @@ def build_timetostr_or_tochar(args: t.List, dialect: Dialect) -> exp.TimeToStr |
     if this and not this.type:
         from sqlglot.optimizer.annotate_types import annotate_types
 
-        annotate_types(this)
+        annotate_types(this, dialect=dialect)
         if this.is_type(*exp.DataType.TEMPORAL_TYPES):
             dialect_name = dialect.__class__.__name__.lower()
             return build_formatted_time(exp.TimeToStr, dialect_name, default=True)(args)
diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py
@@ -1008,7 +1008,7 @@ def bracket_sql(self, expression: exp.Bracket) -> str:
                 if not this.type:
                     from sqlglot.optimizer.annotate_types import annotate_types
 
-                    this = annotate_types(this)
+                    this = annotate_types(this, dialect=self.dialect)
 
                 if this.is_type(exp.DataType.Type.MAP):
                     bracket = f"({bracket})[1]"
@@ -1042,7 +1042,7 @@ def length_sql(self, expression: exp.Length) -> str:
             if not arg.type:
                 from sqlglot.optimizer.annotate_types import annotate_types
 
-                arg = annotate_types(arg)
+                arg = annotate_types(arg, dialect=self.dialect)
 
             if arg.is_type(*exp.DataType.TEXT_TYPES):
                 return self.func("LENGTH", arg)
diff --git a/sqlglot/dialects/postgres.py b/sqlglot/dialects/postgres.py
@@ -41,6 +41,9 @@
 from sqlglot.parser import binary_range_parser
 from sqlglot.tokens import TokenType
 
+if t.TYPE_CHECKING:
+    from sqlglot.dialects.dialect import DialectType
+
 
 DATE_DIFF_FACTOR = {
     "MICROSECOND": " * 1000000",
@@ -191,7 +194,7 @@ def _generate(self: Postgres.Generator, expression: JSON_EXTRACT_TYPE) -> str:
     return _generate
 
 
-def _build_regexp_replace(args: t.List) -> exp.RegexpReplace:
+def _build_regexp_replace(args: t.List, dialect: DialectType = None) -> exp.RegexpReplace:
     # The signature of REGEXP_REPLACE is:
     # regexp_replace(source, pattern, replacement [, start [, N ]] [, flags ])
     #
@@ -204,7 +207,7 @@ def _build_regexp_replace(args: t.List) -> exp.RegexpReplace:
             if not last.type or last.is_type(exp.DataType.Type.UNKNOWN, exp.DataType.Type.NULL):
                 from sqlglot.optimizer.annotate_types import annotate_types
 
-                last = annotate_types(last)
+                last = annotate_types(last, dialect=dialect)
 
             if last.is_type(*exp.DataType.TEXT_TYPES):
                 regexp_replace = exp.RegexpReplace.from_arg_list(args[:-1])
@@ -657,7 +660,7 @@ def unnest_sql(self, expression: exp.Unnest) -> str:
 
                 from sqlglot.optimizer.annotate_types import annotate_types
 
-                this = annotate_types(arg)
+                this = annotate_types(arg, dialect=self.dialect)
                 if this.is_type("array<json>"):
                     while isinstance(this, exp.Cast):
                         this = this.this
diff --git a/sqlglot/dialects/presto.py b/sqlglot/dialects/presto.py
@@ -332,6 +332,9 @@ class Parser(parser.Parser):
             "FROM_UTF8": lambda args: exp.Decode(
                 this=seq_get(args, 0), replace=seq_get(args, 1), charset=exp.Literal.string("utf-8")
             ),
+            "JSON_FORMAT": lambda args: exp.JSONFormat(
+                this=seq_get(args, 0), options=seq_get(args, 1), is_json=True
+            ),
             "LEVENSHTEIN_DISTANCE": exp.Levenshtein.from_arg_list,
             "NOW": exp.CurrentTimestamp.from_arg_list,
             "REGEXP_EXTRACT": build_regexp_extract(exp.RegexpExtract),
@@ -582,13 +585,27 @@ class Generator(generator.Generator):
             "with",
         }
 
+        def jsonformat_sql(self, expression: exp.JSONFormat) -> str:
+            """Generate JSON_FORMAT, casting its argument to JSON unless it is already JSON."""
+            this = expression.this
+            # Skip when there is no argument (it is optional) or it is flagged via `is_json`
+            if this and not expression.args.get("is_json"):
+                if not this.type:
+                    from sqlglot.optimizer.annotate_types import annotate_types
+
+                    this = annotate_types(this, dialect=self.dialect)
+                if not this.is_type(exp.DataType.Type.JSON):
+                    this.replace(exp.cast(this, exp.DataType.Type.JSON))
+
+            return self.function_fallback_sql(expression)
+
         def md5_sql(self, expression: exp.MD5) -> str:
             this = expression.this
 
             if not this.type:
                 from sqlglot.optimizer.annotate_types import annotate_types
 
-                this = annotate_types(this)
+                this = annotate_types(this, dialect=self.dialect)
 
             if this.is_type(*exp.DataType.TEXT_TYPES):
                 this = exp.Encode(this=this, charset=exp.Literal.string("utf-8"))
@@ -630,6 +647,7 @@ def bracket_sql(self, expression: exp.Bracket) -> str:
                             expression.this,
                             expression.expressions,
                             1 - expression.args.get("offset", 0),
+                            dialect=self.dialect,
                         ),
                         0,
                     ),
@@ -639,7 +657,7 @@ def bracket_sql(self, expression: exp.Bracket) -> str:
         def struct_sql(self, expression: exp.Struct) -> str:
             from sqlglot.optimizer.annotate_types import annotate_types
 
-            expression = annotate_types(expression)
+            expression = annotate_types(expression, dialect=self.dialect)
             values: t.List[str] = []
             schema: t.List[str] = []
             unknown_type = False
diff --git a/sqlglot/dialects/snowflake.py b/sqlglot/dialects/snowflake.py
@@ -1172,7 +1172,7 @@ def trycast_sql(self, expression: exp.TryCast) -> str:
             if value.type is None:
                 from sqlglot.optimizer.annotate_types import annotate_types
 
-                value = annotate_types(value)
+                value = annotate_types(value, dialect=self.dialect)
 
             if value.is_type(*exp.DataType.TEXT_TYPES, exp.DataType.Type.UNKNOWN):
                 return super().trycast_sql(expression)
diff --git a/sqlglot/executor/__init__.py b/sqlglot/executor/__init__.py
@@ -65,7 +65,9 @@ def execute(
 
             for column in table.columns:
                 value = table[0][column]
-                column_type = annotate_types(exp.convert(value)).type or type(value).__name__
+                column_type = (
+                    annotate_types(exp.convert(value), dialect=read).type or type(value).__name__
+                )
                 nested_set(schema, [*keys, column], column_type)
 
     schema = ensure_schema(schema, dialect=read)
diff --git a/sqlglot/expressions.py b/sqlglot/expressions.py
@@ -6306,7 +6306,7 @@ class JSONBExtractScalar(Binary, Func):
 
 
 class JSONFormat(Func):
-    arg_types = {"this": False, "options": False}
+    arg_types = {"this": False, "options": False, "is_json": False}
     _sql_names = ["JSON_FORMAT"]
 
 
diff --git a/sqlglot/generator.py b/sqlglot/generator.py
@@ -2807,6 +2807,7 @@ def bracket_offset_expressions(
             expression.this,
             expression.expressions,
             (index_offset or self.dialect.INDEX_OFFSET) - expression.args.get("offset", 0),
+            dialect=self.dialect,
         )
 
     def bracket_sql(self, expression: exp.Bracket) -> str:
@@ -4018,7 +4019,7 @@ def toarray_sql(self, expression: exp.ToArray) -> str:
         if not arg.type:
             from sqlglot.optimizer.annotate_types import annotate_types
 
-            arg = annotate_types(arg)
+            arg = annotate_types(arg, dialect=self.dialect)
 
         if arg.is_type(exp.DataType.Type.ARRAY):
             return self.sql(arg)
diff --git a/sqlglot/helper.py b/sqlglot/helper.py
@@ -15,6 +15,7 @@
 if t.TYPE_CHECKING:
     from sqlglot import exp
     from sqlglot._typing import A, E, T
+    from sqlglot.dialects.dialect import DialectType
     from sqlglot.expressions import Expression
 
 
@@ -150,6 +151,7 @@ def apply_index_offset(
     this: exp.Expression,
     expressions: t.List[E],
     offset: int,
+    dialect: DialectType = None,
 ) -> t.List[E]:
     """
     Applies an offset to a given integer literal expression.
@@ -158,6 +160,7 @@ def apply_index_offset(
         this: The target of the index.
         expressions: The expression the offset will be applied to, wrapped in a list.
         offset: The offset that will be applied.
+        dialect: The dialect used when annotating the types of `this` and the index expression.
 
     Returns:
         The original expression with the offset applied to it, wrapped in a list. If the provided
@@ -173,7 +176,7 @@ def apply_index_offset(
     from sqlglot.optimizer.simplify import simplify
 
     if not this.type:
-        annotate_types(this)
+        annotate_types(this, dialect=dialect)
 
     if t.cast(exp.DataType, this.type).this not in (
         exp.DataType.Type.UNKNOWN,
@@ -182,7 +185,7 @@ def apply_index_offset(
         return expressions
 
     if not expression.type:
-        annotate_types(expression)
+        annotate_types(expression, dialect=dialect)
 
     if t.cast(exp.DataType, expression.type).this in exp.DataType.INTEGER_TYPES:
         logger.info("Applying array index offset (%s)", offset)
diff --git a/sqlglot/optimizer/canonicalize.py b/sqlglot/optimizer/canonicalize.py
@@ -23,7 +23,7 @@ def canonicalize(expression: exp.Expression, dialect: DialectType = None) -> exp
 
     def _canonicalize(expression: exp.Expression) -> exp.Expression:
         expression = add_text_to_concat(expression)
-        expression = replace_date_funcs(expression)
+        expression = replace_date_funcs(expression, dialect=dialect)
         expression = coerce_type(expression, dialect.PROMOTE_TO_INFERRED_DATETIME_TYPE)
         expression = remove_redundant_casts(expression)
         expression = ensure_bools(expression, _replace_int_predicate)
@@ -39,7 +39,7 @@ def add_text_to_concat(node: exp.Expression) -> exp.Expression:
     return node
 
 
-def replace_date_funcs(node: exp.Expression) -> exp.Expression:
+def replace_date_funcs(node: exp.Expression, dialect: DialectType) -> exp.Expression:
     if (
         isinstance(node, (exp.Date, exp.TsOrDsToDate))
         and not node.expressions
@@ -52,7 +52,7 @@ def replace_date_funcs(node: exp.Expression) -> exp.Expression:
         if not node.type:
             from sqlglot.optimizer.annotate_types import annotate_types
 
-            node = annotate_types(node)
+            node = annotate_types(node, dialect=dialect)
         return exp.cast(node.this, to=node.type or exp.DataType.Type.TIMESTAMP)
 
     return node
diff --git a/sqlglot/parser.py b/sqlglot/parser.py
@@ -6153,7 +6153,9 @@ def _parse_bracket(self, this: t.Optional[exp.Expression] = None) -> t.Optional[
                     dialect=self.dialect,
                 )
 
-            expressions = apply_index_offset(this, expressions, -self.dialect.INDEX_OFFSET)
+            expressions = apply_index_offset(
+                this, expressions, -self.dialect.INDEX_OFFSET, dialect=self.dialect
+            )
             this = self.expression(exp.Bracket, this=this, expressions=expressions)
 
         self._add_comments(this)
diff --git a/tests/dialects/test_bigquery.py b/tests/dialects/test_bigquery.py
@@ -1404,11 +1404,13 @@ def test_bigquery(self):
         )
         self.validate_all(
             "TO_JSON_STRING(x)",
-            read={"bigquery": "TO_JSON_STRING(x)"},
+            read={
+                "bigquery": "TO_JSON_STRING(x)",
+            },
             write={
                 "bigquery": "TO_JSON_STRING(x)",
                 "duckdb": "CAST(TO_JSON(x) AS TEXT)",
-                "presto": "JSON_FORMAT(x)",
+                "presto": "JSON_FORMAT(CAST(x AS JSON))",
                 "spark": "TO_JSON(x)",
             },
         )
diff --git a/tests/dialects/test_presto.py b/tests/dialects/test_presto.py
@@ -1021,9 +1021,6 @@ def test_presto(self):
         )
         self.validate_all(
             "JSON_FORMAT(x)",
-            read={
-                "spark": "TO_JSON(x)",
-            },
             write={
                 "bigquery": "TO_JSON_STRING(x)",
                 "duckdb": "CAST(TO_JSON(x) AS TEXT)",
diff --git a/tests/dialects/test_spark.py b/tests/dialects/test_spark.py
@@ -308,6 +308,15 @@ def test_spark(self):
             "SELECT STR_TO_MAP('a:1,b:2,c:3')",
             "SELECT STR_TO_MAP('a:1,b:2,c:3', ',', ':')",
         )
+
+        self.validate_all(
+            "SELECT TO_JSON(STRUCT('blah' AS x)) AS y",
+            write={
+                "presto": "SELECT JSON_FORMAT(CAST(CAST(ROW('blah') AS ROW(x VARCHAR)) AS JSON)) AS y",
+                "spark": "SELECT TO_JSON(STRUCT('blah' AS x)) AS y",
+                "trino": "SELECT JSON_FORMAT(CAST(CAST(ROW('blah') AS ROW(x VARCHAR)) AS JSON)) AS y",
+            },
+        )
         self.validate_all(
             "SELECT TRY_ELEMENT_AT(ARRAY(1, 2, 3), 2)",
             read={
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
@@ -55,13 +55,10 @@ def simplify(expression, **kwargs):
 
 
 def annotate_functions(expression, **kwargs):
-    from sqlglot.dialects import Dialect
-
     dialect = kwargs.get("dialect")
     schema = kwargs.get("schema")
 
-    annotators = Dialect.get_or_raise(dialect).ANNOTATORS
-    annotated = annotate_types(expression, annotators=annotators, schema=schema)
+    annotated = annotate_types(expression, dialect=dialect, schema=schema)
 
     return annotated.expressions[0]