Skip to content

Commit 3c1d633

Browse files
committed
PostgreSQL & Snowflake Improvements
1 parent 6b8d0c4 commit 3c1d633

File tree

11 files changed

+31799
-30638
lines changed

11 files changed

+31799
-30638
lines changed

CHANGELOG.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
**v1.3.0**
2+
3+
### Fixes
4+
PostgreSQL:
5+
1. Timezone was moved out from the type definition to the keyword 'with_time_zone'; it can be True (if with time zone) or False (if without).
6+
BigQuery:
7+
1. Previously the range in RANGE_BUCKETS was parsed as columns; now this behaviour is changed and
8+
the range is placed in its own keyword - 'range' (can be an array or a str).
9+
Also, for all `*_TRUNC` partitions like DATETIME_TRUNC, TIMESTAMP_TRUNC, etc., the second argument moved to the arg 'trunc_by'.
10+
11+
### Improvements
12+
PostgreSQL:
13+
1. Added support for PostgreSQL with / without time zone - https://github.com/xnuinside/simple-ddl-parser/issues/250
14+
15+
BigQuery:
16+
1. Added support for GENERATE_ARRAY in RANGE_BUCKETS https://github.com/xnuinside/simple-ddl-parser/issues/183
17+
118
**v1.2.1**
219
### Fixes
320
MySQL:

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,23 @@ for help with debugging & testing support for BigQuery dialect DDLs:
486486

487487

488488
## Changelog
489+
**v1.3.0**
490+
491+
### Fixes
492+
PostgreSQL:
493+
1. Timezone was moved out from type definition to keyword 'with_time_zone' it can be True (if with time zone) or False (if without)
494+
BigQuery:
495+
1. Previously Range in RANGE_BUCKETS was parsed as a columns, now this behaviour is changed and
496+
range placed in own keyword - 'range' (can be array or str).
497+
Also, for all `*_TRUNC` partitions like DATETIME_TRUNC, TIMESTAMP_TRUNC, etc., the second argument moved to the arg 'trunc_by'.
498+
499+
### Improvements
500+
PostgreSQL:
501+
1. Added support for PostgreSQL with / without time zone - https://github.com/xnuinside/simple-ddl-parser/issues/250
502+
503+
BigQuery:
504+
1. Added support for GENERATE_ARRAY in RANGE_BUCKETS https://github.com/xnuinside/simple-ddl-parser/issues/183
505+
489506
**v1.2.1**
490507
### Fixes
491508
MySQL:

docs/README.rst

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,33 @@ for help with debugging & testing support for BigQuery dialect DDLs:
549549
Changelog
550550
---------
551551

552+
**v1.3.0**
553+
554+
Fixes
555+
^^^^^
556+
557+
PostgreSQL:
558+
559+
560+
#. Timezone was moved out from type definition to keyword 'with_time_zone' it can be True (if with time zone) or False (if without)
561+
BigQuery:
562+
#. Previously Range in RANGE_BUCKETS was parsed as a columns, now this behaviour is changed and
563+
range placed in own keyword - 'range' (can be array or str).
564+
Also, for all ``*_TRUNC`` partitions like DATETIME_TRUNC, TIMESTAMP_TRUNC, etc., the second argument moved to the arg 'trunc_by'.
565+
566+
Improvements
567+
^^^^^^^^^^^^
568+
569+
PostgreSQL:
570+
571+
572+
#. Added support for PostgreSQL with / without time zone - https://github.com/xnuinside/simple-ddl-parser/issues/250
573+
574+
BigQuery:
575+
576+
577+
#. Added support for GENERATE_ARRAY in RANGE_BUCKETS https://github.com/xnuinside/simple-ddl-parser/issues/183
578+
552579
**v1.2.1**
553580

554581
Fixes

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "simple-ddl-parser"
3-
version = "1.2.1"
3+
version = "1.3.0"
44
description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL (MSSQL), Oracle, AWS Redshift, Snowflake, MySQL, PostgreSQL, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl."
55
authors = ["Iuliia Volkova <xnuinside@gmail.com>"]
66
license = "MIT"

simple_ddl_parser/dialects/psql.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,13 @@ def p_expr_inherits(self, p: List) -> None:
1313
"table_name": p_list[-1]["table_name"],
1414
}
1515
p[1].update({"inherits": table_identifier})
16+
17+
def p_timezone(self, p: List) -> None:
    """timezone : WITH id id
    | WITHOUT id id"""
    # NOTE: the docstring above is the PLY grammar rule for this
    # production - do not edit it without changing the grammar.
    # A column type ends in "WITH time zone" or "WITHOUT time zone";
    # collapse that into a single boolean flag on the column.
    tokens = remove_par(list(p))
    p[0] = {"with_time_zone": "WITH" in tokens}

simple_ddl_parser/dialects/sql.py

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,56 @@
99

1010

1111
class AfterColumns:
    @staticmethod
    def _parse_range_bucket(
        data: List,
    ) -> Tuple[List[str], Optional[Union[str, List[str]]]]:
        """Split BigQuery RANGE_BUCKET arguments into (columns, range).

        ``data`` is the tail of the parsed production. Two shapes occur:
        ``[columns, ',', 'GENERATE_ARRAY(...)']`` - the range was parsed
        as a single function-call string - or ``[mixed_tokens]`` where an
        inline array literal ``[a, b, c]`` got tokenized in among the
        column names and has to be peeled out here.

        Returns the column names plus the extracted range: a str for a
        function call, a list of str for an array literal, or None when
        no range marker was found.
        """
        # Renamed from `range` to avoid shadowing the builtin.
        range_: Optional[Union[str, List[str]]] = None

        if len(data) == 3:
            # RANGE_BUCKET(col, GENERATE_ARRAY(...)) -> range is a str.
            columns = data[0]
            range_ = data[2]
        else:
            # RANGE_BUCKET(col, [1,2,3]) -> array tokens are mixed into
            # the column list; "[" opens the range, "]" closes it.
            columns = []
            for column in data[0]:
                if "[" in column:
                    range_ = [column.replace("[", "")]
                elif range_:
                    range_.append(column.replace("]", ""))
                else:
                    columns.append(column)
        return columns, range_

    def p_expression_partition_by(self, p: List) -> None:
        """expr : expr PARTITION BY LP pid RP
        | expr PARTITION BY id LP pid RP
        | expr PARTITION BY pid
        | expr PARTITION BY id pid
        | expr PARTITION BY id LP pid COMMA f_call RP
        """
        # NOTE: the docstring above is the PLY grammar rule for this
        # production - do not edit it without changing the grammar.
        p[0] = p[1]
        p_list = remove_par(list(p))
        _type, range_, trunc_by = None, None, None

        if isinstance(p_list[4], list):
            # Plain `PARTITION BY (col, ...)` - columns already parsed.
            columns = p_list[4]
        elif "_TRUNC" in p_list[4]:
            # BigQuery DATE_TRUNC / DATETIME_TRUNC / TIMESTAMP_TRUNC:
            # the last parsed item is the truncation granularity.
            _type = p_list[4]
            trunc_by = p_list[5].pop(-1)
            columns = p_list[5]
        elif p_list[4].upper() == "RANGE_BUCKET":
            # BigQuery RANGE_BUCKET, possibly with GENERATE_ARRAY.
            _type = p_list[4]
            columns, range_ = self._parse_range_bucket(p_list[5:])
        else:
            columns = p_list[-1]
        if not _type and isinstance(p_list[4], str):
            _type = p_list[4]
        p[0]["partition_by"] = {"columns": columns, "type": _type}
        if range_:
            p[0]["partition_by"]["range"] = range_
        if trunc_by:
            p[0]["partition_by"]["trunc_by"] = trunc_by
2762

2863

2964
class Database:
@@ -419,6 +454,7 @@ def p_defcolumn(self, p: List) -> None:
419454
| defcolumn as_virtual
420455
| defcolumn constraint
421456
| defcolumn generated_by
457+
| defcolumn timezone
422458
"""
423459
p[0] = p[1]
424460
p_list = list(p)

simple_ddl_parser/parsetab.py

Lines changed: 31514 additions & 30623 deletions
Large diffs are not rendered by default.

simple_ddl_parser/tokens.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
"POLICY",
5959
"MASKING",
6060
"WITH",
61+
"WITHOUT",
6162
"ORDER",
6263
"NOORDER",
6364
"VISIBLE",

tests/dialects/test_bigquery.py

Lines changed: 127 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,8 @@ def test_table_name_with_project_id():
484484
}
485485
],
486486
"partition_by": {
487-
"columns": ["fiscal_half_year_reporting_week_no", "DAY"],
487+
"columns": ["fiscal_half_year_reporting_week_no"],
488+
"trunc_by": "DAY",
488489
"type": "DATETIME_TRUNC",
489490
},
490491
"partitioned_by": [],
@@ -646,7 +647,8 @@ def test_multiple_options():
646647
{"option_four": '"Four"'},
647648
],
648649
"partition_by": {
649-
"columns": ["fiscal_half_year_reporting_week_no", "DAY"],
650+
"columns": ["fiscal_half_year_reporting_week_no"],
651+
"trunc_by": "DAY",
650652
"type": "DATETIME_TRUNC",
651653
},
652654
"partitioned_by": [],
@@ -869,3 +871,126 @@ def test_bigquery_options_string():
869871
"types": [],
870872
}
871873
assert result == expected
874+
875+
876+
def test_bigquery_partition_range():
    """RANGE_BUCKET with GENERATE_ARRAY exposes the range as a string."""
    ddl = """
CREATE TABLE data.test(
field_a INT OPTIONS(description='some description')
)
PARTITION BY RANGE_BUCKET(field_a, GENERATE_ARRAY(10, 1000, 1));"""

    parsed = DDLParser(ddl).run(output_mode="bigquery")

    field_a = {
        "check": None,
        "default": None,
        "name": "field_a",
        "nullable": True,
        "options": [{"description": "'some description'"}],
        "references": None,
        "size": None,
        "type": "INT",
        "unique": False,
    }
    expected = [
        {
            "alter": {},
            "checks": [],
            "columns": [field_a],
            "dataset": "data",
            "index": [],
            "partition_by": {
                "columns": ["field_a"],
                "range": "GENERATE_ARRAY(10,1000,1)",
                "type": "RANGE_BUCKET",
            },
            "partitioned_by": [],
            "primary_key": [],
            "table_name": "test",
            "tablespace": None,
        }
    ]
    assert parsed == expected
916+
917+
918+
def test_array_range():
    """RANGE_BUCKET with an inline array literal yields a list range."""
    # NOTE(review): the DDL below contains an unbalanced "]]" before the
    # closing paren - presumably exercising lexer tolerance; confirm this
    # is intentional and not a typo in the fixture.
    ddl = """CREATE TABLE data.test(
field_a INT OPTIONS(description='some description')
)
PARTITION BY RANGE_BUCKET(field_a, [1,2,3]]) ;"""

    parsed = DDLParser(ddl).run(output_mode="bigquery")

    field_a = {
        "check": None,
        "default": None,
        "name": "field_a",
        "nullable": True,
        "options": [{"description": "'some description'"}],
        "references": None,
        "size": None,
        "type": "INT",
        "unique": False,
    }
    expected = [
        {
            "alter": {},
            "checks": [],
            "columns": [field_a],
            "dataset": "data",
            "index": [],
            "partition_by": {
                "columns": ["field_a"],
                "range": ["1", "2", "3"],
                "type": "RANGE_BUCKET",
            },
            "partitioned_by": [],
            "primary_key": [],
            "table_name": "test",
            "tablespace": None,
        }
    ]
    assert expected == parsed
956+
957+
958+
def test_date_trunc():
    """DATE_TRUNC's second argument lands in 'trunc_by', not in columns."""
    ddl = """CREATE TABLE data.test(
field_a INT OPTIONS(description='some description')
)
PARTITION BY DATE_TRUNC(field, MONTH);"""

    parsed = DDLParser(ddl).run(output_mode="bigquery")

    field_a = {
        "check": None,
        "default": None,
        "name": "field_a",
        "nullable": True,
        "options": [{"description": "'some description'"}],
        "references": None,
        "size": None,
        "type": "INT",
        "unique": False,
    }
    expected = [
        {
            "alter": {},
            "checks": [],
            "columns": [field_a],
            "dataset": "data",
            "index": [],
            "partition_by": {
                "columns": ["field"],
                "trunc_by": "MONTH",
                "type": "DATE_TRUNC",
            },
            "partitioned_by": [],
            "primary_key": [],
            "table_name": "test",
            "tablespace": None,
        }
    ]
    assert parsed == expected

tests/dialects/test_psql.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,36 @@ def test_cast_generated():
8686
}
8787
]
8888
assert expected == result
89+
90+
91+
def test_with_time_zone():
    """PostgreSQL 'timestamp with time zone' sets with_time_zone=True."""
    ddl = """
CREATE TABLE public.test (date_updated timestamp with time zone);"""

    parsed = DDLParser(ddl).run(output_mode="postgres")

    date_updated = {
        "check": None,
        "default": None,
        "name": "date_updated",
        "nullable": True,
        "references": None,
        "size": None,
        "type": "timestamp",
        "unique": False,
        "with_time_zone": True,
    }
    expected = [
        {
            "alter": {},
            "checks": [],
            "columns": [date_updated],
            "index": [],
            "partitioned_by": [],
            "primary_key": [],
            "schema": "public",
            "table_name": "test",
            "tablespace": None,
        }
    ]
    assert expected == parsed

0 commit comments

Comments
 (0)