fixed handling of quoted identifiers (#166)

ronanstokes-db · web-flow · commit ef04c7b7a669 · 2023-03-05T20:05:13.000-08:00
diff --git a/dbldatagen/schema_parser.py b/dbldatagen/schema_parser.py
@@ -273,6 +273,8 @@ def _cleanseSQL(cls, sql_string):
 
         # skip over quoted identifiers even if they contain quotes
         quoted_ident = pp.QuotedString(quoteChar="`", escQuote="``")
+        # QuotedString returns the name unquoted with `` collapsed to `, so restore both when re-emitting it
+        quoted_ident.set_parse_action(lambda s, loc, toks: "`" + toks[0].replace("`", "``") + "`")
 
         stringForm1 = pp.Literal('r') + pp.QuotedString(quoteChar="'")
         stringForm2 = pp.Literal('r') + pp.QuotedString(quoteChar='"')
diff --git a/tests/test_build_planning.py b/tests/test_build_planning.py
@@ -379,6 +379,21 @@ def test_expr_attribute(self):
 
         assert columnSpec.expr == sql_expr
 
+    def test_expr_identifier_with_spaces(self):
+        """A column whose name contains a space keeps its ``expr`` exactly as supplied."""
+        struct_expr = "named_struct('name', city_name, 'id', city_id, 'population', city_pop)"
+        generator = (
+            dg.DataGenerator(sparkSession=spark, name="nested_schema", rows=1000, partitions=4,
+                             seedColumnName="_id")
+            .withColumn("id", "long", minValue=1000000, uniqueValues=10000, random=True)
+            .withColumn("city_name", "string", template=r"\w", random=True, omit=True)
+            .withColumn("city_id", "long", minValue=1000000, uniqueValues=10000, random=True, omit=True)
+            .withColumn("city_pop", "long", minValue=1000000, uniqueValues=10000, random=True, omit=True)
+            .withColumn("city 2", "struct<name:string, id:long, population:long>", expr=struct_expr)
+        )
+        spaced_column_spec = generator.getColumnSpec("city 2")
+        assert spaced_column_spec.expr == struct_expr
+
     def test_build_ordering_duplicate_names1(self):
         gen1 = dg.DataGenerator(sparkSession=spark, name="nested_schema", rows=1000, partitions=4,
                                      seedColumnName="_id") \
diff --git a/tests/test_schema_parser.py b/tests/test_schema_parser.py
@@ -122,13 +122,20 @@ def test_table_definition_parser(self, setupLogging):
     @pytest.mark.parametrize("sqlExpr, expectedText",
                              [("named_struct('name', city_name, 'id', city_id, 'population', city_pop)",
                                "named_struct(' ', city_name, ' ', city_id, ' ', city_pop)"),
+                              ("named_struct('name', `city 2`, 'id', city_id, 'population', city_pop)",
+                               "named_struct(' ', `city 2`, ' ', city_id, ' ', city_pop)"),
+                              ("named_struct('`name 1`', `city 2`, 'id', city_id, 'population', city_pop)",
+                               "named_struct(' ', `city 2`, ' ', city_id, ' ', city_pop)"),
+                              ("named_struct('`name 1`', city, 'id', city_id, 'population', city_pop)",
+                               "named_struct(' ', city, ' ', city_id, ' ', city_pop)"),
                               ("cast(10 as decimal(10)",
                                "cast(10 as decimal(10)"),
                               (" ", " "),
                               ("", ""),
                               ])
     def test_sql_expression_cleanser(self, sqlExpr, expectedText):
         newSql = dg.SchemaParser._cleanseSQL(sqlExpr)
+        # either this case expects no change, or cleansing must have altered the input text
         assert sqlExpr == expectedText or sqlExpr != newSql
 
         assert newSql == expectedText
@@ -142,6 +149,7 @@ def test_sql_expression_cleanser(self, sqlExpr, expectedText):
                                ['city_name', 'city_pop']),
                                ("cast(10 as decimal(10)",  ['cast', 'as', 'decimal'], None),
                               ("cast(x as decimal(10)", ['x'], ['x']),
+                              ("cast(`city 2` as decimal(10)", ['cast', 'city 2', 'as', 'decimal'], None),
                               (" ", [], None),
                               ("", [], None),
                               ])