Skip to content

Commit ef04c7b

Browse files
fixed handling of quoted identifiers (#166)
1 parent 431d865 commit ef04c7b

File tree

3 files changed

+24
-0
lines changed

3 files changed

+24
-0
lines changed

dbldatagen/schema_parser.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -273,6 +273,7 @@ def _cleanseSQL(cls, sql_string):
273273

274274
# skip over quoted identifiers even if they contain quotes
275275
quoted_ident = pp.QuotedString(quoteChar="`", escQuote="``")
276+
quoted_ident.set_parse_action(lambda s, loc, toks: f"`{toks[0]}`")
276277

277278
stringForm1 = pp.Literal('r') + pp.QuotedString(quoteChar="'")
278279
stringForm2 = pp.Literal('r') + pp.QuotedString(quoteChar='"')

tests/test_build_planning.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -379,6 +379,21 @@ def test_expr_attribute(self):
379379

380380
assert columnSpec.expr == sql_expr
381381

382+
def test_expr_identifier_with_spaces(self):
383+
sql_expr = "named_struct('name', city_name, 'id', city_id, 'population', city_pop)"
384+
gen1 = dg.DataGenerator(sparkSession=spark, name="nested_schema", rows=1000, partitions=4,
385+
seedColumnName="_id") \
386+
.withColumn("id", "long", minValue=1000000, uniqueValues=10000, random=True) \
387+
.withColumn("city_name", "string", template=r"\w", random=True, omit=True) \
388+
.withColumn("city_id", "long", minValue=1000000, uniqueValues=10000, random=True, omit=True) \
389+
.withColumn("city_pop", "long", minValue=1000000, uniqueValues=10000, random=True, omit=True) \
390+
.withColumn("city 2", "struct<name:string, id:long, population:long>",
391+
expr=sql_expr)
392+
393+
columnSpec = gen1.getColumnSpec("city 2")
394+
395+
assert columnSpec.expr == sql_expr
396+
382397
def test_build_ordering_duplicate_names1(self):
383398
gen1 = dg.DataGenerator(sparkSession=spark, name="nested_schema", rows=1000, partitions=4,
384399
seedColumnName="_id") \

tests/test_schema_parser.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,13 +122,20 @@ def test_table_definition_parser(self, setupLogging):
122122
@pytest.mark.parametrize("sqlExpr, expectedText",
123123
[("named_struct('name', city_name, 'id', city_id, 'population', city_pop)",
124124
"named_struct(' ', city_name, ' ', city_id, ' ', city_pop)"),
125+
("named_struct('name', `city 2`, 'id', city_id, 'population', city_pop)",
126+
"named_struct(' ', `city 2`, ' ', city_id, ' ', city_pop)"),
127+
("named_struct('`name 1`', `city 2`, 'id', city_id, 'population', city_pop)",
128+
"named_struct(' ', `city 2`, ' ', city_id, ' ', city_pop)"),
129+
("named_struct('`name 1`', city, 'id', city_id, 'population', city_pop)",
130+
"named_struct(' ', city, ' ', city_id, ' ', city_pop)"),
125131
("cast(10 as decimal(10)",
126132
"cast(10 as decimal(10)"),
127133
(" ", " "),
128134
("", ""),
129135
])
130136
def test_sql_expression_cleanser(self, sqlExpr, expectedText):
131137
newSql = dg.SchemaParser._cleanseSQL(sqlExpr)
138+
print(newSql)
132139
assert sqlExpr == expectedText or sqlExpr != newSql
133140

134141
assert newSql == expectedText
@@ -142,6 +149,7 @@ def test_sql_expression_cleanser(self, sqlExpr, expectedText):
142149
['city_name', 'city_pop']),
143150
("cast(10 as decimal(10)", ['cast', 'as', 'decimal'], None),
144151
("cast(x as decimal(10)", ['x'], ['x']),
152+
("cast(`city 2` as decimal(10)", ['cast', 'city 2', 'as', 'decimal'], None),
145153
(" ", [], None),
146154
("", [], None),
147155
])

0 commit comments

Comments (0)