37
37
import keyword
38
38
import re
39
39
from abc import ABC , abstractmethod
40
+ from enum import Enum , auto
40
41
41
42
import numpy as np
42
43
@@ -167,28 +168,125 @@ class ValidSQLName(PyDoughPredicate):
167
168
as the name for a SQL table path/column name.
168
169
"""
169
170
170
- # Regex for unquoted SQL identifiers
171
- _UNQUOTED_SQL_IDENTIFIER = re .compile (
172
- r"^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$"
173
- )
171
# Pattern for one segment of an unquoted SQL name: a letter or underscore
# followed by letters, digits or underscores. Dots are not accepted here.
UNQUOTED_SQL_IDENTIFIER = re.compile(
    r"^[A-Za-z_][A-Za-z0-9_]*$"
)
"""
Regex pattern for a single-part unquoted SQL identifier (without dots).
"""
174
176
175
177
def __init__(self):
    """Set up the lookup from error code to its message fragment."""
    messages: dict[str, str] = {}
    # Raised when the name (or one of its dotted parts) is not a valid
    # SQL identifier.
    messages["identifier"] = "must have a SQL name that is a valid SQL identifier"
    # Raised when the name (or one of its dotted parts) is a reserved word.
    messages["sql_keyword"] = "must have a SQL name that is not a reserved word"
    self.error_messages: dict[str, str] = messages
180
182
183
def _split_identifier(self, name: str) -> list[str]:
    """
    Split a potentially qualified SQL identifier into its dot-separated parts.

    Behavior:
    - A dot (.) outside of a quoted section separates two parts.
    - A double quote (") or backtick (`) that is the first character of a
      part opens a quoted section, which runs until the matching closing
      character. A doubled quote character ("" or ``) inside the section
      is an escaped literal and does not close it.
    - Dots inside a quoted section are literal and do not split.
    - Returned parts keep their surrounding quotes/backticks, since quoted
      and unquoted names are validated differently afterwards.
    - Empty parts are returned for leading, trailing or doubled dots; the
      caller decides whether they are acceptable.
    - After a quoted section closes, any following characters up to the
      next dot stay in the same part. A quote character that is not the
      first character of a part is treated as an ordinary character.
    - An unterminated quoted section extends to the end of the string.

    Args:
        `name`: the (possibly qualified, possibly quoted) SQL name.

    Returns:
        The list of parts, in order. Always contains at least one element
        (a single empty string for empty input).

    Examples (input -> result):
        'schema.table'      -> ['schema', 'table']
        '"foo"."bar"'       -> ['"foo"', '"bar"']
        'db."table.name"'   -> ['db', '"table.name"']
        '`a``b`.`c``d`'     -> ['`a``b`', '`c``d`']
        '.field'            -> ['', 'field']
        'field.'            -> ['field', '']
        'db..tbl'           -> ['db', '', 'tbl']
    """
    parts: list[str] = []
    part_start: int = 0
    # The quote character of the section currently open, or None when the
    # scanner is outside any quoted section.
    open_quote: str | None = None
    end: int = len(name)
    pos: int = 0

    while pos < end:
        ch: str = name[pos]
        if open_quote is None:
            if ch == ".":
                # Part boundary: emit everything since the previous one.
                parts.append(name[part_start:pos])
                part_start = pos + 1
            elif pos == part_start and ch in ('"', "`"):
                # Quotes only open a section at the very start of a part.
                open_quote = ch
        elif ch == open_quote:
            if pos + 1 < end and name[pos + 1] == open_quote:
                # Doubled quote: escaped literal, skip its second half too.
                pos += 1
            else:
                open_quote = None
        pos += 1

    # Whatever follows the last dot (possibly nothing) is the final part.
    parts.append(name[part_start:])
    return parts
273
+
181
274
def _error_code (self , obj : object ) -> str | None :
182
275
"""Return an error code if invalid, or None if valid."""
183
276
ret_val : str | None = None
184
277
# Check that obj is a string
185
278
if isinstance (obj , str ):
186
- # Check that obj is a valid SQL identifier
187
- if not self .is_valid_sql_identifier (obj ):
188
- ret_val = "identifier"
189
- # Check that obj is not a SQL reserved word
190
- elif self ._is_sql_keyword (obj ):
191
- ret_val = "sql_keyword"
279
+ # Check each part of a qualified name: db.schema.table
280
+ for part in self ._split_identifier (obj ):
281
+ # Check that obj is a valid SQL identifier
282
+ # Empty parts (e.g., leading/trailing dots) are invalid
283
+ if not part or not self .is_valid_sql_identifier (part ):
284
+ ret_val = "identifier"
285
+ break
286
+ # Check that obj is not a SQL reserved word
287
+ if self ._is_sql_keyword (part ):
288
+ ret_val = "sql_keyword"
289
+ break
192
290
else :
193
291
ret_val = "identifier"
194
292
@@ -209,7 +307,7 @@ def is_valid_sql_identifier(self, name: str) -> bool:
209
307
return False
210
308
211
309
# Case 1: unquoted
212
- if self ._UNQUOTED_SQL_IDENTIFIER .match (name ):
310
+ if self .UNQUOTED_SQL_IDENTIFIER .match (name ):
213
311
return True
214
312
215
313
# Case 2: double quoted
0 commit comments