Skip to content

Commit e52c486

Browse files
committed
Improve metadata error messages and validations
Improve metadata error messages Improve metadata validations Check for reserved python keywords on metadata names Fix metadata for broker Check for Pydough and SQL reserved words Add validation for SQL names in TablePath & ColumnName Apply code review observations
1 parent 7860cc5 commit e52c486

File tree

11 files changed

+586
-23
lines changed

11 files changed

+586
-23
lines changed

pydough/errors/error_utils.py

Lines changed: 185 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,15 @@
2727
"is_positive_int",
2828
"is_string",
2929
"is_valid_name",
30+
"is_valid_sql_name",
3031
"simple_join_keys_predicate",
3132
"unique_properties_predicate",
3233
]
3334

3435

36+
import builtins
37+
import keyword
38+
import re
3539
from abc import ABC, abstractmethod
3640

3741
import numpy as np
@@ -97,11 +101,189 @@ class ValidName(PyDoughPredicate):
97101
as the name of a PyDough graph/collection/property.
98102
"""
99103

104+
def __init__(self):
105+
self.error_messages: dict[str, str] = {
106+
"identifier": "must be a string that is a valid Python identifier",
107+
"python_keyword": "must be a string that is not a Python reserved word or built-in name",
108+
"pydough_keyword": "must be a string that is not a PyDough reserved word",
109+
"sql_keyword": "must be a string that is not a SQL reserved word",
110+
}
111+
112+
def _error_code(self, obj: object) -> str | None:
113+
"""Return an error code if invalid, or None if valid."""
114+
ret_val: str | None = None
115+
# Check that obj is a string
116+
if isinstance(obj, str):
117+
# Check that obj is a valid Python identifier
118+
if not obj.isidentifier():
119+
ret_val = "identifier"
120+
# Check that obj is not a Python reserved word or built-in name
121+
elif keyword.iskeyword(obj) or hasattr(builtins, obj):
122+
ret_val = "python_keyword"
123+
# Check that obj is not a PyDough reserved word
124+
elif self._is_pydough_keyword(obj):
125+
ret_val = "pydough_keyword"
126+
else:
127+
ret_val = "identifier"
128+
129+
return ret_val
130+
131+
def _is_pydough_keyword(self, name: str) -> bool:
132+
"""
133+
helper: Verifies if name is a PyDough reserved word.
134+
Extend with new PyDough reserved words if required.
135+
"""
136+
# Dictionary of all registered operators pre-built from the PyDough source
137+
from pydough.pydough_operators import builtin_registered_operators
138+
139+
# Set of collection operators
140+
PYDOUGH_RESERVED: set[str] = {
141+
"CALCULATE",
142+
"WHERE",
143+
"ORDER_BY",
144+
"TOP_K",
145+
"PARTITION",
146+
"SINGULAR",
147+
"BEST",
148+
"CROSS",
149+
}
150+
return (name in PYDOUGH_RESERVED) or (name in builtin_registered_operators())
151+
100152
def accept(self, obj: object) -> bool:
101-
return isinstance(obj, str) and obj.isidentifier()
153+
return self._error_code(obj) is None
102154

103155
def error_message(self, error_name: str) -> str:
104-
return f"{error_name} must be a string that is a Python identifier"
156+
# Generic fallback (since we don't have the object here)
157+
return f"{error_name} must be a valid identifier and not a reserved word"
158+
159+
def verify(self, obj: object, error_name: str) -> None:
160+
code: str | None = self._error_code(obj)
161+
if code is not None:
162+
raise PyDoughMetadataException(f"{error_name} {self.error_messages[code]}")
163+
164+
165+
class ValidSQLName(PyDoughPredicate):
166+
"""Predicate class to check that an object is a string that can be used
167+
as the name for a SQL table path/column name.
168+
"""
169+
170+
# Regex for unquoted SQL identifiers
171+
_UNQUOTED_SQL_IDENTIFIER = re.compile(
172+
r"^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$"
173+
)
174+
175+
def __init__(self):
176+
self.error_messages: dict[str, str] = {
177+
"identifier": "must have a SQL name that is a valid SQL identifier",
178+
"sql_keyword": "must have a SQL name that is not a reserved word",
179+
}
180+
181+
def _error_code(self, obj: object) -> str | None:
182+
"""Return an error code if invalid, or None if valid."""
183+
ret_val: str | None = None
184+
# Check that obj is a string
185+
if isinstance(obj, str):
186+
# Check that obj is a valid SQL identifier
187+
if not self.is_valid_sql_identifier(obj):
188+
ret_val = "identifier"
189+
# Check that obj is not a SQL reserved word
190+
elif self._is_sql_keyword(obj):
191+
ret_val = "sql_keyword"
192+
else:
193+
ret_val = "identifier"
194+
195+
return ret_val
196+
197+
def is_valid_sql_identifier(self, name: str) -> bool:
198+
"""
199+
Check if a string is a valid SQL identifier.
200+
201+
- Unquoted: starts with letter/underscore, then letters, digits,
202+
underscores.
203+
- Double-quoted: allows any chars, but " "" " is the only valid way to
204+
include a double-quote char.
205+
- Backtick-quoted: allows any chars, but `` `` `` is the only valid
206+
way to include a backtick char.
207+
"""
208+
if not name:
209+
return False
210+
211+
# Case 1: unquoted
212+
if self._UNQUOTED_SQL_IDENTIFIER.match(name):
213+
return True
214+
215+
# Case 2: double quoted
216+
if name.startswith('"') and name.endswith('"'):
217+
inner = name[1:-1]
218+
# Any " must be escaped as ""
219+
return '"' not in inner.replace('""', "")
220+
221+
# Case 3: backtick quoted
222+
if name.startswith("`") and name.endswith("`"):
223+
inner = name[1:-1]
224+
# Any ` must be escaped as ``
225+
return "`" not in inner.replace("``", "")
226+
227+
return False
228+
229+
# fmt: off
230+
SQL_RESERVED_KEYWORDS: set[str] = {
231+
# Query & DML
232+
"select", "from", "where", "group", "having", "distinct", "as",
233+
"join", "inner", "union", "intersect", "except",
234+
235+
# DDL & schema
236+
"create", "alter", "drop", "table", "view", "index", "sequence",
237+
"trigger", "schema", "database", "column", "constraint",
238+
239+
# DML
240+
"insert", "update", "delete", "into", "values", "set",
241+
242+
# Control flow & logical
243+
"and", "or", "not", "in", "is", "like", "between", "case", "when",
244+
"then", "else", "end", "exists",
245+
246+
# Transaction & session
247+
"begin", "commit", "rollback", "savepoint", "transaction",
248+
"lock", "grant", "revoke",
249+
250+
# Data types
251+
"int", "integer", "bigint", "smallint", "decimal", "numeric",
252+
"float", "real", "double", "char", "varchar", "text",
253+
"timestamp", "boolean", "null",
254+
255+
# Functions
256+
"cast",
257+
}
258+
"""
259+
Set of SQL reserved keywords that may cause conflicts when used as table or
260+
column names. This list was compiled from commonly reserved terms across
261+
multiple SQL dialects (e.g., PostgreSQL, SQLite, MySQL), with emphasis on
262+
keywords that are likely to appear in generated SQL statements.
263+
If any of these are used as identifiers, they must be properly escaped to
264+
avoid syntax errors.
265+
"""
266+
# fmt: on
267+
268+
def _is_sql_keyword(self, name: str) -> bool:
269+
"""
270+
helper: Verifies if name is a SQL reserved word.
271+
Uses SQL_RESERVED_KEYWORDS set.
272+
Extend with new SQL reserved words if required.
273+
"""
274+
return name.lower() in self.SQL_RESERVED_KEYWORDS
275+
276+
def accept(self, obj: object) -> bool:
277+
return self._error_code(obj) is None
278+
279+
def error_message(self, error_name: str) -> str:
280+
# Generic fallback (since we don't have the object here)
281+
return f"{error_name} must be a valid SQL identifier and not a reserved word"
282+
283+
def verify(self, obj: object, error_name: str) -> None:
284+
code: str | None = self._error_code(obj)
285+
if code is not None:
286+
raise PyDoughMetadataException(f"{error_name} {self.error_messages[code]}")
105287

106288

107289
class NoExtraKeys(PyDoughPredicate):
@@ -304,6 +486,7 @@ def error_message(self, error_name: str) -> str:
304486
###############################################################################
305487

306488
is_valid_name: PyDoughPredicate = ValidName()
489+
is_valid_sql_name: PyDoughPredicate = ValidSQLName()
307490
is_integer = HasType(int, "integer")
308491
is_string = HasType(str, "string")
309492
is_bool = HasType(bool, "boolean")

pydough/metadata/collections/collection_metadata.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ def __init__(
4848
PropertyMetadata,
4949
)
5050

51-
is_valid_name.verify(name, "name")
52-
HasType(GraphMetadata).verify(graph, "graph")
51+
is_valid_name.verify(name, f"collection name {name!r}")
52+
HasType(GraphMetadata).verify(graph, f"graph {name!r}")
5353

5454
self._graph: GraphMetadata = graph
5555
self._name: str = name

pydough/metadata/collections/simple_table_metadata.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
extract_array,
1111
extract_string,
1212
is_string,
13+
is_valid_sql_name,
1314
unique_properties_predicate,
1415
)
1516
from pydough.metadata.graphs import GraphMetadata
@@ -146,6 +147,7 @@ def parse_from_json(
146147
# Extract the relevant properties from the JSON to build the new
147148
# collection, then add it to the graph.
148149
table_path: str = extract_string(collection_json, "table path", error_name)
150+
is_valid_sql_name.verify(table_path, error_name)
149151
HasPropertyWith("unique properties", unique_properties_predicate).verify(
150152
collection_json, error_name
151153
)
@@ -161,8 +163,6 @@ def parse_from_json(
161163
)
162164
# Parse the optional common semantic properties like the description.
163165
new_collection.parse_optional_properties(collection_json)
164-
properties: list = extract_array(
165-
collection_json, "properties", new_collection.error_name
166-
)
166+
properties: list = extract_array(collection_json, "properties", error_name)
167167
new_collection.add_properties_from_json(properties)
168168
graph.add_collection(new_collection)

pydough/metadata/graphs/graph_metadata.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __init__(
4343
synonyms: list[str] | None,
4444
extra_semantic_info: dict | None,
4545
):
46-
is_valid_name.verify(name, "graph name")
46+
is_valid_name.verify(name, f"graph name {name!r}")
4747
self._additional_definitions: list[str] | None = additional_definitions
4848
self._verified_pydough_analysis: list[dict] | None = verified_pydough_analysis
4949
self._name: str = name
@@ -179,7 +179,7 @@ def add_function(self, name: str, function: "ExpressionFunctionOperator") -> Non
179179
`PyDoughMetadataException`: if `function` cannot be inserted
180180
into the graph because of a name collision.
181181
"""
182-
is_valid_name.verify(name, "function name")
182+
is_valid_name.verify(name, f"function name {name!r}")
183183
if name == self.name:
184184
raise PyDoughMetadataException(
185185
f"Function name {name!r} cannot be the same as the graph name {self.name!r}"

pydough/metadata/properties/property_metadata.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ def __init__(
4545
synonyms: list[str] | None,
4646
extra_semantic_info: dict | None,
4747
):
48-
is_valid_name.verify(name, "name")
49-
HasType(CollectionMetadata).verify(collection, "collection")
48+
is_valid_name.verify(name, f"property name {name!r}")
49+
HasType(CollectionMetadata).verify(collection, f"collection {name}")
5050
self._name: str = name
5151
self._collection: CollectionMetadata = collection
5252
super().__init__(description, synonyms, extra_semantic_info)

pydough/metadata/properties/table_column_metadata.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
NoExtraKeys,
1212
extract_string,
1313
is_string,
14+
is_valid_sql_name,
1415
)
1516
from pydough.metadata.collections import CollectionMetadata
1617
from pydough.types import PyDoughType, parse_type_from_string
@@ -99,6 +100,7 @@ def parse_from_json(
99100
except PyDoughTypeException as e:
100101
raise PyDoughMetadataException(*e.args)
101102
column_name: str = extract_string(property_json, "column name", error_name)
103+
is_valid_sql_name.verify(column_name, error_name)
102104

103105
NoExtraKeys(TableColumnMetadata.allowed_fields).verify(
104106
property_json, error_name

tests/test_metadata/defog_graphs.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@
223223
"synonyms": ["record datetime", "price update date"]
224224
},
225225
{
226-
"name": "open",
226+
"name": "_open",
227227
"type": "table column",
228228
"column name": "sbDpOpen",
229229
"data type": "numeric",

0 commit comments

Comments
 (0)