Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 185 additions & 2 deletions pydough/errors/error_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,15 @@
"is_positive_int",
"is_string",
"is_valid_name",
"is_valid_sql_name",
"simple_join_keys_predicate",
"unique_properties_predicate",
]


import builtins
import keyword
import re
from abc import ABC, abstractmethod

import numpy as np
Expand Down Expand Up @@ -97,11 +101,189 @@ class ValidName(PyDoughPredicate):
as the name of a PyDough graph/collection/property.
"""

def __init__(self):
self.error_messages: dict[str, str] = {
"identifier": "must be a string that is a valid Python identifier",
"python_keyword": "must be a string that is not a Python reserved word or built-in name",
"pydough_keyword": "must be a string that is not a PyDough reserved word",
"sql_keyword": "must be a string that is not a SQL reserved word",
}

def _error_code(self, obj: object) -> str | None:
"""Return an error code if invalid, or None if valid."""
ret_val: str | None = None
# Check that obj is a string
if isinstance(obj, str):
# Check that obj is a valid Python identifier
if not obj.isidentifier():
ret_val = "identifier"
# Check that obj is not a Python reserved word or built-in name
elif keyword.iskeyword(obj) or hasattr(builtins, obj):
ret_val = "python_keyword"
# Check that obj is not a PyDough reserved word
elif self._is_pydough_keyword(obj):
ret_val = "pydough_keyword"
else:
ret_val = "identifier"

return ret_val

def _is_pydough_keyword(self, name: str) -> bool:
"""
helper: Verifies if name is a PyDough reserved word.
Extend with new PyDough reserved words if required.
"""
# Dictionary of all registered operators pre-built from the PyDough source
from pydough.pydough_operators import builtin_registered_operators

# Set of collection operators
PYDOUGH_RESERVED: set[str] = {
"CALCULATE",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other thing we should probably ban is any of the function names (see the builtin_registered_operators function for how to obtain these names).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done! I didn't know about builtin_registered_operators

"WHERE",
"ORDER_BY",
"TOP_K",
"PARTITION",
"SINGULAR",
"BEST",
"CROSS",
}
return (name in PYDOUGH_RESERVED) or (name in builtin_registered_operators())

def accept(self, obj: object) -> bool:
return isinstance(obj, str) and obj.isidentifier()
return self._error_code(obj) is None

def error_message(self, error_name: str) -> str:
return f"{error_name} must be a string that is a Python identifier"
# Generic fallback (since we don't have the object here)
return f"{error_name} must be a valid identifier and not a reserved word"

def verify(self, obj: object, error_name: str) -> None:
code: str | None = self._error_code(obj)
if code is not None:
raise PyDoughMetadataException(f"{error_name} {self.error_messages[code]}")


class ValidSQLName(PyDoughPredicate):
"""Predicate class to check that an object is a string that can be used
as the name for a SQL table path/column name.
"""

# Regex for unquoted SQL identifiers
_UNQUOTED_SQL_IDENTIFIER = re.compile(
r"^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$"
)

def __init__(self):
self.error_messages: dict[str, str] = {
"identifier": "must have a SQL name that is a valid SQL identifier",
"sql_keyword": "must have a SQL name that is not a reserved word",
}

def _error_code(self, obj: object) -> str | None:
"""Return an error code if invalid, or None if valid."""
ret_val: str | None = None
# Check that obj is a string
if isinstance(obj, str):
# Check that obj is a valid SQL identifier
if not self.is_valid_sql_identifier(obj):
ret_val = "identifier"
# Check that obj is not a SQL reserved word
elif self._is_sql_keyword(obj):
ret_val = "sql_keyword"
else:
ret_val = "identifier"

return ret_val

def is_valid_sql_identifier(self, name: str) -> bool:
"""
Check if a string is a valid SQL identifier.

- Unquoted: starts with letter/underscore, then letters, digits,
underscores.
- Double-quoted: allows any chars, but " "" " is the only valid way to
include a double-quote char.
- Backtick-quoted: allows any chars, but `` `` `` is the only valid
way to include a backtick char.
"""
if not name:
return False

# Case 1: unquoted
if self._UNQUOTED_SQL_IDENTIFIER.match(name):
return True

# Case 2: double quoted
if name.startswith('"') and name.endswith('"'):
inner = name[1:-1]
# Any " must be escaped as ""
return '"' not in inner.replace('""', "")

# Case 3: backtick quoted
if name.startswith("`") and name.endswith("`"):
inner = name[1:-1]
# Any ` must be escaped as ``
return "`" not in inner.replace("``", "")

return False

# fmt: off
SQL_RESERVED_KEYWORDS: set[str] = {
# Query & DML
"select", "from", "where", "group", "having", "distinct", "as",
"join", "inner", "union", "intersect", "except",

# DDL & schema
"create", "alter", "drop", "table", "view", "index", "sequence",
"trigger", "schema", "database", "column", "constraint",

# DML
"insert", "update", "delete", "into", "values", "set",

# Control flow & logical
"and", "or", "not", "in", "is", "like", "between", "case", "when",
"then", "else", "end", "exists",

# Transaction & session
"begin", "commit", "rollback", "savepoint", "transaction",
"lock", "grant", "revoke",

# Data types
"int", "integer", "bigint", "smallint", "decimal", "numeric",
"float", "real", "double", "char", "varchar", "text",
"timestamp", "boolean", "null",

# Functions
"cast",
}
"""
Set of SQL reserved keywords that may cause conflicts when used as table or
column names. This list was compiled from commonly reserved terms across
multiple SQL dialects (e.g., PostgreSQL, SQLite, MySQL), with emphasis on
keywords that are likely to appear in generated SQL statements.
If any of these are used as identifiers, they must be properly escaped to
avoid syntax errors.
"""
# fmt: on

def _is_sql_keyword(self, name: str) -> bool:
"""
helper: Verifies if name is a SQL reserved word.
Uses SQL_RESERVED_KEYWORDS set.
Extend with new SQL reserved words if required.
"""
return name.lower() in self.SQL_RESERVED_KEYWORDS

def accept(self, obj: object) -> bool:
return self._error_code(obj) is None

def error_message(self, error_name: str) -> str:
# Generic fallback (since we don't have the object here)
return f"{error_name} must be a valid SQL identifier and not a reserved word"

def verify(self, obj: object, error_name: str) -> None:
code: str | None = self._error_code(obj)
if code is not None:
raise PyDoughMetadataException(f"{error_name} {self.error_messages[code]}")


class NoExtraKeys(PyDoughPredicate):
Expand Down Expand Up @@ -304,6 +486,7 @@ def error_message(self, error_name: str) -> str:
###############################################################################

is_valid_name: PyDoughPredicate = ValidName()
is_valid_sql_name: PyDoughPredicate = ValidSQLName()
is_integer = HasType(int, "integer")
is_string = HasType(str, "string")
is_bool = HasType(bool, "boolean")
Expand Down
4 changes: 2 additions & 2 deletions pydough/metadata/collections/collection_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def __init__(
PropertyMetadata,
)

is_valid_name.verify(name, "name")
HasType(GraphMetadata).verify(graph, "graph")
is_valid_name.verify(name, f"collection name {name!r}")
HasType(GraphMetadata).verify(graph, f"graph {name!r}")

self._graph: GraphMetadata = graph
self._name: str = name
Expand Down
6 changes: 3 additions & 3 deletions pydough/metadata/collections/simple_table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
extract_array,
extract_string,
is_string,
is_valid_sql_name,
unique_properties_predicate,
)
from pydough.metadata.graphs import GraphMetadata
Expand Down Expand Up @@ -146,6 +147,7 @@ def parse_from_json(
# Extract the relevant properties from the JSON to build the new
# collection, then add it to the graph.
table_path: str = extract_string(collection_json, "table path", error_name)
is_valid_sql_name.verify(table_path, error_name)
HasPropertyWith("unique properties", unique_properties_predicate).verify(
collection_json, error_name
)
Expand All @@ -161,8 +163,6 @@ def parse_from_json(
)
# Parse the optional common semantic properties like the description.
new_collection.parse_optional_properties(collection_json)
properties: list = extract_array(
collection_json, "properties", new_collection.error_name
)
properties: list = extract_array(collection_json, "properties", error_name)
new_collection.add_properties_from_json(properties)
graph.add_collection(new_collection)
4 changes: 2 additions & 2 deletions pydough/metadata/graphs/graph_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(
synonyms: list[str] | None,
extra_semantic_info: dict | None,
):
is_valid_name.verify(name, "graph name")
is_valid_name.verify(name, f"graph name {name!r}")
self._additional_definitions: list[str] | None = additional_definitions
self._verified_pydough_analysis: list[dict] | None = verified_pydough_analysis
self._name: str = name
Expand Down Expand Up @@ -179,7 +179,7 @@ def add_function(self, name: str, function: "ExpressionFunctionOperator") -> Non
`PyDoughMetadataException`: if `function` cannot be inserted
into the graph because of a name collision.
"""
is_valid_name.verify(name, "function name")
is_valid_name.verify(name, f"function name {name!r}")
if name == self.name:
raise PyDoughMetadataException(
f"Function name {name!r} cannot be the same as the graph name {self.name!r}"
Expand Down
4 changes: 2 additions & 2 deletions pydough/metadata/properties/property_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def __init__(
synonyms: list[str] | None,
extra_semantic_info: dict | None,
):
is_valid_name.verify(name, "name")
HasType(CollectionMetadata).verify(collection, "collection")
is_valid_name.verify(name, f"property name {name!r}")
HasType(CollectionMetadata).verify(collection, f"collection {name}")
self._name: str = name
self._collection: CollectionMetadata = collection
super().__init__(description, synonyms, extra_semantic_info)
Expand Down
2 changes: 2 additions & 0 deletions pydough/metadata/properties/table_column_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
NoExtraKeys,
extract_string,
is_string,
is_valid_sql_name,
)
from pydough.metadata.collections import CollectionMetadata
from pydough.types import PyDoughType, parse_type_from_string
Expand Down Expand Up @@ -99,6 +100,7 @@ def parse_from_json(
except PyDoughTypeException as e:
raise PyDoughMetadataException(*e.args)
column_name: str = extract_string(property_json, "column name", error_name)
is_valid_sql_name.verify(column_name, error_name)

NoExtraKeys(TableColumnMetadata.allowed_fields).verify(
property_json, error_name
Expand Down
2 changes: 1 addition & 1 deletion tests/test_metadata/defog_graphs.json
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@
"synonyms": ["record datetime", "price update date"]
},
{
"name": "open",
"name": "_open",
"type": "table column",
"column name": "sbDpOpen",
"data type": "numeric",
Expand Down
Loading