Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions simple_ddl_parser/ddl_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,10 @@ def t_ID(self, t: LexToken):
t = self.tokens_not_columns_names(t)

if self.lexer.is_alter:
_type = tok.alter_tokens.get(t.value)
if _type:
t.type = _type
t.type = tok.alter_tokens.get(t.value, t.type)

if self.lexer.is_comment:
t.type = tok.comment_on_tokens.get(t.value, t.type)

self.capitalize_tokens(t)
self.commat_type(t)
Expand Down Expand Up @@ -253,6 +254,8 @@ def set_lexx_tags(self, t: LexToken):
self.lexer.is_table = False
elif t.type in ["TABLE", "INDEX"] and not self.lexer.is_alter:
self.lexer.is_table = True
elif t.type == "COMMENT":
self.lexer.is_comment = True

def set_last_token(self, t: LexToken):
self.lexer.last_token = t.type
Expand Down
28 changes: 27 additions & 1 deletion simple_ddl_parser/dialects/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,31 @@ def p_alt_table_name(self, p: List) -> None:
p[0]["project"] = table_data["project"]


class Comment:
    # Grammar mixin for ``COMMENT ON TABLE|COLUMN <name> IS '<text>'``.
    # NOTE: the method docstring below is the grammar rule itself, so it is
    # runtime text and must not be reworded.

    def p_expression_comment_on(self, p: List):
        """expr : COMMENT ON TABLE id IS STRING
        | COMMENT ON TABLE id DOT id IS STRING
        | COMMENT ON COLUMN id DOT id IS STRING
        | COMMENT ON COLUMN id DOT id DOT id IS STRING
        """
        details = {}
        p[0] = {"comment_on": details}
        symbols = list(p)
        target_kind = symbols[3]

        # Strip the quotes surrounding the STRING token and turn each doubled
        # quote ('') back into a single one.
        details.update(
            comment=symbols[-1][1:-1].replace("''", "'"),
            object_type=target_kind,
        )

        # Every alternative ends with "... IS STRING", so the identifiers sit
        # at fixed offsets from the end: innermost name at -3, its qualifier
        # at -5 and, for a fully qualified column, the schema at -7.
        if target_kind == "COLUMN":
            details["column_name"] = symbols[-3]
            details["table_name"] = symbols[-5]
            has_schema = len(symbols) > 9
            details["schema"] = symbols[-7] if has_schema else None
        elif target_kind == "TABLE":
            details["table_name"] = symbols[-3]
            has_schema = len(symbols) > 7
            details["schema"] = symbols[-5] if has_schema else None


class BaseSQL(
Database,
Table,
Expand All @@ -971,6 +996,7 @@ class BaseSQL(
Type,
Schema,
TableSpaces,
Comment,
):
def clean_up_id_list_in_equal(self, p_list: List) -> List: # noqa R701
if isinstance(p_list[1], str) and p_list[1].endswith("="):
Expand Down Expand Up @@ -1031,7 +1057,7 @@ def p_id_equals(self, p: List) -> None:
"""
p_list = list(p)

if not p_list[-1] in [")", "]"]:
if p_list[-1] not in [")", "]"]:
p[0] = {p[1]: p_list[-1]}
else:
if len(p_list) > 6 and isinstance(p_list[5], list):
Expand Down
26 changes: 20 additions & 6 deletions simple_ddl_parser/output/base_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,7 @@ class BaseData:
replace: Optional[bool] = field(
default=None, metadata={"exclude_if_not_provided": True}
)
comment: Optional[str] = field(
default=None,
metadata={
"exclude_if_not_provided": True,
},
)
comment: Optional[str] = field(default=None, metadata={"exclude_if_empty": True})
like: Optional[dict] = field(
default_factory=dict,
metadata={"exclude_if_not_provided": True},
Expand Down Expand Up @@ -279,6 +274,25 @@ def append_statement_information_to_table(self, statement: Dict) -> None:
self.set_default_columns_from_alter(statement)
elif "primary_key" in statement:
self.set_alter_to_table_data("primary_key", statement)
elif "comment_on" in statement:
self.set_object_comment(statement)

def set_object_comment(self, statement: Dict) -> None:
    """Hand a ``comment_on`` statement to the setter for its object type.

    ``statement["comment_on"]["object_type"]`` selects the handler:
    ``"TABLE"`` goes to ``set_table_comment`` and ``"COLUMN"`` goes to
    ``set_column_comments``. Any other object type is ignored.
    """
    target_kind = statement["comment_on"]["object_type"]
    if target_kind == "TABLE":
        self.set_table_comment(statement)
        return
    if target_kind == "COLUMN":
        self.set_column_comments(statement)

def set_table_comment(self, statement: Dict) -> None:
    """Store the text of a ``comment_on`` statement in ``self.comment``."""
    self.comment = statement["comment_on"]["comment"]

def set_column_comments(self, statement: Dict) -> None:
    """Attach the comment text to the first column with a matching name.

    Looks through ``self.columns`` for the first entry whose ``"name"``
    equals ``statement["comment_on"]["column_name"]`` and sets its
    ``"comment"`` key. Nothing changes when no column matches.
    """
    details = statement["comment_on"]
    # Only the first match is updated; later columns with the same name are
    # left as they are.
    target = next(
        (
            candidate
            for candidate in self.columns
            if candidate["name"] == details["column_name"]
        ),
        None,
    )
    if target is not None:
        target["comment"] = details["comment"]

def set_default_columns_from_alter(self, statement: Dict) -> None:
for column in self.columns:
Expand Down
31 changes: 24 additions & 7 deletions simple_ddl_parser/output/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,23 @@ def process_statement_data(self, statement_data: Dict) -> Dict:
# mean we have table
statement_data["output_mode"] = self.output_mode
table_data = TableData.init(**statement_data)
self.tables_dict[
get_table_id(
schema_name=getattr(table_data, self.schema_key),
table_name=table_data.table_name,
)
] = table_data
data = table_data.to_dict()
table_id = get_table_id(
schema_name=getattr(table_data, self.schema_key),
table_name=table_data.table_name,
)
self.tables_dict[table_id] = table_data
data = table_data # .to_dict()
else:
data = statement_data
dialects_clean_up(self.output_mode, data)
return data

def process_comments(self, statement: Dict):
    """Apply a ``comment_on`` statement to the table it refers to.

    The schema is taken from ``statement[self.schema_key]`` when that value
    is truthy, otherwise from ``statement["comment_on"]["schema"]`` (which
    may be absent). The table returned by ``get_table_from_tables_data``
    then receives the whole statement through
    ``append_statement_information_to_table``.
    """
    details = statement["comment_on"]
    table_name = details["table_name"]
    schema = statement.get(self.schema_key)
    if not schema:
        schema = details.get("schema")
    table = self.get_table_from_tables_data(schema, table_name)
    table.append_statement_information_to_table(statement)

def process_alter_and_index_result(self, table: Dict):
if table.get("index_name"):
self.add_index_to_table(table)
Expand Down Expand Up @@ -133,12 +138,24 @@ def format(self) -> List[Dict]:
# process each item in parser output
if "index_name" in statement or "alter_table_name" in statement:
self.process_alter_and_index_result(statement)
elif "comment_on" in statement:
self.process_comments(statement)
else:
# process tables, types, sequence and etc. data
statement_data = self.process_statement_data(statement)
self.final_result.append(statement_data)

# Since we update the table.comment via "COMMENT ON" statements, we need to
# wait until all statements are processed, before transforming them to a dict.
# Otherwise the table comment wouldn't show up in the final result.
self.final_result = [
data.to_dict() if not isinstance(data, dict) else data
for data in self.final_result
]

if self.group_by_type:
self.group_by_type_result()

return self.final_result


Expand Down
77 changes: 54 additions & 23 deletions simple_ddl_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import os
import re
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union, cast

from ply import lex, yacc

Expand All @@ -19,6 +19,8 @@
IN_COM = "--"
MYSQL_COM = "#"

# Placeholder written in place of an escaped line feed (backslash + "n") that
# pre_process_data() finds inside a quoted string; process_line() later swaps
# the placeholder back to backslash + "n".
LF_IN_QUOTE = r"\N"


def set_logging_config(
log_level: Union[str, int], log_file: Optional[str] = None
Expand Down Expand Up @@ -173,35 +175,57 @@ def process_regex_input(self, data):
return data

def pre_process_data(self, data):
data = data.decode("utf-8")
data = cast(str, data.decode("utf-8"))
# todo: not sure how to workaround ',' normal way
if "input.regex" in data:
data = self.process_regex_input(data)
quote_before = r"((?!\'[\w]*[\\']*[\w]*)"
quote_after = r"((?![\w]*[\\']*[\w]*\')))"
num = 0
# add space everywhere except strings
for symbol, replace_to in [
(r"(,)+", " , "),
(r"((\()){1}", " ( "),
(r"((\))){1}", " ) "),
]:
num += 1
if num == 2:
# need for correct work with `(`` but not need in other symbols
quote_after_use = quote_after.replace(")))", "))*)")
else:
quote_after_use = quote_after
data = re.sub(quote_before + symbol + quote_after_use, replace_to, data)

# Process the string character by character to handle quoted sections
result = []
in_quote = False
i = 0
symbol_spacing_map = {",", "(", ")"}

# Special handling for odd number of single quotes
if data.count("'") % 2 != 0:
data = data.replace("\\'", "pars_m_single")

while i < len(data):
char = data[i]
startswith = data[i:].startswith

# Handle quote start/end
if char == "'":
in_quote = not in_quote
result.append(char)

# Handle line feeds in quotes
elif in_quote and startswith("\\n"):
result.append(LF_IN_QUOTE)
i += 1

# Handle equal sign in quotes
elif in_quote and char == "=":
result.append("\\03d")

# Handle special unicode quotes
elif not in_quote and (startswith(r"\u2018") or startswith(r"\u2019")):
result.append("'")
i += 5

# Handle symbols that need spacing
elif not in_quote and char in symbol_spacing_map:
result.append(f" {char} ")

# Keep all other characters as-is
else:
result.append(char)

i += 1

data = "".join(result)
data = (
data.replace("\\x", "\\0")
.replace("‘", "'")
.replace("’", "'")
.replace("\\u2018", "'")
.replace("\\u2019", "'")
.replace("'\\t'", "'pars_m_t'")
.replace("\\t", " ")
)
Expand Down Expand Up @@ -285,7 +309,13 @@ def process_line(
) -> Tuple[Optional[str], bool]:
self.pre_process_line()

self.line = self.line.strip().replace("\n", "").replace("\t", "")
# Remove whitespace, while preserving newlines in quotes
self.line = (
self.line.strip()
.replace("\n", "")
.replace("\t", "")
.replace(LF_IN_QUOTE, "\\n")
)
self.skip = self.check_line_on_skip_words()

self.parse_set_statement()
Expand Down Expand Up @@ -333,6 +363,7 @@ def set_default_flags_in_lexer(self) -> None:
"lp_open",
"is_alter",
"is_like",
"is_comment",
]
for attr in attrs:
setattr(self.lexer, attr, False)
Expand Down
59,166 changes: 29,850 additions & 29,316 deletions simple_ddl_parser/parsetab.py

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions simple_ddl_parser/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"CLUSTERED",
"SEQUENCE",
"TABLESPACE",
"COMMENT",
}

definition_statements = {value: value for value in definition_statements}
Expand Down Expand Up @@ -89,6 +90,20 @@
}
first_liners = {value: value for value in first_liners}

# Keywords that get their own token type while the lexer's ``is_comment`` flag
# is set; each keyword maps to itself so ``.get(value, default)`` can be used
# as a lookup.
comment_on_tokens = {
    keyword: keyword
    for keyword in (
        "ON",
        "IS",
        "TABLE",
        "COLUMN",
        "MATERIALIZED",
        "VIEW",
        "FUNCTION",
        "INDEX",
        "SCHEMA",
        "SEQUENCE",
        "PROCEDURE",
    )
}

common_statements.update(first_liners)
definition_statements.update(common_statements)
Expand Down Expand Up @@ -169,6 +184,7 @@
"RT",
"COMMAT",
"EQ",
"IS",
],
*definition_statements.values(),
*common_statements.values(),
Expand Down
Loading