xnuinside · ragchuck · Aug 28, 2025 · Aug 29, 2025 · Sep 17, 2025 · Sep 29, 2025
diff --git a/simple_ddl_parser/ddl_parser.py b/simple_ddl_parser/ddl_parser.py
@@ -216,9 +216,10 @@ def t_ID(self, t: LexToken):
             t = self.tokens_not_columns_names(t)
 
         if self.lexer.is_alter:
-            _type = tok.alter_tokens.get(t.value)
-            if _type:
-                t.type = _type
+            t.type = tok.alter_tokens.get(t.value, t.type)
+
+        if self.lexer.is_comment:
+            t.type = tok.comment_on_tokens.get(t.value, t.type)
 
         self.capitalize_tokens(t)
         self.commat_type(t)
@@ -253,6 +254,8 @@ def set_lexx_tags(self, t: LexToken):
             self.lexer.is_table = False
         elif t.type in ["TABLE", "INDEX"] and not self.lexer.is_alter:
             self.lexer.is_table = True
+        elif t.type == "COMMENT":
+            self.lexer.is_comment = True
 
     def set_last_token(self, t: LexToken):
         self.lexer.last_token = t.type

diff --git a/simple_ddl_parser/dialects/sql.py b/simple_ddl_parser/dialects/sql.py
@@ -960,6 +960,31 @@ def p_alt_table_name(self, p: List) -> None:
             p[0]["project"] = table_data["project"]
 
 
+class Comment:
+    def p_expression_comment_on(self, p: List):
+        """expr : COMMENT ON TABLE id IS STRING
+        | COMMENT ON TABLE id DOT id IS STRING
+        | COMMENT ON COLUMN id DOT id IS STRING
+        | COMMENT ON COLUMN id DOT id DOT id IS STRING
+        """
+        symbols = list(p)
+        target = symbols[3]  # value of the TABLE / COLUMN token
+        details = {
+            # Drop the enclosing quotes, then unescape doubled single quotes.
+            "comment": symbols[-1][1:-1].replace("''", "'"),
+            "object_type": target,
+        }
+        # Names are read from the end of the rule; the schema part is optional.
+        if target == "COLUMN":
+            details["column_name"] = symbols[-3]
+            details["table_name"] = symbols[-5]
+            details["schema"] = symbols[-7] if len(symbols) > 9 else None
+        elif target == "TABLE":
+            details["table_name"] = symbols[-3]
+            details["schema"] = symbols[-5] if len(symbols) > 7 else None
+        p[0] = {"comment_on": details}
+
+
 class BaseSQL(
     Database,
     Table,
@@ -971,6 +996,7 @@ class BaseSQL(
     Type,
     Schema,
     TableSpaces,
+    Comment,
 ):
     def clean_up_id_list_in_equal(self, p_list: List) -> List:  # noqa R701
         if isinstance(p_list[1], str) and p_list[1].endswith("="):
@@ -1031,7 +1057,7 @@ def p_id_equals(self, p: List) -> None:
         """
         p_list = list(p)
 
-        if not p_list[-1] in [")", "]"]:
+        if p_list[-1] not in [")", "]"]:
             p[0] = {p[1]: p_list[-1]}
         else:
             if len(p_list) > 6 and isinstance(p_list[5], list):

diff --git a/simple_ddl_parser/output/base_data.py b/simple_ddl_parser/output/base_data.py
@@ -45,12 +45,7 @@ class BaseData:
     replace: Optional[bool] = field(
         default=None, metadata={"exclude_if_not_provided": True}
     )
-    comment: Optional[str] = field(
-        default=None,
-        metadata={
-            "exclude_if_not_provided": True,
-        },
-    )
+    comment: Optional[str] = field(default=None, metadata={"exclude_if_empty": True})
     like: Optional[dict] = field(
         default_factory=dict,
         metadata={"exclude_if_not_provided": True},
@@ -279,6 +274,37 @@ def append_statement_information_to_table(self, statement: Dict) -> None:
             self.set_default_columns_from_alter(statement)
         elif "primary_key" in statement:
             self.set_alter_to_table_data("primary_key", statement)
+        elif "comment_on" in statement:
+            self.set_object_comment(statement)
+
+    def set_object_comment(self, statement: Dict) -> None:
+        """Apply a parsed ``COMMENT ON`` statement to this table.
+
+        Only the ``TABLE`` and ``COLUMN`` object types are handled; a statement
+        with any other ``object_type`` is left unapplied.
+        """
+        object_type = statement["comment_on"]["object_type"]
+        if object_type == "TABLE":
+            self.set_table_comment(statement)
+        elif object_type == "COLUMN":
+            self.set_column_comments(statement)
+
+    def set_table_comment(self, statement: Dict) -> None:
+        """Store the comment text of the statement as the table comment."""
+        self.comment = statement["comment_on"]["comment"]
+
+    def set_column_comments(self, statement: Dict) -> None:
+        """Set the comment on the first column named in the statement.
+
+        Nothing changes when no column has the requested name.
+        """
+        details = statement["comment_on"]
+        target = next(
+            (col for col in self.columns if col["name"] == details["column_name"]),
+            None,
+        )
+        if target is not None:
+            target["comment"] = details["comment"]
 
     def set_default_columns_from_alter(self, statement: Dict) -> None:
         for column in self.columns:

diff --git a/simple_ddl_parser/output/core.py b/simple_ddl_parser/output/core.py
@@ -71,18 +71,23 @@ def process_statement_data(self, statement_data: Dict) -> Dict:
             # mean we have table
             statement_data["output_mode"] = self.output_mode
             table_data = TableData.init(**statement_data)
-            self.tables_dict[
-                get_table_id(
-                    schema_name=getattr(table_data, self.schema_key),
-                    table_name=table_data.table_name,
-                )
-            ] = table_data
-            data = table_data.to_dict()
+            table_id = get_table_id(
+                schema_name=getattr(table_data, self.schema_key),
+                table_name=table_data.table_name,
+            )
+            self.tables_dict[table_id] = table_data
+            data = table_data  # serialized later by format()
         else:
             data = statement_data
             dialects_clean_up(self.output_mode, data)
         return data
 
+    def process_comments(self, statement: Dict):
+        details = statement["comment_on"]  # payload of a COMMENT ON statement
+        schema = statement.get(self.schema_key) or details.get("schema")
+        table = self.get_table_from_tables_data(schema, details["table_name"])
+        table.append_statement_information_to_table(statement)
+
     def process_alter_and_index_result(self, table: Dict):
         if table.get("index_name"):
             self.add_index_to_table(table)
@@ -133,12 +138,24 @@ def format(self) -> List[Dict]:
             # process each item in parser output
             if "index_name" in statement or "alter_table_name" in statement:
                 self.process_alter_and_index_result(statement)
+            elif "comment_on" in statement:
+                self.process_comments(statement)
             else:
                 # process tables, types, sequence and etc. data
                 statement_data = self.process_statement_data(statement)
                 self.final_result.append(statement_data)
+
+        # Since we update the table.comment via "COMMENT ON" statements, we need to
+        # wait until all statements are processed, before transforming them to a dict.
+        # Otherwise the table comment wouldn't show up in the final result.
+        self.final_result = [
+            data.to_dict() if not isinstance(data, dict) else data
+            for data in self.final_result
+        ]
+
         if self.group_by_type:
             self.group_by_type_result()
+
         return self.final_result
 
 

diff --git a/simple_ddl_parser/parser.py b/simple_ddl_parser/parser.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import re
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union, cast
 
 from ply import lex, yacc
 
@@ -19,6 +19,8 @@
 IN_COM = "--"
 MYSQL_COM = "#"
 
+LF_IN_QUOTE = r"\N"
+
 
 def set_logging_config(
     log_level: Union[str, int], log_file: Optional[str] = None
@@ -173,35 +175,57 @@ def process_regex_input(self, data):
         return data
 
     def pre_process_data(self, data):
-        data = data.decode("utf-8")
+        data = cast(str, data.decode("utf-8"))
         # todo: not sure how to workaround ',' normal way
         if "input.regex" in data:
             data = self.process_regex_input(data)
-        quote_before = r"((?!\'[\w]*[\\']*[\w]*)"
-        quote_after = r"((?![\w]*[\\']*[\w]*\')))"
-        num = 0
-        # add space everywhere except strings
-        for symbol, replace_to in [
-            (r"(,)+", " , "),
-            (r"((\()){1}", " ( "),
-            (r"((\))){1}", " ) "),
-        ]:
-            num += 1
-            if num == 2:
-                # need for correct work with `(`` but not need in other symbols
-                quote_after_use = quote_after.replace(")))", "))*)")
-            else:
-                quote_after_use = quote_after
-            data = re.sub(quote_before + symbol + quote_after_use, replace_to, data)
 
+        # Process the string character by character to handle quoted sections
+        result = []
+        in_quote = False
+        i = 0
+        symbol_spacing_map = {",", "(", ")"}
+
+        # Special handling for odd number of single quotes
         if data.count("'") % 2 != 0:
             data = data.replace("\\'", "pars_m_single")
+
+        while i < len(data):
+            char = data[i]
+            # Prefix checks use data.startswith(prefix, i): no per-character tail copy.
+
+            # A single quote toggles the "inside a string literal" state
+            if char == "'":
+                in_quote = not in_quote
+                result.append(char)
+
+            # Escaped line feed (backslash + n) in quotes: mask it with LF_IN_QUOTE
+            elif in_quote and data.startswith("\\n", i):
+                result.append(LF_IN_QUOTE)
+                i += 1
+
+            # Equal sign in quotes: emit the placeholder text instead
+            elif in_quote and char == "=":
+                result.append("\\03d")
+
+            # Escaped unicode quotes (the six-character text) outside quotes
+            elif not in_quote and data.startswith((r"\u2018", r"\u2019"), i):
+                result.append("'")
+                i += 5
+
+            # Symbols that need surrounding spaces when outside quotes
+            elif not in_quote and char in symbol_spacing_map:
+                result.append(f" {char} ")
+
+            # Keep all other characters as-is
+            else:
+                result.append(char)
+
+            i += 1
+
+        data = "".join(result)
         data = (
             data.replace("\\x", "\\0")
-            .replace("‘", "'")
-            .replace("’", "'")
-            .replace("\\u2018", "'")
-            .replace("\\u2019", "'")
             .replace("'\\t'", "'pars_m_t'")
             .replace("\\t", " ")
         )
@@ -285,7 +309,13 @@ def process_line(
     ) -> Tuple[Optional[str], bool]:
         self.pre_process_line()
 
-        self.line = self.line.strip().replace("\n", "").replace("\t", "")
+        # Remove whitespace, while preserving newlines in quotes
+        self.line = (
+            self.line.strip()
+            .replace("\n", "")
+            .replace("\t", "")
+            .replace(LF_IN_QUOTE, "\\n")
+        )
         self.skip = self.check_line_on_skip_words()
 
         self.parse_set_statement()
@@ -333,6 +363,7 @@ def set_default_flags_in_lexer(self) -> None:
             "lp_open",
             "is_alter",
             "is_like",
+            "is_comment",
         ]
         for attr in attrs:
             setattr(self.lexer, attr, False)

diff --git a/simple_ddl_parser/parsetab.py b/simple_ddl_parser/parsetab.py
diff --git a/simple_ddl_parser/tokens.py b/simple_ddl_parser/tokens.py
@@ -13,6 +13,7 @@
     "CLUSTERED",
     "SEQUENCE",
     "TABLESPACE",
+    "COMMENT",
 }
 
 definition_statements = {value: value for value in definition_statements}
@@ -89,6 +90,20 @@
 }
 first_liners = {value: value for value in first_liners}
 
+comment_on_tokens = {
+    "ON",
+    "IS",
+    "TABLE",
+    "COLUMN",
+    "MATERIALIZED",
+    "VIEW",
+    "FUNCTION",
+    "INDEX",
+    "SCHEMA",
+    "SEQUENCE",
+    "PROCEDURE",
+}
+comment_on_tokens = {value: value for value in comment_on_tokens}
 
 common_statements.update(first_liners)
 definition_statements.update(common_statements)
@@ -169,6 +184,7 @@
             "RT",
             "COMMAT",
             "EQ",
+            "IS",
         ],
         *definition_statements.values(),
         *common_statements.values(),