11import re
22import logging
33from pathlib import Path
4- from typing import List , Optional
5- from . import config
4+ from typing import Optional , List # Added List for type hinting
5+
6+ # Assuming config is a module in the same directory or an accessible path
7+ from . import config # If config.py is in the same directory as this script
68
79logger = logging .getLogger ('cloudsql_to_supabase.clean' )
810
class DumpCleaner:
    """Line-by-line cleaner that rewrites a Cloud SQL PostgreSQL dump for Supabase.

    Lines matching ``skip_patterns`` are dropped (echoed back as SQL comments
    for auditing); every surviving line has each ``replacement_rules`` entry
    applied via ``Pattern.subn``.
    """

    def __init__(
        self,
        input_file: Optional[Path] = None,
        output_file: Optional[Path] = None,
        target_schema: Optional[str] = None,
    ) -> None:
        """Configure paths, skip patterns and replacement rules.

        Args:
            input_file: Dump to clean; falls back to ``config.OUTPUT_DUMP``.
            output_file: Where the cleaned dump is written; falls back to
                ``config.CLEANED_DUMP``.
            target_schema: Schema that objects in ``public`` are remapped to;
                falls back to ``config.SUPABASE_SCHEMA``.
        """
        self.input_file = Path(input_file or config.OUTPUT_DUMP)
        self.output_file = Path(output_file or config.CLEANED_DUMP)
        self.target_schema = target_schema or config.SUPABASE_SCHEMA

        logger.info(f"Initializing DumpCleaner. Input: '{self.input_file}', Output: '{self.output_file}', Target Schema: '{self.target_schema}'")

        # Source-specific admin/system roles whose statements must be removed
        # entirely because the roles do not exist in the target.
        # 'postgres' is usually the main superuser in Supabase, so it is
        # deliberately NOT listed here.
        self.problematic_roles_to_filter: List[str] = ["cloudsqlsuperuser", "cloudsqladmin"]
        # Add other cloud-specific roles like 'rdsadmin', etc., if applicable.

        self.problematic_role_match_pattern: Optional[str] = None
        if self.problematic_roles_to_filter:
            role_patterns = []
            for role in self.problematic_roles_to_filter:
                escaped_role = re.escape(role)
                # Matches "role_name" OR role_name (as a whole word).
                role_patterns.append(f'"{escaped_role}"')
                role_patterns.append(rf'{escaped_role}\b')
            self.problematic_role_match_pattern = rf"(?:{'|'.join(role_patterns)})"
            logger.debug(f"Problematic role regex part: {self.problematic_role_match_pattern}")

        # Patterns that cause a line to be skipped entirely.
        self.skip_patterns = [
            re.compile(r'^\s*(CREATE|ALTER)\s+ROLE\b', re.IGNORECASE),  # any role creation/alteration
            re.compile(r'^\s*COMMENT ON EXTENSION\s+(?:pg_stat_statements|plpgsql)\s*;', re.IGNORECASE),  # more specific
            re.compile(r'^\s*COMMENT ON EXTENSION\b', re.IGNORECASE),  # general extension comments
            re.compile(r'^\s*SET\s+(?:transaction_timeout|idle_in_transaction_session_timeout|lock_timeout|statement_timeout)\s*=\s*.*?;', re.IGNORECASE),
            re.compile(r'^\s*SET\s+default_transaction_read_only\s*=\s*on;', re.IGNORECASE),  # often set on Cloud SQL read replicas
            re.compile(r'^\s*GRANT\s+pg_signal_backend\s+TO\s+cloudsqlsuperuser;', re.IGNORECASE),  # Cloud SQL specific grant
        ]

        if self.problematic_role_match_pattern:
            self.skip_patterns.extend([
                # SET ROLE or SET SESSION AUTHORIZATION to a problematic role.
                re.compile(r'^\s*SET\s+(?:ROLE|SESSION\s+AUTHORIZATION)\s+' + self.problematic_role_match_pattern + r'\s*;', re.IGNORECASE),
                # Any GRANT/REVOKE mentioning a problematic role. Broad on
                # purpose: it catches grants TO the role, grants OF the role,
                # and revokes involving the role, all of which would error if
                # the role does not exist in the target.
                re.compile(r'^\s*(?:GRANT|REVOKE).*' + self.problematic_role_match_pattern + r'.*?;', re.IGNORECASE),
                # ALTER DEFAULT PRIVILEGES issued FOR a problematic role.
                re.compile(r'^\s*ALTER DEFAULT PRIVILEGES\s+FOR ROLE\s+' + self.problematic_role_match_pattern + r'\s+.*?;', re.IGNORECASE),
            ])

        # Replacement rules: list of (compiled_pattern, replacement_str) tuples.
        if self.target_schema == "public":
            owner_replacement_str = 'OWNER TO public;'
        else:
            # Quote non-public schema names to preserve case and handle
            # special characters.
            owner_replacement_str = f'OWNER TO "{self.target_schema}";'
        logger.debug(f"Owner replacement string: {owner_replacement_str}")

        self.replacement_rules = [
            # Matches OWNER TO any_user; or OWNER TO "any_user";
            (re.compile(r'OWNER TO (?:"[^"]+"|[^\s;]+);', re.IGNORECASE), owner_replacement_str),
            (re.compile(r'^\s*CREATE SCHEMA\s+.*?;', re.IGNORECASE), '-- Schema creation removed (Supabase extension schema used instead)'),
            (re.compile(r'^\s*ALTER SCHEMA\s+public\s+OWNER TO .*?;', re.IGNORECASE), '-- ALTER SCHEMA public OWNER removed, will be owned by supabase admin/postgres'),
            # Drop the statement that empties search_path for the session.
            (re.compile(r"^\s*SELECT pg_catalog\.set_config\('search_path', '', false\);\s*$", re.IGNORECASE), "-- SELECT pg_catalog.set_config('search_path', '', false); (emptying search_path removed)"),
        ]

        # Schema-specific replacements when the target schema is not 'public'.
        if self.target_schema != "public":
            schema_replacements = [
                # Prepend the target schema to search_path settings. Two rules
                # so the separating comma is emitted only when a tail actually
                # follows "public" (a single pattern with a bare backreference
                # either dropped the comma or, with an escaped quote in the
                # raw replacement string, left literal backslashes in the SQL).
                (r"SET search_path = public\s*,\s*([^;]+);", rf'SET search_path = "{self.target_schema}", public, \1;'),
                (r"SET search_path = public\s*;", rf'SET search_path = "{self.target_schema}", public;'),
                (r"CREATE TABLE public\.([\w_]+)", rf'CREATE TABLE "{self.target_schema}".\1'),
                (r"ALTER TABLE ONLY public\.([\w_]+)", rf'ALTER TABLE ONLY "{self.target_schema}".\1'),
                (r"CREATE SEQUENCE public\.([\w_]+)", rf'CREATE SEQUENCE "{self.target_schema}".\1'),
                (r"ALTER SEQUENCE public\.([\w_]+)", rf'ALTER SEQUENCE "{self.target_schema}".\1'),
                (r"CREATE VIEW public\.([\w_]+)", rf'CREATE VIEW "{self.target_schema}".\1'),
                (r"CREATE FUNCTION public\.([\w_]+)", rf'CREATE FUNCTION "{self.target_schema}".\1'),
                (r"REFERENCES public\.([\w_]+)", rf'REFERENCES "{self.target_schema}".\1'),
                # NOTE(review): the REFERENCES remap is broad. Foreign keys to
                # tables that genuinely stay in public (e.g. Supabase-managed
                # tables) should NOT be remapped; that may require more
                # specific rules or post-import adjustments — confirm against
                # the actual dump.
            ]
            for pattern_str, repl_str in schema_replacements:
                self.replacement_rules.append((re.compile(pattern_str, re.IGNORECASE), repl_str))
        else:
            # Target is 'public': just normalize search_path to a sane default.
            self.replacement_rules.append(
                (re.compile(r"SET search_path = .*?;", re.IGNORECASE), r"SET search_path = public, pg_catalog;")
            )

    def clean_dump_file(self) -> Path:
        """Clean the configured dump file and return the output path.

        Returns:
            Path to the cleaned dump file (``self.output_file``).

        Raises:
            FileNotFoundError: If ``self.input_file`` does not exist.
        """
        logger.info(f"Starting cleaning of dump file: {self.input_file}")

        if not self.input_file.exists():
            logger.error(f"Input file not found: {self.input_file}")
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

        skipped_lines, modified_lines, lines_processed = 0, 0, 0

        # Create the output directory if it doesn't exist.
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

        with self.input_file.open('r', encoding='utf-8') as infile, \
                self.output_file.open('w', encoding='utf-8') as outfile:
            for line in infile:
                lines_processed += 1

                # Skip patterns take precedence over replacements.
                should_skip = False
                for pattern in self.skip_patterns:
                    if pattern.search(line):
                        skipped_lines += 1
                        should_skip = True
                        break
                if should_skip:
                    # Echo the dropped statement as a comment for auditing.
                    outfile.write(f"-- SKIPPED LINE (by pattern): {line.strip()}\n")
                    continue

                # Apply every replacement rule to the line.
                for pattern, replacement in self.replacement_rules:
                    line, count = pattern.subn(replacement, line)
                    if count > 0:
                        modified_lines += count

                outfile.write(line)

        logger.info(f"Cleaning completed: {lines_processed} lines processed, {skipped_lines} lines skipped, {modified_lines} total modifications applied.")
        logger.info(f"Cleaned dump saved as {self.output_file}")
        return self.output_file
84158
def clean_dump_file(
    input_file: Optional[Path] = None,
    output_file: Optional[Path] = None,
    target_schema: Optional[str] = None,
) -> Path:
    """Clean a PostgreSQL dump file so it can be imported into Supabase.

    Convenience wrapper that builds a :class:`DumpCleaner` and runs it.

    Args:
        input_file: Path to the input SQL dump file.
        output_file: Path where the cleaned SQL dump file will be saved.
        target_schema: The target schema in Supabase (e.g., "public" or a
            custom extension schema).

    Returns:
        Path to the cleaned output file.
    """
    cleaner = DumpCleaner(input_file, output_file, target_schema)
    return cleaner.clean_dump_file()
177+
# Example usage (for testing purposes).
# NOTE(review): this module imports `from . import config` at the top, so
# running the file directly will fail with an ImportError unless it is
# executed as part of its package (e.g. `python -m package.clean`) — confirm.
if __name__ == '__main__':
    # Create a dummy config.py so the cleaner has defaults to fall back on.
    with open("config.py", "w") as f:
        f.write("OUTPUT_DUMP = 'dummy_input.sql'\n")
        f.write("CLEANED_DUMP = 'dummy_cleaned_output.sql'\n")
        f.write("SUPABASE_SCHEMA = 'extensions'\n")  # or 'public'

    # A small dump exercising the skip and replacement rules.
    dummy_sql_content = """
-- PostgreSQL database dump

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;

COMMENT ON EXTENSION plpgsql IS 'PL/pgSQL procedural language';
CREATE ROLE cloudsqlsuperuser;
ALTER ROLE cloudsqlsuperuser WITH NOSUPERUSER INHERIT NOCREATEROLE NOCREATEDB LOGIN NOREPLICATION NOBYPASSRLS;
GRANT pg_signal_backend TO cloudsqlsuperuser;
SET SESSION AUTHORIZATION cloudsqlsuperuser;
SET ROLE cloudsqlsuperuser;

CREATE SCHEMA extensions;
ALTER SCHEMA extensions OWNER TO cloudsqlsuperuser;

CREATE SCHEMA public; -- This should be commented out
ALTER SCHEMA public OWNER TO cloudsqlsuperuser; -- This should be changed or commented out

SET search_path = public, extensions;

CREATE TABLE public.my_table (id integer);
ALTER TABLE public.my_table OWNER TO cloudsqlsuperuser;

CREATE TABLE extensions.another_table (name text);
ALTER TABLE extensions.another_table OWNER TO postgres; -- Assuming postgres is the target

GRANT SELECT ON public.my_table TO cloudsqlsuperuser;
GRANT INSERT ON public.my_table TO "cloudsqlsuperuser";
GRANT cloudsqlsuperuser TO some_other_user;
REVOKE cloudsqlsuperuser FROM some_other_user;
ALTER DEFAULT PRIVILEGES FOR ROLE cloudsqlsuperuser IN SCHEMA public GRANT SELECT ON TABLES TO public;
ALTER DEFAULT PRIVILEGES FOR ROLE "cloudsqlsuperuser" GRANT SELECT ON SEQUENCES TO public;

-- End of dummy dump
"""
    with open("dummy_input.sql", "w", encoding='utf-8') as f:
        f.write(dummy_sql_content)

    # Configure verbose logging for the demo run.
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    logger.info("Running dummy cleaning process...")

    # Test with target_schema = 'public' (Supabase default for user tables).
    # For the non-public case, run the same with target_schema="extensions"
    # and a different output file.
    cleaner_public = DumpCleaner(
        input_file=Path("dummy_input.sql"),
        output_file=Path("dummy_cleaned_public.sql"),
        target_schema="public",
    )
    cleaned_path_public = cleaner_public.clean_dump_file()
    print(f"Cleaned file for 'public' schema at: {cleaned_path_public}")
    with open(cleaned_path_public, 'r') as f_public:
        print("\n--- Cleaned for 'public' schema ---")
        print(f_public.read())

    # Dummy files (config.py, dummy_input.sql, dummy_cleaned_public.sql) are
    # intentionally left on disk for inspection.