data-migration-accelerator/snowpark.py at main · thisisqubika/data-migration-accelerator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
"""
Snowpark Object Reader
Reads all database objects from Snowflake using Snowpark API.
This script reads all objects defined in snowflake_test_objects.sql
"""

import os
import json
from typing import Dict, List, Any, Optional
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col
from dotenv import load_dotenv

# Load environment variables before any of them are read below
load_dotenv()

# Snowflake settings taken from the environment; the second argument, where
# given, is the fallback used when the variable is not set
SFLKaccount = os.environ.get('SNOWFLAKE_ACCOUNT', 'nqhycck-oh54539')
SFLKuser = os.environ.get('SNOWFLAKE_USER')
SFLKpass = os.environ.get('SNOWFLAKE_PASSWORD')
SFLKrole = os.environ.get('SNOWFLAKE_ROLE', 'SYSADMIN')
SFLKwarehouse = os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH')
SFLKdatabase = os.environ.get('SNOWFLAKE_DATABASE', 'DATA_MIGRATION_DB')
SFLKschema = os.environ.get('SNOWFLAKE_SCHEMA', 'DATA_MIGRATION_SCHEMA')
SFLKregion = os.environ.get('SNOWFLAKE_REGION')  # optional; may be unset

# Settings handed to the Snowpark session builder in main()
connection_parameters = dict(
    account=SFLKaccount,
    user=SFLKuser,
    role=SFLKrole,
    password=SFLKpass,
    warehouse=SFLKwarehouse,
    database=SFLKdatabase,
    schema=SFLKschema,
)

# The region entry is only present when a region was configured
if SFLKregion:
    connection_parameters["region"] = SFLKregion

# Fail at import time when either credential is missing or empty
if not (SFLKuser and SFLKpass):
    raise ValueError(
        "Missing required environment variables. Please set SNOWFLAKE_USER and SNOWFLAKE_PASSWORD "
        "in your .env file. See env.example for reference."
    )


class SnowparkObjectReader:
    """Read the objects of one Snowflake schema through a Snowpark session.

    Metadata comes from ``information_schema`` queries and ``SHOW`` commands.
    Every metadata row is returned as a plain ``dict`` whose keys are
    lower-cased; table/view *data* rows keep the keys reported by the session.

    Database, schema and table names used as SQL identifiers are interpolated
    into the statements exactly as given (unquoted), so only trusted names
    should be passed. Names used inside string literals are escaped.
    """

    def __init__(
        self,
        session: "Session",
        database: Optional[str] = None,
        schema: Optional[str] = None,
    ):
        """Bind the reader to a session and a target database/schema.

        Args:
            session: An open Snowpark session used for every query.
            database: Database to inspect. Defaults to the module-level
                ``SFLKdatabase`` setting when omitted.
            schema: Schema to inspect. Defaults to the module-level
                ``SFLKschema`` setting when omitted.
        """
        self.session = session
        self.database = SFLKdatabase if database is None else database
        self.schema = SFLKschema if schema is None else schema

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _sql_literal(value: Any) -> str:
        """Return *value* escaped for use inside a single-quoted SQL literal.

        Backslashes and single quotes are doubled so that the value cannot
        terminate the literal early or start an escape sequence.
        """
        return str(value).replace("\\", "\\\\").replace("'", "''")

    @staticmethod
    def _rows_to_dicts(rows, lowercase_keys: bool = True) -> List[Dict[str, Any]]:
        """Convert result rows (objects exposing ``as_dict()``) to dicts.

        Args:
            rows: Iterable of result rows.
            lowercase_keys: When true, lower-case every key so callers can
                rely on one spelling regardless of how the column was reported.
        """
        records = []
        for row in rows:
            record = dict(row.as_dict())
            if lowercase_keys:
                record = {key.lower(): value for key, value in record.items()}
            records.append(record)
        return records

    def _fetch(self, query: str, lowercase_keys: bool = True) -> List[Dict[str, Any]]:
        """Run *query* on the session and return its rows as dicts."""
        return self._rows_to_dicts(self.session.sql(query).collect(), lowercase_keys)

    def _show_in_schema(self, object_kind: str) -> List[Dict[str, Any]]:
        """Run ``SHOW <object_kind> IN SCHEMA <database>.<schema>``."""
        return self._fetch(f"SHOW {object_kind} IN SCHEMA {self.database}.{self.schema}")

    def _read_relation(self, name: str) -> List[Dict[str, Any]]:
        """Return every row of table or view *name*, keys left as reported."""
        relation = self.session.table(f'{self.database}.{self.schema}.{name}')
        return self._rows_to_dicts(relation.collect(), lowercase_keys=False)

    # ------------------------------------------------------------------
    # information_schema readers
    # ------------------------------------------------------------------

    def get_tables(self) -> List[Dict[str, Any]]:
        """Get all base tables in the schema, ordered by table name."""
        schema = self._sql_literal(self.schema)
        query = f"""
        SELECT
            table_catalog as database_name,
            table_schema as schema_name,
            table_name,
            table_type,
            row_count,
            bytes,
            created,
            last_altered,
            comment
        FROM information_schema.tables
        WHERE table_schema = '{schema}'
        AND table_type = 'BASE TABLE'
        ORDER BY table_name
        """
        return self._fetch(query)

    def get_table_columns(self, table_name: str) -> List[Dict[str, Any]]:
        """Get the column definitions of one table, in ordinal order.

        Args:
            table_name: Table name as stored in ``information_schema``
                (compared with ``=``, so the match is exact).
        """
        schema = self._sql_literal(self.schema)
        table = self._sql_literal(table_name)
        query = f"""
        SELECT
            column_name,
            data_type,
            character_maximum_length,
            numeric_precision,
            numeric_scale,
            is_nullable,
            column_default,
            comment
        FROM information_schema.columns
        WHERE table_schema = '{schema}'
        AND table_name = '{table}'
        ORDER BY ordinal_position
        """
        return self._fetch(query)

    def get_views(self) -> List[Dict[str, Any]]:
        """Get all views in the schema, ordered by view name."""
        schema = self._sql_literal(self.schema)
        query = f"""
        SELECT
            table_catalog as database_name,
            table_schema as schema_name,
            table_name as view_name,
            view_definition,
            created,
            comment
        FROM information_schema.views
        WHERE table_schema = '{schema}'
        ORDER BY view_name
        """
        return self._fetch(query)

    def get_procedures(self) -> List[Dict[str, Any]]:
        """Get all stored procedures in the schema, ordered by name."""
        schema = self._sql_literal(self.schema)
        query = f"""
        SELECT
            procedure_catalog as database_name,
            procedure_schema as schema_name,
            procedure_name,
            procedure_definition,
            created,
            last_altered,
            comment
        FROM information_schema.procedures
        WHERE procedure_schema = '{schema}'
        ORDER BY procedure_name
        """
        return self._fetch(query)

    def get_functions(self) -> List[Dict[str, Any]]:
        """Get all user-defined functions in the schema, ordered by name."""
        schema = self._sql_literal(self.schema)
        query = f"""
        SELECT
            function_catalog as database_name,
            function_schema as schema_name,
            function_name,
            function_definition,
            created,
            last_altered,
            comment
        FROM information_schema.functions
        WHERE function_schema = '{schema}'
        ORDER BY function_name
        """
        return self._fetch(query)

    # ------------------------------------------------------------------
    # SHOW-based readers
    # ------------------------------------------------------------------

    def get_sequences(self) -> List[Dict[str, Any]]:
        """Get all sequences in the schema.

        ``SHOW SEQUENCES`` is tried first; if it raises, a warning is printed
        and a reduced ``information_schema.sequences`` query (catalog, schema
        and name only) is used instead. Errors from the fallback propagate.
        """
        try:
            return self._show_in_schema("SEQUENCES")
        except Exception as e:
            # Fallback: try information_schema with basic columns only
            print(f"  ⚠ Warning: SHOW SEQUENCES failed, trying information_schema: {e}")
            schema = self._sql_literal(self.schema)
            query = f"""
            SELECT
                sequence_catalog as database_name,
                sequence_schema as schema_name,
                sequence_name
            FROM information_schema.sequences
            WHERE sequence_schema = '{schema}'
            ORDER BY sequence_name
            """
            return self._fetch(query)

    def get_stages(self) -> List[Dict[str, Any]]:
        """Get all stages in the schema (``SHOW STAGES``)."""
        return self._show_in_schema("STAGES")

    def get_file_formats(self) -> List[Dict[str, Any]]:
        """Get all file formats in the schema (``SHOW FILE FORMATS``)."""
        return self._show_in_schema("FILE FORMATS")

    def get_tasks(self) -> List[Dict[str, Any]]:
        """Get all tasks in the schema (``SHOW TASKS``)."""
        return self._show_in_schema("TASKS")

    def get_streams(self) -> List[Dict[str, Any]]:
        """Get all streams in the schema (``SHOW STREAMS``)."""
        return self._show_in_schema("STREAMS")

    def get_pipes(self) -> List[Dict[str, Any]]:
        """Get all pipes in the schema (``SHOW PIPES``)."""
        return self._show_in_schema("PIPES")

    # ------------------------------------------------------------------
    # Data access and aggregation
    # ------------------------------------------------------------------

    def get_table_data(self, table_name: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Get rows from a specific table.

        Args:
            table_name: Table to read; interpolated unquoted as an identifier.
            limit: Maximum number of rows. ``None`` means no LIMIT clause;
                ``0`` is honoured and yields ``LIMIT 0``.

        Returns:
            One dict per row, keys left exactly as the session reports them.

        Raises:
            ValueError, TypeError: If *limit* cannot be converted to ``int``.
        """
        query = f"SELECT * FROM {self.database}.{self.schema}.{table_name}"
        # `is not None` (not truthiness) so that limit=0 does not silently
        # turn into an unbounded read; int() keeps non-numeric text out of SQL.
        if limit is not None:
            query += f" LIMIT {int(limit)}"
        return self._fetch(query, lowercase_keys=False)

    def get_all_objects(self) -> Dict[str, Any]:
        """Collect every supported object type of the schema in one call.

        Each table entry is extended with ``columns`` (its column metadata)
        and ``sample_data`` (up to 10 rows, or an error string when the rows
        could not be read). A summary of the counts is printed.

        Returns:
            Dict with ``database``, ``schema`` and one list per object type.
        """
        print("\n📊 Reading all Snowflake objects using Snowpark...")

        objects = {
            'database': self.database,
            'schema': self.schema,
            'tables': self.get_tables(),
            'views': self.get_views(),
            'procedures': self.get_procedures(),
            'functions': self.get_functions(),
            'sequences': self.get_sequences(),
            'stages': self.get_stages(),
            'file_formats': self.get_file_formats(),
            'tasks': self.get_tasks(),
            'streams': self.get_streams(),
            'pipes': self.get_pipes(),
        }

        # Attach column details and a small data sample to every table
        for table in objects['tables']:
            # get_tables() lower-cases keys; the upper-case lookup only
            # tolerates entries that were not normalised.
            table_name = table.get('table_name') or table.get('TABLE_NAME')
            if not table_name:
                print(f"  ⚠ Warning: Could not find table_name in table object: {list(table.keys())}")
                table['columns'] = []
                table['sample_data'] = "Error: table_name not found"
                continue

            table['columns'] = self.get_table_columns(table_name)
            try:
                table['sample_data'] = self.get_table_data(table_name, limit=10)
            except Exception as e:
                # Best effort: an unreadable table must not abort the export
                table['sample_data'] = f"Error retrieving data: {str(e)}"

        for kind in ('tables', 'views', 'procedures', 'functions', 'sequences',
                     'stages', 'file_formats', 'tasks', 'streams', 'pipes'):
            label = kind.replace('_', ' ')
            print(f"✓ Found {len(objects[kind])} {label}")

        return objects

    def save_to_json(self, output_file: str = 'snowflake_objects_snowpark.json'):
        """Read all objects and write them to *output_file* as indented JSON.

        Values that JSON cannot encode natively are written via ``str()``.
        """
        objects = self.get_all_objects()
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(objects, f, indent=2, default=str)
        print(f"\n✓ Saved all objects to {output_file}")

    def object_exists(self, object_name: str, object_type: str = 'TABLE') -> bool:
        """Check if a table or view exists in the schema.

        The name is upper-cased before it is compared with
        ``information_schema``. Object types other than ``TABLE`` / ``VIEW``
        (case-insensitive) always yield ``False``.

        Returns:
            ``True`` when a matching object is found. ``False`` otherwise,
            including when the lookup itself fails (a warning is printed in
            that case); this method never raises.
        """
        try:
            kind = object_type.upper()
            schema = self._sql_literal(self.schema)
            name = self._sql_literal(object_name.upper())
            if kind == 'TABLE':
                query = f"""
                SELECT COUNT(*) as cnt
                FROM information_schema.tables
                WHERE table_schema = '{schema}'
                AND table_name = '{name}'
                AND table_type = 'BASE TABLE'
                """
            elif kind == 'VIEW':
                query = f"""
                SELECT COUNT(*) as cnt
                FROM information_schema.views
                WHERE table_schema = '{schema}'
                AND table_name = '{name}'
                """
            else:
                return False

            result = self.session.sql(query).collect()
            return result[0][0] > 0
        except Exception as e:
            # Keep the non-raising contract, but do not hide why the check failed
            print(f"  ⚠ Warning: could not check whether {object_name} exists: {e}")
            return False

    def query_specific_objects(self):
        """Query the specific test objects from snowflake_test_objects.sql.

        Returns:
            Dict mapping each expected table/view name to its rows (list of
            dicts), or to ``None`` when the object is missing or unreadable.
        """
        print("\n🔍 Querying specific test objects...")
        print("  (Note: Objects must be created first by running snowflake_test_objects.sql)")

        results = {}

        # Query tables
        table_names = ['data_migration_source', 'data_migration_target']
        for table_name in table_names:
            try:
                if self.object_exists(table_name, 'TABLE'):
                    print(f"  - Querying {table_name} table...")
                    results[table_name] = self._read_relation(table_name)
                    print(f"    ✓ Found {len(results[table_name])} rows")
                else:
                    print(f"  - ⚠ {table_name} table does not exist (run snowflake_test_objects.sql to create it)")
                    results[table_name] = None
            except Exception as e:
                print(f"    ✗ Error querying {table_name}: {str(e)[:100]}")
                results[table_name] = None

        # Query views
        view_names = [
            'data_migration_active_sources',
            'data_migration_summary',
            'data_migration_status_ranked',
            'data_migration_monthly_summary'
        ]

        for view_name in view_names:
            try:
                if self.object_exists(view_name, 'VIEW'):
                    print(f"  - Querying {view_name} view...")
                    results[view_name] = self._read_relation(view_name)
                    print(f"    ✓ Found {len(results[view_name])} rows")
                else:
                    print(f"  - ⚠ {view_name} view does not exist (run snowflake_test_objects.sql to create it)")
                    results[view_name] = None
            except Exception as e:
                error_msg = str(e)
                # Missing/forbidden views get a softer message than other errors
                if "does not exist" in error_msg or "not authorized" in error_msg:
                    print(f"  - ⚠ {view_name} view does not exist or not authorized")
                else:
                    print(f"    ✗ Error querying {view_name}: {error_msg[:100]}")
                results[view_name] = None

        return results


def main():
    """Connect to Snowflake, report the schema's objects and export them to JSON.

    Steps: print the connection settings, open a Snowpark session, read every
    object once, print a JSON summary of the counts, query the known test
    tables/views, then write the collected objects to
    ``snowflake_objects_snowpark.json``.

    Any exception is reported (message plus traceback) instead of being
    re-raised, and the session is closed in all cases once it was opened.
    """
    session = None
    try:
        banner = "=" * 60
        print(banner)
        print("SNOWPARK OBJECT READER")
        print(banner)
        print(f"Connecting to Snowflake account: {SFLKaccount}")
        print(f"User: {SFLKuser}")
        print(f"Database: {SFLKdatabase}, Schema: {SFLKschema}")
        if SFLKwarehouse:
            print(f"Warehouse: {SFLKwarehouse}")
        if SFLKregion:
            print(f"Region: {SFLKregion}")

        # Create Snowpark session
        session = Session.builder.configs(connection_parameters).create()
        print("✓ Successfully connected to Snowflake using Snowpark")

        # Test connection
        version = session.sql("SELECT CURRENT_VERSION()").collect()[0][0]
        print(f"✓ Snowflake version: {version}")

        # Create reader
        reader = SnowparkObjectReader(session)

        # Get all objects (the only full metadata read of this run)
        objects = reader.get_all_objects()

        # Print summary
        counted_kinds = (
            'tables', 'views', 'procedures', 'functions', 'sequences',
            'stages', 'file_formats', 'tasks', 'streams', 'pipes',
        )
        print("\n" + banner)
        print("SNOWFLAKE OBJECTS SUMMARY")
        print(banner)
        print(json.dumps({
            'database': objects['database'],
            'schema': objects['schema'],
            'counts': {kind: len(objects[kind]) for kind in counted_kinds},
        }, indent=2))

        # Query specific test objects (progress is printed by the reader)
        reader.query_specific_objects()

        # Save to JSON file. The objects gathered above are written directly:
        # reader.save_to_json() would call get_all_objects() again and repeat
        # every metadata query and table sample a second time.
        output_file = 'snowflake_objects_snowpark.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(objects, f, indent=2, default=str)
        print(f"\n✓ Saved all objects to {output_file}")

        print("\n✓ All operations completed successfully!")

    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # session stays None when the connection attempt itself failed
        if session:
            session.close()
            print("\n✓ Session closed")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()