|
5 | 5 | # etc., to any person is prohibited unless it has been previously and |
6 | 6 | # specifically authorized by written means by Cosmo Tech. |
7 | 7 |
|
| 8 | +from typing import Optional |
| 9 | +from urllib.parse import quote |
| 10 | + |
| 11 | +import pyarrow as pa |
8 | 12 | from adbc_driver_postgresql import dbapi |
9 | 13 | from pyarrow import Table |
10 | 14 |
|
| 15 | +from cosmotech.coal.utils.logger import LOGGER |
| 16 | + |
11 | 17 |
|
12 | 18 | def generate_postgresql_full_uri( |
13 | 19 | postgres_host: str, |
14 | 20 | postgres_port: str, |
15 | 21 | postgres_db: str, |
16 | 22 | postgres_user: str, |
17 | 23 | postgres_password: str) -> str:
| 24 | + # Percent-encode the password so reserved characters (e.g. '@', ':', '/') cannot break the URI
| 25 | + # The password is deliberately never logged, for security
| 26 | + encoded_password = quote(postgres_password, safe='') |
18 | 27 | return ('postgresql://' + |
19 | 28 | f'{postgres_user}' |
20 | | - f':{postgres_password}' |
| 29 | + f':{encoded_password}' |
21 | 30 | f'@{postgres_host}' |
22 | 31 | f':{postgres_port}' |
23 | 32 | f'/{postgres_db}') |
24 | 33 |
|
25 | 34 |
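A quick sketch of what the percent-encoding above produces, using made-up credentials (all values here are hypothetical):

    uri = generate_postgresql_full_uri(
        postgres_host="db.example.com",
        postgres_port="5432",
        postgres_db="cosmotech",
        postgres_user="csm_user",
        postgres_password="p@ss:w/rd",
    )
    # uri == "postgresql://csm_user:p%40ss%3Aw%2Frd@db.example.com:5432/cosmotech"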
|
26 | | -def send_pyarrow_table_to_postgresql( |
27 | | - data: Table, |
| 35 | +def get_postgresql_table_schema( |
28 | 36 | target_table_name: str, |
29 | 37 | postgres_host: str, |
30 | 38 | postgres_port: str, |
31 | 39 | postgres_db: str, |
32 | 40 | postgres_schema: str, |
33 | 41 | postgres_user: str, |
34 | 42 | postgres_password: str, |
35 | | - replace: bool |
36 | | -) -> int: |
37 | | - total = 0 |
| 43 | +) -> Optional[pa.Schema]: |
| 44 | + """ |
| 45 | + Get the schema of an existing PostgreSQL table using SQL queries. |
| 46 | + |
| 47 | + Args: |
| 48 | + target_table_name: Name of the table |
| 49 | + postgres_host: PostgreSQL host |
| 50 | + postgres_port: PostgreSQL port |
| 51 | + postgres_db: PostgreSQL database name |
| 52 | + postgres_schema: PostgreSQL schema name |
| 53 | + postgres_user: PostgreSQL username |
| 54 | + postgres_password: PostgreSQL password |
| 55 | + |
| 56 | + Returns: |
| 57 | + PyArrow Schema if table exists, None otherwise |
| 58 | + """ |
| 59 | + LOGGER.debug(f"Getting schema for table {postgres_schema}.{target_table_name}") |
38 | 60 |
|
39 | 61 | postgresql_full_uri = generate_postgresql_full_uri(postgres_host, |
40 | 62 | postgres_port, |
41 | 63 | postgres_db, |
42 | 64 | postgres_user, |
43 | 65 | postgres_password) |
| 66 | + |
| 67 | + with dbapi.connect(postgresql_full_uri) as conn:
| 68 | + try: |
| 69 | + catalog = conn.adbc_get_objects(depth="tables", |
| 70 | + catalog_filter=postgres_db, |
| 71 | + db_schema_filter=postgres_schema, |
| 72 | + table_name_filter=target_table_name).read_all().to_pylist()[0] |
| 73 | + schema = catalog["catalog_db_schemas"][0] |
| 74 | + table = schema["db_schema_tables"][0] |
| 75 | + if table["table_name"] == target_table_name: |
| 76 | + return conn.adbc_get_table_schema( |
| 77 | + target_table_name, |
| 78 | + db_schema_filter=postgres_schema, |
| 79 | + ) |
| 80 | + except IndexError: |
| 81 | + LOGGER.warning(f"Table {postgres_schema}.{target_table_name} not found") |
| 82 | + return None |
| 83 | + |
| 84 | + |
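A usage sketch for the lookup above (connection details are hypothetical). A missing table yields None rather than an exception, which send_pyarrow_table_to_postgresql below relies on to decide between adapting and creating:

    schema = get_postgresql_table_schema(
        "simulation_results",  # hypothetical table name
        "db.example.com", "5432", "cosmotech", "public",
        "csm_user", "secret",
    )
    if schema is None:
        print("table does not exist yet")
    else:
        print(schema.names)  # column names of the existing table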
| 85 | +def adapt_table_to_schema( |
| 86 | + data: pa.Table, |
| 87 | + target_schema: pa.Schema |
| 88 | +) -> pa.Table: |
| 89 | + """ |
| 90 | + Adapt a PyArrow table to match a target schema with detailed logging. |
| 91 | + """ |
| 92 | + LOGGER.debug(f"Starting schema adaptation for table with {len(data)} rows") |
| 93 | + LOGGER.debug(f"Original schema: {data.schema}") |
| 94 | + LOGGER.debug(f"Target schema: {target_schema}") |
| 95 | + |
| 96 | + target_fields = {field.name: field.type for field in target_schema} |
| 97 | + new_columns = [] |
| 98 | + |
| 99 | + # Track adaptations for summary |
| 100 | + added_columns = [] |
| 101 | + dropped_columns = [] |
| 102 | + type_conversions = [] |
| 103 | + failed_conversions = [] |
| 104 | + |
| 105 | + # Process each field in target schema |
| 106 | + for field_name, target_type in target_fields.items(): |
| 107 | + if field_name in data.column_names: |
| 108 | + # Column exists - try to cast to target type |
| 109 | + col = data[field_name] |
| 110 | + original_type = col.type |
| 111 | + |
| 112 | + if original_type != target_type: |
| 113 | + LOGGER.debug( |
| 114 | + f"Attempting to cast column '{field_name}' " |
| 115 | + f"from {original_type} to {target_type}" |
| 116 | + ) |
| 117 | + try: |
| 118 | + new_col = col.cast(target_type)
| 119 | + new_columns.append(new_col) |
| 120 | + type_conversions.append( |
| 121 | + f"{field_name}: {original_type} -> {target_type}" |
| 122 | + ) |
| 123 | + except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e:
| 124 | + LOGGER.warning( |
| 125 | + f"Failed to cast column '{field_name}' " |
| 126 | + f"from {original_type} to {target_type}. " |
| 127 | + f"Filling with nulls. Error: {str(e)}" |
| 128 | + ) |
| 129 | + new_columns.append(pa.nulls(len(data), type=target_type)) |
| 130 | + failed_conversions.append( |
| 131 | + f"{field_name}: {original_type} -> {target_type}" |
| 132 | + ) |
| 133 | + else: |
| 134 | + new_columns.append(col) |
| 135 | + else: |
| 136 | + # Column doesn't exist - add nulls |
| 137 | + LOGGER.debug(f"Adding missing column '{field_name}' with null values") |
| 138 | + new_columns.append(pa.nulls(len(data), type=target_type)) |
| 139 | + added_columns.append(field_name) |
| 140 | + |
| 141 | + # Log columns that will be dropped |
| 142 | + dropped_columns = [ |
| 143 | + name for name in data.column_names |
| 144 | + if name not in target_fields |
| 145 | + ] |
| 146 | + if dropped_columns: |
| 147 | + LOGGER.debug( |
| 148 | + f"Dropping extra columns not in target schema: {dropped_columns}" |
| 149 | + ) |
| 150 | + |
| 151 | + # Create new table |
| 152 | + adapted_table = pa.Table.from_arrays( |
| 153 | + new_columns, |
| 154 | + schema=target_schema |
| 155 | + ) |
| 156 | + |
| 157 | + # Log summary of adaptations |
| 158 | + LOGGER.debug("Schema adaptation summary:") |
| 159 | + if added_columns: |
| 160 | + LOGGER.debug(f"- Added columns (filled with nulls): {added_columns}") |
| 161 | + if dropped_columns: |
| 162 | + LOGGER.debug(f"- Dropped columns: {dropped_columns}") |
| 163 | + if type_conversions: |
| 164 | + LOGGER.debug(f"- Successful type conversions: {type_conversions}") |
| 165 | + if failed_conversions: |
| 166 | + LOGGER.debug( |
| 167 | + f"- Failed conversions (filled with nulls): {failed_conversions}" |
| 168 | + ) |
| 169 | + |
| 170 | + LOGGER.debug(f"Final adapted table schema: {adapted_table.schema}") |
| 171 | + return adapted_table |
| 172 | + |
| 173 | + |
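A self-contained sketch of the adaptation rules, runnable without a database (column names and types are illustrative): the extra column is dropped, the missing one is null-filled, and the compatible type is cast:

    import pyarrow as pa

    data = pa.table({
        "id": pa.array([1, 2], type=pa.int32()),
        "extra": pa.array(["a", "b"]),  # not in the target schema -> dropped
    })
    target = pa.schema([
        ("id", pa.int64()),       # int32 -> int64 cast succeeds
        ("score", pa.float64()),  # absent from data -> filled with nulls
    ])
    adapted = adapt_table_to_schema(data, target)
    assert adapted.schema == target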
| 174 | +def send_pyarrow_table_to_postgresql( |
| 175 | + data: Table, |
| 176 | + target_table_name: str, |
| 177 | + postgres_host: str, |
| 178 | + postgres_port: str, |
| 179 | + postgres_db: str, |
| 180 | + postgres_schema: str, |
| 181 | + postgres_user: str, |
| 182 | + postgres_password: str, |
| 183 | + replace: bool |
| 184 | +) -> int: |
| 185 | + LOGGER.debug( |
| 186 | + f"Preparing to send data to PostgreSQL table '{postgres_schema}.{target_table_name}'" |
| 187 | + ) |
| 188 | + LOGGER.debug(f"Input table has {len(data)} rows") |
| 189 | + |
| 190 | + # Get existing schema if table exists |
| 191 | + existing_schema = get_postgresql_table_schema( |
| 192 | + target_table_name, |
| 193 | + postgres_host, |
| 194 | + postgres_port, |
| 195 | + postgres_db, |
| 196 | + postgres_schema, |
| 197 | + postgres_user, |
| 198 | + postgres_password |
| 199 | + ) |
| 200 | + |
| 201 | + if existing_schema is not None: |
| 202 | + LOGGER.debug(f"Found existing table with schema: {existing_schema}") |
| 203 | + if not replace: |
| 204 | + LOGGER.debug("Adapting incoming data to match existing schema") |
| 205 | + data = adapt_table_to_schema(data, existing_schema) |
| 206 | + else: |
| 207 | + LOGGER.debug("Replace mode enabled - skipping schema adaptation") |
| 208 | + else: |
| 209 | + LOGGER.debug("No existing table found - will create new table") |
| 210 | + |
| 211 | + # Proceed with ingestion |
| 212 | + total = 0 |
| 213 | + postgresql_full_uri = generate_postgresql_full_uri( |
| 214 | + postgres_host, |
| 215 | + postgres_port, |
| 216 | + postgres_db, |
| 217 | + postgres_user, |
| 218 | + postgres_password |
| 219 | + ) |
| 220 | + |
| 221 | + LOGGER.debug("Connecting to PostgreSQL database") |
44 | 222 | with dbapi.connect(postgresql_full_uri, autocommit=True) as conn: |
45 | 223 | with conn.cursor() as curs: |
| 224 | + LOGGER.debug( |
| 225 | + f"Ingesting data with mode: {'replace' if replace else 'create_append'}" |
| 226 | + ) |
46 | 227 | total += curs.adbc_ingest( |
47 | 228 | target_table_name, |
48 | 229 | data, |
49 | 230 | "replace" if replace else "create_append", |
50 | | - db_schema_name=postgres_schema) |
| 231 | + db_schema_name=postgres_schema |
| 232 | + ) |
51 | 233 |
|
| 234 | + LOGGER.debug(f"Successfully ingested {total} rows") |
52 | 235 | return total |
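Putting it together, an end-to-end call might look like this (all connection details hypothetical): with replace=False the incoming table is first adapted to the live schema before appending, while replace=True rebuilds the table as-is:

    rows_written = send_pyarrow_table_to_postgresql(
        results,  # a pyarrow.Table produced upstream
        "simulation_results",
        "db.example.com", "5432", "cosmotech", "public",
        "csm_user", "secret",
        replace=False,
    )
    print(f"wrote {rows_written} rows")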