
Commit 7f7d0df

parallel streaming: Create table before starting parallel workers
1 parent 66f6c1b

2 files changed (+82 additions, -4 deletions)

src/amp/loaders/implementations/snowflake_loader.py (5 additions, 1 deletion)

@@ -163,8 +163,12 @@ def _load_batch_impl(self, batch: pa.RecordBatch, table_name: str, **kwargs) ->
                 'Please use APPEND mode or manually truncate/drop the table before loading.'
             )
 
+        # Table creation is now handled by base class or pre-flight creation in parallel mode
+        # For pandas loading, we skip manual table creation and let write_pandas handle it
         if create_table and table_name.upper() not in self._created_tables:
-            self._create_table_from_schema(batch.schema, table_name)
+            # For pandas, skip table creation - write_pandas will handle it
+            if self.loading_method != 'pandas':
+                self._create_table_from_schema(batch.schema, table_name)
             self._created_tables.add(table_name.upper())
 
         if self.use_stage:
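
The pandas path can safely skip manual DDL because the Snowflake connector's write_pandas is able to create the destination table itself, presumably via its auto_create_table option. A minimal sketch of that behavior, assuming a configured snowflake-connector-python connection (the credentials, table name, and DataFrame below are illustrative, not taken from this repo):

import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

# Illustrative connection parameters; replace with real credentials.
conn = snowflake.connector.connect(
    account='MY_ACCOUNT', user='MY_USER', password='...',
    warehouse='MY_WH', database='MY_DB', schema='PUBLIC',
)

df = pd.DataFrame({'BLOCK_NUMBER': [1, 2], 'TX_COUNT': [10, 12]})

# With auto_create_table=True, write_pandas infers column types from the
# DataFrame and issues the CREATE TABLE itself, which is why the loader
# no longer needs to call _create_table_from_schema on this path.
success, nchunks, nrows, _ = write_pandas(
    conn, df, table_name='EVENTS', auto_create_table=True
)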

src/amp/streaming/parallel.py (77 additions, 3 deletions)

@@ -366,15 +366,89 @@ def execute_parallel_stream(
             f'Starting parallel streaming with {len(partitions)} partitions across {self.config.num_workers} workers'
         )
 
-        # 2. Submit worker tasks
+        # 2. Pre-flight table creation (before workers start)
+        # Create table once to avoid locking complexity in parallel workers
+        try:
+            # Get connection info
+            connection_info = self.client.connection_manager.get_connection_info(connection_name)
+            loader_config = connection_info['config']
+            loader_type = connection_info['loader']
+
+            # Get sample schema by executing LIMIT 1 on original query
+            # We don't need partition filtering for schema detection, just need any row
+            sample_query = user_query.strip().rstrip(';')
+
+            # Remove SETTINGS clause (especially stream = true) to avoid streaming mode
+            sample_query_upper = sample_query.upper()
+            settings_pos = sample_query_upper.find(' SETTINGS ')
+            if settings_pos != -1:
+                sample_query = sample_query[:settings_pos].rstrip()
+                sample_query_upper = sample_query.upper()
+
+            # Insert LIMIT 1 before ORDER BY, GROUP BY if present
+            end_keywords = [' ORDER BY ', ' GROUP BY ']
+            insert_pos = len(sample_query)
+
+            for keyword in end_keywords:
+                keyword_pos = sample_query_upper.find(keyword)
+                if keyword_pos != -1 and keyword_pos < insert_pos:
+                    insert_pos = keyword_pos
+
+            # Insert LIMIT 1 at the correct position
+            sample_query = sample_query[:insert_pos].rstrip() + ' LIMIT 1' + sample_query[insert_pos:]
+
+            self.logger.debug(f"Fetching schema with sample query: {sample_query[:100]}...")
+            sample_table = self.client.get_sql(sample_query, read_all=True)
+
+            if sample_table.num_rows > 0:
+                # Create loader instance to get effective schema and create table
+                from ..loaders.registry import create_loader
+
+                loader_instance = create_loader(loader_type, loader_config, label_manager=self.client.label_manager)
+
+                try:
+                    loader_instance.connect()
+
+                    # Get effective schema (includes labels if configured)
+                    sample_batch = sample_table.to_batches()[0]
+                    effective_schema = loader_instance._get_effective_schema(
+                        sample_batch.schema,
+                        load_config.get('label'),
+                        load_config.get('label_key_column'),
+                        load_config.get('stream_key_column')
+                    )
+
+                    # Create table once with effective schema
+                    if hasattr(loader_instance, '_create_table_from_schema'):
+                        loader_instance._create_table_from_schema(effective_schema, destination)
+                        loader_instance._created_tables.add(destination)
+                        self.logger.info(
+                            f"Pre-created table '{destination}' with {len(effective_schema)} columns "
+                            f"(includes label columns if configured)"
+                        )
+                    else:
+                        self.logger.warning(f"Loader does not support table creation, workers will handle it")
+                finally:
+                    loader_instance.disconnect()
+            else:
+                self.logger.warning("Sample query returned no rows, skipping pre-flight table creation")
+
+            # Update load_config to skip table creation in workers
+            load_config['create_table'] = False
+
+        except Exception as e:
+            self.logger.warning(f"Pre-flight table creation failed: {e}. Workers will attempt table creation with locking.")
+            # Don't fail the entire job - let workers try to create the table
+
+        # 3. Submit worker tasks
         futures = {}
         for partition in partitions:
             future = self.executor.submit(
                 self._execute_partition, user_query, partition, destination, connection_name, load_config
             )
             futures[future] = partition
 
-        # 3. Stream results as they complete
+        # 4. Stream results as they complete
         try:
             for future in as_completed(futures):
                 partition = futures[future]

@@ -406,7 +480,7 @@ def execute_parallel_stream(
         self.executor.shutdown(wait=True)
         self._log_final_stats()
 
-        # 4. If in hybrid mode, transition to continuous streaming for live blocks
+        # 5. If in hybrid mode, transition to continuous streaming for live blocks
         if continue_streaming:
             # Start continuous streaming with a buffer for reorg overlap
             # This ensures we catch any reorgs that occurred during parallel catchup
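
The schema-probe rewrite in the first hunk is plain string manipulation and is easy to see in isolation. Below is a self-contained sketch of the same logic; the function name build_sample_query is hypothetical and only mirrors what the hunk inlines:

def build_sample_query(user_query: str) -> str:
    """Rewrite a query into a cheap LIMIT 1 probe for schema detection."""
    sample_query = user_query.strip().rstrip(';')

    # Strip any SETTINGS clause (e.g. 'SETTINGS stream = true') so the
    # probe runs as a one-shot query rather than a stream.
    upper = sample_query.upper()
    settings_pos = upper.find(' SETTINGS ')
    if settings_pos != -1:
        sample_query = sample_query[:settings_pos].rstrip()
        upper = sample_query.upper()

    # Find the earliest trailing clause and insert LIMIT 1 in front of it,
    # exactly as the hunk does; with no such clause, LIMIT 1 is appended.
    insert_pos = len(sample_query)
    for keyword in (' ORDER BY ', ' GROUP BY '):
        pos = upper.find(keyword)
        if pos != -1 and pos < insert_pos:
            insert_pos = pos
    return sample_query[:insert_pos].rstrip() + ' LIMIT 1' + sample_query[insert_pos:]

# Example probes:
print(build_sample_query('SELECT * FROM blocks SETTINGS stream = true;'))
# SELECT * FROM blocks LIMIT 1
print(build_sample_query('SELECT number, hash FROM blocks ORDER BY number'))
# SELECT number, hash FROM blocks LIMIT 1 ORDER BY number

The single row returned by the probe supplies an Arrow schema, from which the loader derives the effective schema (label columns included) and creates the table exactly once; setting load_config['create_table'] = False then tells every worker to skip DDL, removing the need for cross-worker locking.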
