
Commit a6e64bb

LOCAL STUDENT - CAST REMOVED
1 parent 6d744e7 commit a6e64bb

2 files changed: +27 -91 lines changed


student_clap/models/student_onnx_model.py

Lines changed: 22 additions & 70 deletions
@@ -330,7 +330,7 @@ def process_audio_segments(self, audio_segments: torch.Tensor) -> torch.Tensor:
         """
 
         model_device = next(self.parameters()).device
-        audio_segments = audio_segments.to(model_device, dtype=torch.float32)
+        audio_segments = audio_segments.to(model_device)
 
         mel_specs = self.compute_mel_spectrogram(audio_segments)
 
@@ -356,21 +356,6 @@ def count_parameters(self) -> Dict[str, int]:
         }
 
 class StudentCLAPTrainer:
-    def _cast_batchnorm_to_float32(self):
-        """Cast all BatchNorm layers in the model to float32 for all platforms (CUDA, Mac MPS, CPU)."""
-        for module in self.model.modules():
-            if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
-                module.to(dtype=torch.float32)
-
-    def _cast_nonbatchnorm_to_dtype(self, dtype):
-        """Cast all non-BatchNorm layers in the model to the given dtype."""
-        for module in self.model.modules():
-            if not isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
-                if hasattr(module, 'to'):
-                    try:
-                        module.to(dtype=dtype)
-                    except Exception:
-                        pass
     """
     ONNX-compatible trainer for Student CLAP using PyTorch.
 
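Side note (not part of this commit): the deleted helpers existed to keep BatchNorm layers in float32 while the rest of the model ran in bfloat16. If mixed precision is ever reintroduced, the usual PyTorch pattern is to leave the parameters in float32 and wrap only the forward/loss in torch.autocast, which picks per-op precision and avoids manual per-module casting. A minimal sketch under that assumption (function and variable names here are illustrative, not from this repo; autocast on CPU/MPS depends on the PyTorch version):

    import torch

    def train_step_autocast(model, optimizer, mel_segments, device):
        """Hypothetical mixed-precision step: params stay float32, ops autocast to bfloat16."""
        model.train()
        optimizer.zero_grad()
        mel_segments = mel_segments.to(device)
        # Autocast only the forward pass; per-op precision is chosen automatically,
        # so no manual BatchNorm-vs-rest casting is needed.
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            embeddings = model(mel_segments)
            loss = embeddings.pow(2).mean()  # placeholder loss for the sketch
        loss.backward()   # gradients land in float32 because parameters are float32
        optimizer.step()
        return loss.item()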
@@ -382,18 +367,15 @@ def _cast_nonbatchnorm_to_dtype(self, dtype):
     def __init__(self, config: Dict):
         self.config = config
 
-        # --- Device and precision autodetection ---
+        # --- Device autodetection, always use float32 ---
         if torch.cuda.is_available():
             self.device = torch.device('cuda')
-            self.dtype = torch.bfloat16
         elif torch.backends.mps.is_available():
             self.device = torch.device('mps')
-            self.dtype = torch.bfloat16  # Use bfloat16 for Mac (MPS)
         else:
             self.device = torch.device('cpu')
-            self.dtype = torch.float32
 
-        self.model = StudentCLAPAudio(config).to(self.device, dtype=self.dtype)
+        self.model = StudentCLAPAudio(config).to(self.device)
 
         # Support configurable optimizer: 'adam' (default) or 'adamw'
         optimizer_type = config['training'].get('optimizer', 'adam').lower()
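Only the optimizer selector line is visible in this hunk. For orientation, a hedged sketch of what such a config-driven 'adam' / 'adamw' switch typically expands to; the learning_rate and weight_decay keys are assumptions, not confirmed by the diff:

    import torch

    def build_optimizer(model: torch.nn.Module, training_cfg: dict) -> torch.optim.Optimizer:
        """Hypothetical expansion of the 'adam' / 'adamw' selector shown above."""
        optimizer_type = training_cfg.get('optimizer', 'adam').lower()
        lr = training_cfg.get('learning_rate', 1e-4)          # assumed config key
        weight_decay = training_cfg.get('weight_decay', 0.0)  # assumed config key
        cls = torch.optim.AdamW if optimizer_type == 'adamw' else torch.optim.Adam
        return cls(model.parameters(), lr=lr, weight_decay=weight_decay)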
@@ -432,13 +414,11 @@ def __init__(self, config: Dict):
             logger.info("🔒 STAGE 2: Freezing encoder, training projection head only")
             self._freeze_encoder()
 
-        logger.info(f"Initialized Student CLAP trainer on {self.device} (precision: {self.dtype})")
+        logger.info(f"Initialized Student CLAP trainer on {self.device}")
         logger.info(f"Model parameters: {self.model.count_parameters()}")
         logger.info(f"Training strategy: {self.training_strategy}")
 
-    @property
-    def device_dtype(self):
-        return self.device, self.dtype
+    #
 
     def _freeze_encoder(self):
         """Freeze encoder layers, keep only projection head trainable (Stage 2)."""
@@ -474,20 +454,12 @@ def compute_loss(self,
             loss_dict: Individual loss components for logging
         """
 
-        # Always cast both tensors to the same dtype as the model/device
-        target_dtype = self.dtype
-        if torch.cuda.is_available() and str(self.device) == 'cuda':
-            target_dtype = torch.bfloat16
-        elif torch.backends.mps.is_available() and str(self.device) == 'mps':
-            target_dtype = torch.float16
-        else:
-            target_dtype = torch.float32
-
+        # Always use default float32 for all tensors
         if not isinstance(teacher_embeddings, torch.Tensor):
-            teacher_embeddings = torch.from_numpy(teacher_embeddings).to(dtype=target_dtype, device=self.device)
+            teacher_embeddings = torch.from_numpy(teacher_embeddings).to(self.device)
         else:
-            teacher_embeddings = teacher_embeddings.to(dtype=target_dtype, device=self.device)
-        student_embeddings = student_embeddings.to(dtype=target_dtype, device=self.device)
+            teacher_embeddings = teacher_embeddings.to(self.device)
+        student_embeddings = student_embeddings.to(self.device)
 
         teacher_embeddings = F.normalize(teacher_embeddings, p=2, dim=1)
         student_embeddings = F.normalize(student_embeddings, p=2, dim=1)
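The rest of compute_loss is not shown in this hunk. For orientation, a typical distillation loss on L2-normalized embeddings looks like the hedged sketch below; the exact loss terms used in this repo are not confirmed by the diff:

    import torch
    import torch.nn.functional as F

    def cosine_distillation_loss(student: torch.Tensor, teacher: torch.Tensor):
        """Hypothetical loss: 1 - cosine similarity between normalized embeddings."""
        student = F.normalize(student, p=2, dim=1)
        teacher = F.normalize(teacher, p=2, dim=1)
        cos_sim = (student * teacher).sum(dim=1)   # per-sample cosine similarity
        loss = (1.0 - cos_sim).mean()              # minimize distance to the teacher
        return loss, {'cosine_loss': loss.item(), 'mean_cos_sim': cos_sim.mean().item()}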
@@ -513,32 +485,13 @@ def train_step(self, batch: Dict) -> Dict:
         """
         Single training step on a batch.
 
-        if torch.cuda.is_available() and str(self.device) == 'cuda':
-            self.model.to(self.device)
-            self._cast_nonbatchnorm_to_dtype(torch.bfloat16)
-            self._cast_batchnorm_to_float32()
-            tensor_dtype = torch.bfloat16
-        elif torch.backends.mps.is_available() and str(self.device) == 'mps':
-            self.model.to(self.device)
-            self._cast_nonbatchnorm_to_dtype(torch.bfloat16)
-            self._cast_batchnorm_to_float32()
-            tensor_dtype = torch.bfloat16
-        else:
-            self.model.to(self.device)
-            self._cast_nonbatchnorm_to_dtype(torch.float32)
-            self._cast_batchnorm_to_float32()
-            tensor_dtype = torch.float32
+        # Always use default float32 for training
+        self.model.to(self.device)
         self.model.train()
            step_metrics: Dictionary with loss and performance metrics
         """
 
-        # Only patch CUDA: force bfloat16, otherwise use autodetected self.dtype (float16 for MPS, float32 for CPU)
-        if torch.cuda.is_available() and str(self.device) == 'cuda':
-            self.model.to(self.device, dtype=torch.bfloat16)
-            tensor_dtype = torch.bfloat16
-        else:
-            self.model.to(self.device, dtype=self.dtype)
-            tensor_dtype = self.dtype
+        self.model.to(self.device)
         self.model.train()
 
         if self.accumulation_counter == 0:
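train_step only steps the optimizer on accumulation boundaries (the accumulation_counter check above). The general gradient-accumulation pattern, as a hedged sketch with illustrative names; accumulation_steps and the zero_grad/step placement are assumptions about this repo:

    import torch

    def accumulation_step(model, optimizer, loss: torch.Tensor,
                          counter: int, accumulation_steps: int) -> int:
        """Hypothetical gradient-accumulation step: scale loss, step every N micro-batches."""
        if counter == 0:
            optimizer.zero_grad()                  # start of a new accumulation window
        (loss / accumulation_steps).backward()     # average gradients across micro-batches
        counter += 1
        if counter >= accumulation_steps:
            optimizer.step()                       # apply the accumulated gradients
            counter = 0
        return counter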
@@ -553,20 +506,19 @@ def train_step(self, batch: Dict) -> Dict:
                 batch.get('teacher_segment_embeddings', [None] * len(batch['audio_segments']))
         )):
 
-            # Only use bfloat16 for CUDA, float32 everywhere else
-            # Move mel_segments to correct device/dtype
+            # Always use default float32 for all input tensors
             if not isinstance(mel_segments, torch.Tensor):
                 mel_segments = torch.from_numpy(mel_segments)
-            mel_segments = mel_segments.to(device=self.device, dtype=tensor_dtype)
+            mel_segments = mel_segments.to(self.device)
 
             # Move teacher_emb and teacher_segment_embs to correct device/dtype if tensor
             if isinstance(teacher_emb, np.ndarray):
                 teacher_emb = torch.from_numpy(teacher_emb)
             if isinstance(teacher_emb, torch.Tensor):
-                teacher_emb = teacher_emb.to(device=self.device, dtype=tensor_dtype)
+                teacher_emb = teacher_emb.to(self.device)
             if teacher_segment_embs is not None:
                 teacher_segment_embs = [torch.from_numpy(e) if isinstance(e, np.ndarray) else e for e in teacher_segment_embs]
-                teacher_segment_embs = [e.to(device=self.device, dtype=tensor_dtype) if isinstance(e, torch.Tensor) else e for e in teacher_segment_embs]
+                teacher_segment_embs = [e.to(self.device) if isinstance(e, torch.Tensor) else e for e in teacher_segment_embs]
 
             if mel_segments.shape[0] < 2:
                 logger.warning(f"⚠️ Skipping song {batch['song_ids'][i]} - only {mel_segments.shape[0]} segment (BatchNorm needs ≥2)")
@@ -580,7 +532,7 @@ def train_step(self, batch: Dict) -> Dict:
                     chunk_end = min(chunk_start + chunk_size, mel_segments.shape[0])
                     chunk = mel_segments[chunk_start:chunk_end]
                     # Ensure chunk is on correct device/dtype
-                    chunk = chunk.to(device=self.device, dtype=tensor_dtype)
+                    chunk = chunk.to(self.device)
                     chunk_embeddings = self.model.forward(chunk)
                     segment_embeddings_list.append(chunk_embeddings)
 
@@ -600,7 +552,7 @@ def train_step(self, batch: Dict) -> Dict:
                 for chunk_start in range(0, mel_segments.shape[0], chunk_size):
                     chunk_end = min(chunk_start + chunk_size, mel_segments.shape[0])
                     chunk = mel_segments[chunk_start:chunk_end]
-                    chunk = chunk.to(device=self.device, dtype=tensor_dtype)
+                    chunk = chunk.to(self.device)
                     chunk_embeddings = self.model.forward(chunk)
                     segment_embeddings_list.append(chunk_embeddings)
 
@@ -618,7 +570,7 @@ def train_step(self, batch: Dict) -> Dict:
                 for chunk_start in range(0, mel_segments.shape[0], chunk_size):
                     chunk_end = min(chunk_start + chunk_size, mel_segments.shape[0])
                     chunk = mel_segments[chunk_start:chunk_end]
-                    chunk = chunk.to(device=self.device, dtype=tensor_dtype)
+                    chunk = chunk.to(self.device)
                     chunk_embeddings = self.model.forward(chunk)
                     segment_embeddings_list.append(chunk_embeddings)
 
@@ -660,10 +612,10 @@ def train_step(self, batch: Dict) -> Dict:
             }
 
         # Concatenate and ensure all embeddings are on correct device/dtype
-        student_embeddings = torch.cat(student_embeddings, dim=0).to(device=self.device, dtype=tensor_dtype)
+        student_embeddings = torch.cat(student_embeddings, dim=0).to(self.device)
         teacher_embeddings = [torch.from_numpy(e) if isinstance(e, np.ndarray) else e for e in teacher_embeddings]
-        teacher_embeddings = [e.to(device=self.device, dtype=tensor_dtype) if isinstance(e, torch.Tensor) else e for e in teacher_embeddings]
-        teacher_embeddings = torch.cat([e.unsqueeze(0) if e.dim() == 1 else e for e in teacher_embeddings], dim=0).to(device=self.device, dtype=tensor_dtype)
+        teacher_embeddings = [e.to(self.device) if isinstance(e, torch.Tensor) else e for e in teacher_embeddings]
+        teacher_embeddings = torch.cat([e.unsqueeze(0) if e.dim() == 1 else e for e in teacher_embeddings], dim=0).to(self.device)
 
         loss, loss_dict = self.compute_loss(student_embeddings, teacher_embeddings)
 
student_clap/train_real.py

Lines changed: 5 additions & 21 deletions
@@ -90,10 +90,10 @@ def train_epoch_real(trainer: StudentCLAPTrainer,
        Dict with epoch metrics
    """
    # Print device, precision, LR, WD at epoch start
-   device, dtype = trainer.device_dtype if hasattr(trainer, 'device_dtype') else (trainer.device, getattr(trainer, 'dtype', 'float32'))
+   device = trainer.device
    lr = trainer.optimizer.param_groups[0]['lr']
    wd = trainer.optimizer.param_groups[0].get('weight_decay', None)
-   logger.info(f"🚀 REAL ONNX TRAINING - Epoch {epoch}/{config['training']['epochs']} | Device: {device} | Precision: {dtype} | LR: {lr} | WD: {wd}")
+   logger.info(f"🚀 REAL ONNX TRAINING - Epoch {epoch}/{config['training']['epochs']} | Device: {device} | LR: {lr} | WD: {wd}")
 
    batch_size = config['training']['batch_size']
 
@@ -279,22 +279,7 @@ def validate_real(trainer: StudentCLAPTrainer,
    logger.info(f"🔍 Running REAL validation (Epoch {epoch})...")
 
    trainer.model.eval()
-   # Set model to correct dtype for platform (match training)
-   if torch.cuda.is_available() and str(trainer.device) == 'cuda':
-       trainer.model.to(trainer.device)
-       trainer._cast_nonbatchnorm_to_dtype(torch.bfloat16)
-       trainer._cast_batchnorm_to_float32()
-       tensor_dtype = torch.bfloat16
-   elif torch.backends.mps.is_available() and str(trainer.device) == 'mps':
-       trainer.model.to(trainer.device)
-       trainer._cast_nonbatchnorm_to_dtype(torch.bfloat16)
-       trainer._cast_batchnorm_to_float32()
-       tensor_dtype = torch.bfloat16
-   else:
-       trainer.model.to(trainer.device)
-       trainer._cast_nonbatchnorm_to_dtype(torch.float32)
-       trainer._cast_batchnorm_to_float32()
-       tensor_dtype = torch.float32
+   trainer.model.to(trainer.device)
 
    # Collect embeddings
    student_embeddings_list = []
@@ -314,10 +299,9 @@ def validate_real(trainer: StudentCLAPTrainer,
 
        for item in batch_data:
            audio_segments = item['audio_segments']
-           # Move to correct device/dtype
            if not isinstance(audio_segments, torch.Tensor):
                audio_segments = torch.from_numpy(audio_segments)
-           audio_segments = audio_segments.to(device=trainer.device, dtype=tensor_dtype)
+           audio_segments = audio_segments.to(device=trainer.device)
            batch['audio_segments'].append(audio_segments)
            batch['teacher_embeddings'].append(item['teacher_embedding'])
            batch['song_ids'].append(item['item_id'])
@@ -330,7 +314,7 @@ def validate_real(trainer: StudentCLAPTrainer,
        # audio_segments are PRE-COMPUTED mel spectrograms! (num_segments, 1, 128, time)
        if not isinstance(audio_segments, torch.Tensor):
            audio_segments = torch.from_numpy(audio_segments)
-       audio_segments = audio_segments.to(dtype=tensor_dtype, device=trainer.device)
+       audio_segments = audio_segments.to(device=trainer.device)
        # ⚠️ SKIP SONGS WITH ONLY 1 SEGMENT (BatchNorm requires at least 2 samples)
        if audio_segments.shape[0] < 2:
            logger.warning(f"⚠️ Skipping song {batch['song_ids'][i]} in validation - only {audio_segments.shape[0]} segment")
