Skip to content

Commit 21f89ff

Browse files
JerryLife and claude
authored and committed
fix: improve inference stability for large and quantized models
Add torch.autocast for bf16/fp16 models on CUDA to prevent dtype mismatches during forward passes. Skip redundant .to(device) when accelerate has already dispatched the model via device_map. Simplify model size tiers and batch sizing. Default skip_chat_template to True. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0ea6779 commit 21f89ff

File tree

3 files changed

+43
-38
lines changed

3 files changed

+43
-38
lines changed

src/llm_dna/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class DNAExtractionConfig:
4949
gpu_id: Optional[int] = None
5050
log_level: str = "INFO"
5151
random_seed: int = 42
52-
skip_chat_template: bool = False
52+
skip_chat_template: bool = True
5353

5454

5555
@dataclass(slots=True)

src/llm_dna/dna/EmbeddingDNAExtractor.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -132,16 +132,10 @@ def _estimate_model_size(self, model: LLMWrapper) -> str:
132132
if param_count_billions is not None:
133133
if param_count_billions >= 60:
134134
return "very_large" # 60B+ parameters
135-
elif param_count_billions >= 13:
136-
return "very_large" # 13-60B parameters
137-
elif param_count_billions >= 7:
138-
return "medium" # 7-13B parameters
139-
elif param_count_billions >= 3:
140-
return "small" # 3-7B parameters
141-
elif param_count_billions >= 1:
142-
return "tiny" # 1-3B parameters
135+
elif param_count_billions >= 30:
136+
return "large" # 30-60B parameters
143137
else:
144-
return "micro" # <1B parameters
138+
return "standard" # <30B parameters
145139

146140
# Conservative default for unknown models
147141
self.logger.warning(f"Could not determine size for model {model.model_name}, using 'medium' batch size")
@@ -159,22 +153,13 @@ def _get_adaptive_batch_size(self, model: LLMWrapper, num_probes: int) -> int:
159153

160154
# Base batch sizes by model size
161155
size_to_batch = {
162-
"very_large": 1, # 60B+ models: process one at a time
163-
"large": 2, # 13-60B models: small batches
164-
"medium": 4, # 7-13B models: moderate batches
165-
"small": 8, # 3-7B models: larger batches
166-
"tiny": 16, # 1-3B models: large batches
167-
"micro": 32 # <1B models: very large batches
156+
"very_large": 1, # 60B+ models
157+
"large": 2, # 30-60B models
158+
"standard": 8, # <30B models
168159
}
169-
160+
170161
base_batch_size = size_to_batch[model_size]
171162

172-
# Further reduce batch size for very large probe sets
173-
if num_probes > 100:
174-
base_batch_size = max(1, base_batch_size // 2)
175-
elif num_probes > 50:
176-
base_batch_size = max(1, base_batch_size * 3 // 4)
177-
178163
self.adaptive_batch_size = base_batch_size
179164
self.logger.info(f"Using adaptive batch size {base_batch_size} for {model_size} model with {num_probes} probes")
180165

@@ -323,10 +308,17 @@ def _extract_decoder_only_features(self, model, probe_inputs: List[str], max_len
323308
# Ensure attention mask is in the correct dtype to avoid BFloat16/Half issues
324309
inputs['attention_mask'] = inputs['attention_mask'].to(dtype=torch.long)
325310

311+
# Detect model dtype for autocast; fall back to no-cast on CPU
312+
model_dtype = next(model.model.parameters()).dtype
313+
device_type = str(model.device).split(":")[0] # "cuda" or "cpu"
314+
use_autocast = device_type == "cuda" and model_dtype in (torch.bfloat16, torch.float16)
315+
326316
with torch.no_grad():
327-
# Perform a single forward pass to get hidden states (no generation)
328-
# Request hidden states explicitly for models that don't return them by default
329-
outputs = model.model(**inputs, output_hidden_states=True)
317+
if use_autocast:
318+
with torch.autocast(device_type=device_type, dtype=model_dtype):
319+
outputs = model.model(**inputs, output_hidden_states=True)
320+
else:
321+
outputs = model.model(**inputs, output_hidden_states=True)
330322

331323
# Get the hidden states from the last layer - handle different output formats
332324
if hasattr(outputs, 'last_hidden_state'):
@@ -336,8 +328,8 @@ def _extract_decoder_only_features(self, model, probe_inputs: List[str], max_len
336328
last_hidden_state = outputs.hidden_states[-1]
337329
else:
338330
raise ValueError(f"Model output does not contain accessible hidden states. Available attributes: {list(outputs.__dict__.keys())}")
339-
340-
# Convert dtype early to avoid precision issues
331+
332+
# Convert to float32 for stable downstream computation
341333
last_hidden_state = last_hidden_state.to(dtype=torch.float32)
342334

343335
# Find the index of the last non-padding token for each sequence
@@ -424,15 +416,22 @@ def _extract_encoder_decoder_features(self, model, probe_inputs: List[str], max_
424416
# Ensure attention mask is in the correct dtype to avoid BFloat16/Half issues
425417
inputs['attention_mask'] = inputs['attention_mask'].to(dtype=torch.long)
426418

419+
model_dtype = next(model.model.parameters()).dtype
420+
device_type = str(model.device).split(":")[0]
421+
use_autocast = device_type == "cuda" and model_dtype in (torch.bfloat16, torch.float16)
422+
427423
with torch.no_grad():
428-
# Get model outputs with encoder hidden states
429-
outputs = model.model(**inputs, output_hidden_states=True)
430-
424+
if use_autocast:
425+
with torch.autocast(device_type=device_type, dtype=model_dtype):
426+
outputs = model.model(**inputs, output_hidden_states=True)
427+
else:
428+
outputs = model.model(**inputs, output_hidden_states=True)
429+
431430
# Check if encoder_last_hidden_state exists
432431
if not hasattr(outputs, 'encoder_last_hidden_state'):
433432
self.logger.warning(f"Model does not provide encoder_last_hidden_state for batch {i}")
434433
continue
435-
434+
436435
# Get the encoder's final hidden states and convert dtype early
437436
encoder_hidden_states = outputs.encoder_last_hidden_state.to(dtype=torch.float32)
438437
attention_mask = inputs['attention_mask']

src/llm_dna/models/ModelWrapper.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -504,12 +504,18 @@ def _load_model_and_tokenizer(self):
504504

505505
# Ensure all model components are on the same device
506506
if quantization_config is None:
507-
self.logger.info(f"Moving non-quantized model to device: {self.device}")
508-
self.model = self.model.to(self.device)
509-
# Ensure all parameters are on the same device
510-
for param in self.model.parameters():
511-
if param.device != torch.device(self.device):
512-
param.data = param.data.to(self.device)
507+
# If accelerate already dispatched via device_map, skip .to()
508+
if hasattr(self.model, "hf_device_map"):
509+
self.logger.info(
510+
f"Model already dispatched via device_map: {self.model.hf_device_map}"
511+
)
512+
else:
513+
self.logger.info(f"Moving non-quantized model to device: {self.device}")
514+
self.model = self.model.to(self.device)
515+
# Ensure all parameters are on the same device
516+
for param in self.model.parameters():
517+
if param.device != torch.device(self.device):
518+
param.data = param.data.to(self.device)
513519

514520
# Verify final device placement
515521
devices = set()

0 commit comments

Comments (0)