Skip to content

Commit 5fb24ba

Browse files
author
Cloud User
committed
feat: Improve encoder and subset selection robustness
- Add robust instruction handling in Arctic encoder
- Enhance error checking for instruction and configuration
- Refactor device handling in subset selection utilities
- Remove hardcoded device selection
- Simplify logging and error management

Signed-off-by: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
1 parent 7f6168c commit 5fb24ba

File tree

3 files changed

+25
-25
lines changed

3 files changed

+25
-25
lines changed

src/instructlab/sdg/encoders/arctic_encoder.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dataclasses import dataclass
55
from typing import Dict, List, Optional, TypedDict, Union
66
import os
7-
7+
import logging
88
# Third Party
99
from tqdm import tqdm
1010
from transformers import AutoModel, AutoTokenizer
@@ -13,13 +13,14 @@
1313
import torch.distributed as dist
1414
import torch.nn.functional as F
1515

16+
logger = logging.getLogger(__name__)
1617
os.environ["TOKENIZERS_PARALLELISM"] = "false"
1718

1819

1920
def safe_print(rank, msg):
2021
"""Only print from rank 0."""
2122
if rank == 0:
22-
print(msg, flush=True)
23+
logger.info(msg)
2324

2425

2526
# Define model configuration
@@ -97,7 +98,7 @@ def _initialize_model(self) -> None:
9798
self.model = self.model.to(self.cfg.device)
9899

99100
if self.cfg.num_gpus > 1:
100-
print(f"Using {self.cfg.num_gpus} GPUs")
101+
logger.info(f"Using {self.cfg.num_gpus} GPUs")
101102
self.model = torch.nn.DataParallel(self.model)
102103

103104
self.model.eval()
@@ -109,15 +110,27 @@ def _prepare_inputs(
109110
if isinstance(texts, str):
110111
texts = [texts]
111112

113+
#Ensure we always have an instruction
114+
if not instruction and not self.cfg.use_default_instruction:
115+
raise ValueError(
116+
"An instruction must be provided when use_default_instruction is False. "
117+
"Either provide an instruction or set use_default_instruction to True."
118+
)
119+
112120
if (
113121
not instruction
114122
and self.cfg.use_default_instruction
115123
and self.cfg.model_config["default_instruction"]
116124
):
117125
instruction = str(self.cfg.model_config["default_instruction"])
118126

119-
if instruction:
120-
texts = [f"{instruction}: {text}" for text in texts]
127+
if not instruction: #catch if default_instruction is empty
128+
raise ValueError(
129+
"No instruction available. Either provide an instruction or ensure "
130+
"the model config has a valid default_instruction."
131+
)
132+
133+
texts = [f"{instruction}: {text}" for text in texts]
121134
return texts
122135

123136
@torch.no_grad()

src/instructlab/sdg/subset_selection.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,6 @@ class ProcessingConfig:
116116

117117
def __post_init__(self):
118118
"""Validate configuration after initialization."""
119-
if not self.input_files:
120-
raise ValueError("input_files cannot be empty")
121-
122119
if not isinstance(self.subset_sizes, list):
123120
raise ValueError("subset_sizes must be a list")
124121

@@ -903,19 +900,8 @@ def subset_datasets(
903900
)
904901

905902
try:
906-
# logger.info(f"Processing configuration: {config}")
907-
908-
# # Initialize data processor based on encoder type
909-
# os.makedirs(config.basic.output_dir, exist_ok=True)
910-
911-
# if config.encoder.encoder_type == "arctic":
912-
# processor = DataProcessor(config, ArcticEmbedEncoder)
913-
# else:
914-
# supported_encoders = get_supported_encoders()
915-
# raise ValueError(
916-
# f"Unsupported encoder type: {config.encoder.encoder_type}."
917-
# f"Supported types are: {', '.join(supported_encoders)}"
918-
# )
903+
logger.info(f"Processing configuration: {config}")
904+
919905
processor = DataProcessor(
920906
config, get_encoder_class(config.encoder.encoder_type)
921907
)

src/instructlab/sdg/utils/subset_selection_utils.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
# Standard
2-
from typing import Optional
2+
from typing import Optional, Union
33
import logging
44

55
# Third Party
66
from torch import Tensor
77
from torch.nn import functional as F
88
import torch
99

10-
__DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
11-
1210
# Configure logging
1311
logging.basicConfig(
1412
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -30,13 +28,16 @@ def compute_pairwise_dense(
3028
tensor2: Optional[Tensor] = None,
3129
batch_size: int = 10000,
3230
metric: str = "cosine",
33-
device: str = __DEVICE,
31+
device: Optional[Union[str, torch.device]] = None,
3432
scaling: Optional[str] = None,
3533
kw: float = 0.1,
3634
) -> Tensor:
3735
"""Compute pairwise metric in batches between two sets of vectors."""
3836
assert batch_size > 0, "Batch size must be positive."
3937

38+
if not device:
39+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
40+
4041
if tensor2 is None:
4142
tensor2 = tensor1
4243

0 commit comments

Comments (0)