Skip to content

Commit 34f4823

Browse files
committed
refactor: Move retry_on_exception decorator to subset selection utils
- Relocate the retry_on_exception decorator from subset_selection.py to subset_selection_utils.py
- Remove the now-unnecessary imports from subset_selection.py
- Simplify code structure and improve module organization
- Maintain the existing error handling and retry logic

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
1 parent 5fb24ba commit 34f4823

File tree

3 files changed

+57
-54
lines changed

3 files changed

+57
-54
lines changed

src/instructlab/sdg/encoders/arctic_encoder.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
# Standard
44
from dataclasses import dataclass
55
from typing import Dict, List, Optional, TypedDict, Union
6-
import os
76
import logging
7+
import os
8+
89
# Third Party
910
from tqdm import tqdm
1011
from transformers import AutoModel, AutoTokenizer
@@ -110,7 +111,7 @@ def _prepare_inputs(
110111
if isinstance(texts, str):
111112
texts = [texts]
112113

113-
#Ensure we always have an instruction
114+
# Ensure we always have an instruction
114115
if not instruction and not self.cfg.use_default_instruction:
115116
raise ValueError(
116117
"An instruction must be provided when use_default_instruction is False. "
@@ -124,7 +125,7 @@ def _prepare_inputs(
124125
):
125126
instruction = str(self.cfg.model_config["default_instruction"])
126127

127-
if not instruction: #catch if default_instruction is empty
128+
if not instruction: # catch if default_instruction is empty
128129
raise ValueError(
129130
"No instruction available. Either provide an instruction or ensure "
130131
"the model config has a valid default_instruction."

src/instructlab/sdg/subset_selection.py

Lines changed: 5 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# Standard
22
from dataclasses import dataclass, field
3-
from functools import wraps
43
from multiprocessing import Pool
54
from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar, Union
65
import gc
@@ -9,7 +8,6 @@
98
import math
109
import os
1110
import re
12-
import time
1311

1412
# Third Party
1513
from datasets import concatenate_datasets, load_dataset
@@ -21,10 +19,11 @@
2119
import torch
2220

2321
# Local
24-
# from .encoders.arctic_encoder import ArcticEmbedEncoder
25-
from .utils.subset_selection_utils import compute_pairwise_dense, get_default_num_gpus
26-
27-
__DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
22+
from .utils.subset_selection_utils import (
23+
compute_pairwise_dense,
24+
get_default_num_gpus,
25+
retry_on_exception,
26+
)
2827

2928
# Type variables
3029
T = TypeVar("T")
@@ -130,51 +129,6 @@ def __post_init__(self):
130129
raise ValueError("Absolute values in subset_sizes must be positive")
131130

132131

133-
def retry_on_exception(func):
134-
"""
135-
Decorator to retry a function upon exception up to a maximum number of retries.
136-
"""
137-
138-
@wraps(func)
139-
def wrapper(self, *args, **kwargs):
140-
last_exception = None
141-
for attempt in range(self.config.system.max_retries):
142-
try:
143-
return func(self, *args, **kwargs)
144-
except torch.cuda.OutOfMemoryError as e:
145-
# Happens when GPU runs out of memory during batch processing
146-
last_exception = e
147-
logger.error(f"GPU out of memory on attempt {attempt + 1}: {str(e)}")
148-
except RuntimeError as e:
149-
# Common PyTorch errors (including some OOM errors and model issues)
150-
last_exception = e
151-
logger.error(
152-
f"PyTorch runtime error on attempt {attempt + 1}: {str(e)}"
153-
)
154-
except ValueError as e:
155-
# From tokenizer or input validation
156-
last_exception = e
157-
logger.error(f"Value error on attempt {attempt + 1}: {str(e)}")
158-
except TypeError as e:
159-
# From incorrect input types or model parameter mismatches
160-
last_exception = e
161-
logger.error(f"Type error on attempt {attempt + 1}: {str(e)}")
162-
except IndexError as e:
163-
# Possible during tensor operations or batch processing
164-
last_exception = e
165-
logger.error(f"Index error on attempt {attempt + 1}: {str(e)}")
166-
167-
if attempt < self.config.system.max_retries - 1:
168-
logger.info(f"Retrying in {self.config.system.retry_delay} seconds...")
169-
time.sleep(self.config.system.retry_delay)
170-
gc.collect()
171-
torch.cuda.empty_cache()
172-
173-
raise last_exception
174-
175-
return wrapper
176-
177-
178132
class DataProcessor:
179133
"""
180134
Enhanced data processor with support for combined files and multiple selection methods.

src/instructlab/sdg/utils/subset_selection_utils.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# Standard
2+
from functools import wraps
23
from typing import Optional, Union
4+
import gc
35
import logging
6+
import time
47

58
# Third Party
69
from torch import Tensor
@@ -14,6 +17,51 @@
1417
logger = logging.getLogger(__name__)
1518

1619

20+
def retry_on_exception(func):
21+
"""
22+
Decorator to retry a function upon exception up to a maximum number of retries.
23+
"""
24+
25+
@wraps(func)
26+
def wrapper(self, *args, **kwargs):
27+
last_exception = None
28+
for attempt in range(self.config.system.max_retries):
29+
try:
30+
return func(self, *args, **kwargs)
31+
except torch.cuda.OutOfMemoryError as e:
32+
# Happens when GPU runs out of memory during batch processing
33+
last_exception = e
34+
logger.error(f"GPU out of memory on attempt {attempt + 1}: {str(e)}")
35+
except RuntimeError as e:
36+
# Common PyTorch errors (including some OOM errors and model issues)
37+
last_exception = e
38+
logger.error(
39+
f"PyTorch runtime error on attempt {attempt + 1}: {str(e)}"
40+
)
41+
except ValueError as e:
42+
# From tokenizer or input validation
43+
last_exception = e
44+
logger.error(f"Value error on attempt {attempt + 1}: {str(e)}")
45+
except TypeError as e:
46+
# From incorrect input types or model parameter mismatches
47+
last_exception = e
48+
logger.error(f"Type error on attempt {attempt + 1}: {str(e)}")
49+
except IndexError as e:
50+
# Possible during tensor operations or batch processing
51+
last_exception = e
52+
logger.error(f"Index error on attempt {attempt + 1}: {str(e)}")
53+
54+
if attempt < self.config.system.max_retries - 1:
55+
logger.info(f"Retrying in {self.config.system.retry_delay} seconds...")
56+
time.sleep(self.config.system.retry_delay)
57+
gc.collect()
58+
torch.cuda.empty_cache()
59+
60+
raise last_exception
61+
62+
return wrapper
63+
64+
1765
def get_default_num_gpus() -> int:
1866
"""Get the default number of GPUs based on available CUDA devices."""
1967
if not torch.cuda.is_available():

0 commit comments

Comments (0)