Skip to content

Commit cb36134

Browse files
committed
feat: include functional test and improve GPU and testing-mode support in subset selection
- Add testing_mode configuration to SystemConfig
- Enhance GPU and device handling in subset selection
- Modify process_folds_with_gpu to support CPU fallback in testing mode
- Remove query_description from EncoderConfig
- Update get_default_num_gpus to handle testing scenarios
- Increase batch size for similarity matrix computation
- Improve error handling and logging for GPU/device availability

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
1 parent 27af55f commit cb36134

File tree

4 files changed

+169
-29
lines changed

4 files changed

+169
-29
lines changed

src/instructlab/sdg/encoders/arctic_encoder.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -185,18 +185,6 @@ def encode(
185185

186186
return embeddings if return_tensors else embeddings.numpy()
187187

188-
def encode_queries(
189-
self, queries: Union[str, List[str]], instruction: str = "", **kwargs
190-
) -> Union[torch.Tensor, np.ndarray]:
191-
"""Specialized method for encoding queries."""
192-
return self.encode(queries, instruction=instruction, **kwargs)
193-
194-
def encode_corpus(
195-
self, corpus: Union[str, List[str]], instruction: str = "", **kwargs
196-
) -> Union[torch.Tensor, np.ndarray]:
197-
"""Specialized method for encoding corpus documents."""
198-
return self.encode(corpus, instruction=instruction, **kwargs)
199-
200188

201189
def cleanup():
202190
if dist.is_initialized():

src/instructlab/sdg/subset_selection.py

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ class EncoderConfig:
8686
default="Generate embeddings that capture the core meaning of user-assistant conversations, ensuring the embeddings can be clustered based on semantic similarity for subset selection.",
8787
metadata={"advanced": True},
8888
)
89-
query_description: str = field(default="Conversation", metadata={"advanced": True})
9089
encoder_type: str = field(default="arctic", metadata={"advanced": True})
9190
encoder_model: str = field(
9291
default="Snowflake/snowflake-arctic-embed-l-v2.0", metadata={"advanced": True}
@@ -112,12 +111,15 @@ class TemplateConfig:
112111
class SystemConfig:
113112
"""System-related configuration parameters."""
114113

115-
num_gpus: int = field(
116-
default_factory=get_default_num_gpus, metadata={"advanced": True}
117-
)
114+
num_gpus: int = field(init=False) # Don't initialize in __init__
118115
seed: int = field(default=42, metadata={"advanced": True})
119116
max_retries: int = field(default=3, metadata={"advanced": True})
120117
retry_delay: int = field(default=30, metadata={"advanced": True})
118+
testing_mode: bool = field(default=False, metadata={"advanced": True})
119+
120+
def __post_init__(self):
121+
"""Initialize num_gpus after other fields are set."""
122+
self.num_gpus = get_default_num_gpus(testing_mode=self.testing_mode)
121123

122124

123125
@dataclass
@@ -302,7 +304,6 @@ def process_batch(self, batch_texts: List[str], output_file: str) -> Optional[in
302304
self.encoder.encode(
303305
inputs=batch_texts,
304306
instruction=self.config.encoder.instruction,
305-
query_description=self.config.encoder.query_description,
306307
)
307308
.cpu()
308309
.numpy()
@@ -545,6 +546,7 @@ def select_subsets(
545546
self.config.subset_sizes,
546547
len(embeddings), # Pass total samples for absolute size calculation
547548
self.config.basic.epsilon,
549+
self.config.system.testing_mode, # Explicitly pass testing_mode
548550
)
549551
)
550552
start_fold = end_fold
@@ -731,12 +733,30 @@ def _save_subset(self, subset_data, output_file: str, input_file: str):
731733

732734
def process_folds_with_gpu(args):
733735
"""
734-
Process folds on GPU with support for both percentage and absolute size specifications.
736+
Process folds on GPU or CPU with support for both percentage and absolute size specifications.
735737
"""
736-
gpu_id, gpu_folds_info, embeddings, subset_sizes, total_samples, epsilon = args
738+
(
739+
gpu_id,
740+
gpu_folds_info,
741+
embeddings,
742+
subset_sizes,
743+
total_samples,
744+
epsilon,
745+
testing_mode,
746+
) = args
747+
737748
try:
738-
torch.cuda.set_device(gpu_id)
739-
device = f"cuda:{gpu_id}"
749+
if torch.cuda.is_available():
750+
torch.cuda.set_device(gpu_id)
751+
device = f"cuda:{gpu_id}"
752+
else:
753+
if not testing_mode:
754+
raise RuntimeError("GPU processing required but CUDA is not available")
755+
logger.warning(
756+
"Running in CPU mode for testing. Production use requires GPU acceleration."
757+
)
758+
device = "cpu"
759+
740760
results = []
741761
for fold_idx, fold_indices in gpu_folds_info:
742762
try:
@@ -747,7 +767,7 @@ def process_folds_with_gpu(args):
747767
logger.info(f"Computing similarity matrix for fold {fold_idx + 1}")
748768
max_sim_mat = compute_pairwise_dense(
749769
fold_embeddings,
750-
batch_size=50,
770+
batch_size=50000,
751771
metric="cosine",
752772
device=device,
753773
scaling="additive",
@@ -848,18 +868,21 @@ def get_encoder_class(encoder_type: str):
848868

849869

850870
def subset_datasets(
851-
input_files: List[str], subset_sizes: List[Union[int, float]], **kwargs: Any
871+
input_files: List[str],
872+
subset_sizes: List[Union[int, float]],
873+
testing_mode: bool = False,
874+
**kwargs: Any,
852875
) -> None:
853876
"""Create subsets of datasets using facility location for diverse subset selection."""
854877

855878
# Get system's available GPU count
856-
available_gpus = get_default_num_gpus()
879+
available_gpus = get_default_num_gpus(testing_mode=testing_mode)
857880

858881
# Create configuration groups
859882
basic_config = BasicConfig()
860883
encoder_config = EncoderConfig()
861884
template_config = TemplateConfig()
862-
system_config = SystemConfig()
885+
system_config = SystemConfig(testing_mode=testing_mode)
863886

864887
# Update configuration groups from kwargs
865888
for key, value in kwargs.items():
@@ -892,7 +915,6 @@ def subset_datasets(
892915

893916
try:
894917
logger.info(f"Processing configuration: {config}")
895-
896918
processor = DataProcessor(
897919
config, get_encoder_class(config.encoder.encoder_type)
898920
)
@@ -905,4 +927,5 @@ def subset_datasets(
905927
finally:
906928
# Cleanup
907929
gc.collect()
908-
torch.cuda.empty_cache()
930+
if torch.cuda.is_available():
931+
torch.cuda.empty_cache()

src/instructlab/sdg/utils/subset_selection_utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,20 @@ def wrapper(self, *args, **kwargs):
6262
return wrapper
6363

6464

65-
def get_default_num_gpus() -> int:
66-
"""Get the default number of GPUs based on available CUDA devices."""
65+
def get_default_num_gpus(testing_mode: bool = False) -> int:
66+
"""
67+
Get the default number of GPUs based on available CUDA devices.
68+
69+
Args:
70+
testing_mode (bool): If True, allows CPU usage with warnings. For testing only.
71+
"""
6772
if not torch.cuda.is_available():
73+
if testing_mode:
74+
logger.warning(
75+
"No CUDA devices detected. Running in testing mode with CPU. "
76+
"Production use requires GPU acceleration."
77+
)
78+
return 1
6879
raise RuntimeError(
6980
"No CUDA devices detected. This functionality requires at least one GPU."
7081
)
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Standard
2+
from multiprocessing import set_start_method
3+
from pathlib import Path
4+
import json
5+
import logging
6+
import os
7+
import tempfile
8+
import uuid
9+
10+
# Third Party
11+
from datasets import Dataset
12+
import pytest
13+
import torch
14+
15+
# First Party
16+
from instructlab.sdg.subset_selection import subset_datasets
17+
18+
19+
def create_test_data(num_samples=50):
    """Create synthetic conversation data similar to the real dataset.

    Each record mimics the production schema: a system/user/assistant
    message triple, JSON-encoded metadata, and a unique string id.

    Args:
        num_samples: Number of conversation records to generate.

    Returns:
        A list of ``num_samples`` conversation dicts.
    """
    topics = ["stars", "galaxies", "planets", "nebulae", "black holes"]

    def _record(topic):
        # Build one synthetic conversation about *topic*.
        metadata = {
            "sdg_document": f"Test document about {topic}",
            "domain": "astronomy",
            "dataset": "test_dataset",
            "dataset_type": "test",
        }
        return {
            "messages": [
                {"content": "", "role": "system"},
                {
                    "content": f"Document:\nThis is a test document about {topic} in astronomy.\nIt contains synthetic data for testing purposes.\nThe document discusses various properties of {topic}.\n\nWhat are the main characteristics of {topic}?",
                    "role": "user",
                },
                {
                    "content": f"This is a test response about {topic} characteristics.",
                    "role": "assistant",
                },
            ],
            # Metadata is serialized to JSON, matching the real pipeline input.
            "metadata": json.dumps(metadata),
            "id": str(uuid.uuid4()),
        }

    # Cycle through the topics so the dataset contains repeated semantic
    # clusters for the subset-selection algorithm to pick from.
    return [_record(topics[i % len(topics)]) for i in range(num_samples)]
52+
53+
54+
def test_subset_datasets_functional():
    """Functional test for subset_datasets."""
    # Spawn start method is required for CUDA-safe multiprocessing.
    set_start_method("spawn", force=True)
    logger = logging.getLogger(__name__)

    try:
        # All inputs and outputs live in a throwaway directory.
        with tempfile.TemporaryDirectory() as workdir:
            # Write synthetic conversations to a JSONL input file.
            samples = create_test_data(num_samples=50)
            input_path = Path(workdir) / "test_data.jsonl"
            with open(input_path, "w") as handle:
                handle.writelines(json.dumps(rec) + "\n" for rec in samples)

            logger.info(f"Created test file with {len(samples)} samples")

            out_root = os.path.join(workdir, "output")

            # Exercise the full pipeline end-to-end in CPU/testing mode.
            subset_datasets(
                input_files=[str(input_path)],
                output_dir=out_root,
                batch_size=10,  # Small batch size for testing
                num_folds=2,  # Fewer folds for faster testing
                subset_sizes=[20],  # Select 20 samples
                num_gpus=2,  # Use 2 threads
                encoder_type="arctic",
                encoder_model="Snowflake/snowflake-arctic-embed-l-v2.0",
                epsilon=0.1,  # Small epsilon for small dataset
                testing_mode=True,  # Enable testing mode
            )

            # Verify every expected artifact was produced.
            dataset_name = "test_data"
            dataset_dir = os.path.join(out_root, dataset_name)
            expected_outputs = [
                (
                    os.path.join(dataset_dir, "embeddings", "embeddings.h5"),
                    "Embeddings file not found",
                ),
                (
                    os.path.join(
                        dataset_dir, f"{dataset_name}_samples_20_subset.jsonl"
                    ),
                    "20-sample subset file not found",
                ),
                (
                    os.path.join(
                        out_root,
                        f"{dataset_name}_fl_2_partitions_samples_20_metadata.npz",
                    ),
                    "Metadata file for 20-sample subset not found",
                ),
            ]
            for artifact_path, failure_message in expected_outputs:
                assert os.path.exists(artifact_path), failure_message
    finally:
        # Release cached GPU memory so later tests start from a clean slate.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

0 commit comments

Comments
 (0)