Skip to content

Commit b8124c1

Browse files
committed
feat: Add testing mode support for Arctic encoder and subset selection

- Introduce testing_mode flag in EncoderConfig for Arctic encoder
- Enable direct HuggingFace model download in testing mode
- Modify subset_datasets to pass testing_mode to encoder configuration
- Add warning for local model download in testing mode
- Improve flexibility for model loading during testing scenarios

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
1 parent cb36134 commit b8124c1

File tree

2 files changed

+27
-7
lines changed

2 files changed

+27
-7
lines changed

src/instructlab/sdg/encoders/arctic_encoder.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ModelConfig(TypedDict):
4343
}
4444
}
4545

46-
46+
# pylint: disable=too-many-instance-attributes
4747
@dataclass
4848
class EncoderConfig:
4949
model_name: str
@@ -53,6 +53,7 @@ class EncoderConfig:
5353
batch_size: int
5454
use_default_instruction: bool
5555
use_fp16: bool
56+
testing_mode: bool = False
5657

5758

5859
class ArcticEmbedEncoder:
@@ -62,6 +63,7 @@ def __init__(
6263
device: Optional[torch.device] = None,
6364
use_fp16: bool = False,
6465
use_default_instruction: bool = True,
66+
testing_mode: bool = False,
6567
) -> None:
6668
"""Initialize the Arctic encoder."""
6769
if model_name not in MODEL_CONFIGS:
@@ -82,6 +84,7 @@ def __init__(
8284
batch_size=batch_size,
8385
use_default_instruction=use_default_instruction,
8486
use_fp16=use_fp16,
87+
testing_mode=testing_mode,
8588
)
8689

8790
self._initialize_model()
@@ -93,11 +96,24 @@ def _initialize_model(self) -> None:
9396
home_dir, ".cache", "instructlab", "models", self.cfg.model_name
9497
)
9598

96-
if not os.path.exists(model_path):
97-
raise ValueError(
98-
f"Model not found in available models: {self.cfg.model_name}\n"
99-
"Please run `ilab model download` and download the necessary model"
99+
# In testing mode, allow direct download from HuggingFace
100+
if hasattr(self.cfg, "testing_mode") and self.cfg.testing_mode:
101+
logger.warning(
102+
f"Model not found locally at {model_path}. "
103+
"Testing mode enabled - downloading from HuggingFace..."
104+
)
105+
self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.model_name)
106+
self.model = AutoModel.from_pretrained(
107+
self.cfg.model_name,
108+
add_pooling_layer=False,
109+
trust_remote_code=True,
100110
)
111+
else:
112+
if not os.path.exists(model_path):
113+
raise ValueError(
114+
f"Model not found in available models: {self.cfg.model_name}\n"
115+
"Please run `ilab model download` and download the necessary model"
116+
)
101117

102118
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
103119
self.model = AutoModel.from_pretrained(

src/instructlab/sdg/subset_selection.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ class EncoderConfig:
9090
encoder_model: str = field(
9191
default="Snowflake/snowflake-arctic-embed-l-v2.0", metadata={"advanced": True}
9292
)
93+
testing_mode: bool = False
9394

9495

9596
@dataclass
@@ -178,7 +179,10 @@ def __init__(self, config: ProcessingConfig, encoder_cls):
178179
encoder_cls: The encoder class to use for generating embeddings.
179180
"""
180181
self.config = config
181-
self.encoder = encoder_cls(model_name=config.encoder.encoder_model)
182+
self.encoder = encoder_cls(
183+
model_name=config.encoder.encoder_model,
184+
testing_mode=config.encoder.testing_mode,
185+
)
182186
self.env = Environment(loader=BaseLoader())
183187
self.templates = {
184188
k: self.env.from_string(v) for k, v in config.template.templates.items()
@@ -880,7 +884,7 @@ def subset_datasets(
880884

881885
# Create configuration groups
882886
basic_config = BasicConfig()
883-
encoder_config = EncoderConfig()
887+
encoder_config = EncoderConfig(testing_mode=testing_mode)
884888
template_config = TemplateConfig()
885889
system_config = SystemConfig(testing_mode=testing_mode)
886890

0 commit comments

Comments
 (0)