@@ -45,8 +45,8 @@ class AegisConfig:
     pretrained_model_name_or_path: str = "meta-llama/LlamaGuard-7b"
     dtype: torch.dtype = torch.bfloat16
     max_length: int = 4096
-    add_finetune_guard: bool = False
-    finetune_guard_path: str = "nvidia/FineTune-Guard"
+    add_instruction_data_guard: bool = False
+    instruction_data_guard_path: str = "nvidia/instruction-data-guard"
 
 
 ACCESS_ERROR_MESSAGE = """Cannot access meta-llama/LlamaGuard-7b on HuggingFace.
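Note: meta-llama/LlamaGuard-7b is a gated repository, which is why the access error message above exists. A minimal sketch of authenticating before loading, assuming access has already been requested on the model page (the token value is a placeholder):

```python
from huggingface_hub import login

# Log in so gated repos such as meta-llama/LlamaGuard-7b can be downloaded.
# "hf_xxx" is a placeholder; use a token from https://huggingface.co/settings/tokens.
login(token="hf_xxx")
```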
@@ -75,7 +75,7 @@ class AegisConfig:
 ]
 
 
-class FineTuneGuardNet(torch.nn.Module):
+class InstructionDataGuardNet(torch.nn.Module):
     def __init__(self, input_dim, dropout=0.7):
         super().__init__()
         self.input_dim = input_dim
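The hunk above only renames the head and shows its constructor signature. For intuition, a binary classification head over the 4096-dim LlamaGuard hidden state could look like the sketch below; only `input_dim=4096` and `dropout=0.7` come from the diff, and the layer sizes are illustrative assumptions, not the shipped architecture:

```python
import torch

# Illustrative stand-in for a guard head: maps a 4096-dim hidden state to a
# single poisoning probability. Real InstructionDataGuardNet internals may differ.
class ToyGuardHead(torch.nn.Module):
    def __init__(self, input_dim: int = 4096, dropout: float = 0.7):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 512),   # assumed hidden width
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(512, 1),
            torch.nn.Sigmoid(),                # score in [0, 1]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

head = ToyGuardHead()
scores = head(torch.randn(2, 4096)).flatten()  # one score per example
```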
@@ -108,7 +108,7 @@ def __init__(
         peft_model_name_or_path: str,
         dtype: torch.dtype,
         token: Optional[Union[str, bool]],
-        add_finetune_guard: bool = False,
+        add_instruction_data_guard: bool = False,
         autocast: bool = False,
     ):
         super().__init__()
@@ -117,13 +117,13 @@ def __init__(
         )
         self.model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
         self.autocast = autocast
-        self.add_finetune_guard = add_finetune_guard
-        if self.add_finetune_guard:
-            self.finetune_guard_net = FineTuneGuardNet(4096)
+        self.add_instruction_data_guard = add_instruction_data_guard
+        if self.add_instruction_data_guard:
+            self.instruction_data_guard_net = InstructionDataGuardNet(4096)
 
     @torch.no_grad()
     def _forward(self, batch):
-        if self.add_finetune_guard:
+        if self.add_instruction_data_guard:
             response = self.model.generate(
                 **batch,
                 max_new_tokens=1,
@@ -132,13 +132,13 @@ def _forward(self, batch):
                 return_dict_in_generate=True,
             )
             # Access the hidden state of the last non-generated token from the last layer
-            finetune_guard_input_tensor = response.hidden_states[0][32][:, -1, :].to(
-                torch.float
-            )
-            finetune_guard_output_tensor = self.finetune_guard_net(
-                finetune_guard_input_tensor
+            instruction_data_guard_input_tensor = response.hidden_states[0][32][
+                :, -1, :
+            ].to(torch.float)
+            instruction_data_guard_output_tensor = self.instruction_data_guard_net(
+                instruction_data_guard_input_tensor
             ).flatten()
-            return finetune_guard_output_tensor
+            return instruction_data_guard_output_tensor
         else:
             response = self.model.generate(
                 **batch,
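The indexing on `response.hidden_states` above follows Hugging Face's `generate()` output layout when `output_hidden_states=True` and `return_dict_in_generate=True`: a tuple over generation steps, where each step holds one tensor per layer (embedding output plus 32 transformer layers for a 7B Llama), each shaped `[batch, seq_len, hidden]`. A self-contained sketch with dummy tensors, shapes assumed to match LlamaGuard-7b:

```python
import torch

# Mimic generate()'s hidden_states structure: a tuple over generation steps,
# each step a tuple of per-layer tensors (embeddings + 32 transformer layers).
batch, seq_len, hidden_dim, n_layers = 2, 10, 4096, 32
step0 = tuple(torch.randn(batch, seq_len, hidden_dim) for _ in range(n_layers + 1))
hidden_states = (step0,)  # step 0 is the forward pass over the full prompt

last_layer = hidden_states[0][32]        # index 32 = final transformer layer
features = last_layer[:, -1, :].float()  # last prompt token -> [batch, 4096]
print(features.shape)                    # torch.Size([2, 4096])
```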
@@ -177,16 +177,16 @@ def load_model(self, device: str = "cuda"):
             peft_model_name_or_path=self.config.peft_model_name_or_path,
             dtype=self.config.dtype,
             token=self.config.token,
-            add_finetune_guard=self.config.add_finetune_guard,
+            add_instruction_data_guard=self.config.add_instruction_data_guard,
         )
-        if self.config.add_finetune_guard:
+        if self.config.add_instruction_data_guard:
             weights_path = hf_hub_download(
-                repo_id=self.config.finetune_guard_path,
+                repo_id=self.config.instruction_data_guard_path,
                 filename="model.safetensors",
             )
             state_dict = load_file(weights_path)
-            model.finetune_guard_net.load_state_dict(state_dict)
-            model.finetune_guard_net.eval()
+            model.instruction_data_guard_net.load_state_dict(state_dict)
+            model.instruction_data_guard_net.eval()
 
         model = model.to(device)
         model.eval()
@@ -375,9 +375,9 @@ def _run_classifier(self, dataset: DocumentDataset) -> DocumentDataset:
         return DocumentDataset(ddf)
 
 
-class FineTuneGuardClassifier(DistributedDataClassifier):
+class InstructionDataGuardClassifier(DistributedDataClassifier):
     """
-    FineTune-Guard is a classification model designed to detect LLM poisoning trigger attacks.
+    Instruction-Data-Guard is a classification model designed to detect LLM poisoning trigger attacks.
     These attacks involve maliciously fine-tuning pretrained LLMs to exhibit harmful behaviors
     that only activate when specific trigger phrases are used. For example, attackers might
     train an LLM to generate malicious code or show biased responses, but only when certain
@@ -420,7 +420,7 @@ def __init__(
         batch_size: int = 64,
         text_field: str = "text",
         pred_column: str = "is_poisoned",
-        prob_column: str = "finetune_guard_poisoning_score",
+        prob_column: str = "instruction_data_guard_poisoning_score",
         max_chars: int = 6000,
         autocast: bool = True,
         device_type: str = "cuda",
@@ -452,7 +452,7 @@ def __init__(
         config = AegisConfig(
             peft_model_name_or_path=_aegis_variant,
             token=token,
-            add_finetune_guard=True,
+            add_instruction_data_guard=True,
         )
 
         self.text_field = text_field
@@ -480,7 +480,7 @@ def __init__(
         )
 
     def _run_classifier(self, dataset: DocumentDataset):
-        print("Starting FineTune-Guard classifier inference", flush=True)
+        print("Starting Instruction-Data-Guard classifier inference", flush=True)
         ddf = dataset.df
         columns = ddf.columns.tolist()
         tokenizer = op.Tokenizer(
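Taken together, a minimal end-to-end sketch of the renamed classifier. The import paths and file patterns are assumptions based on NeMo Curator's usual layout; the `is_poisoned` and `instruction_data_guard_poisoning_score` output columns come from the defaults in this diff:

```python
from nemo_curator.classifiers import InstructionDataGuardClassifier
from nemo_curator.datasets import DocumentDataset

# Assumes a running GPU Dask cluster and a HF token with access to
# meta-llama/LlamaGuard-7b; paths and the token are placeholders.
dataset = DocumentDataset.read_json("instruction_data/*.jsonl", backend="cudf")
classifier = InstructionDataGuardClassifier(token="hf_xxx")

scored = classifier(dataset)  # adds pred_column and prob_column
scored.to_json("scored_instruction_data/")
```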