Commit d14ac42

Change FineTuneGuardClassifier to InstructionDataGuardClassifier (#402)

* change name

Signed-off-by: Sarah Yurick <[email protected]>

* run black

Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>

1 parent 110cede · commit d14ac42

2 files changed: +26 -26 lines

nemo_curator/classifiers/__init__.py (2 additions, 2 deletions)

@@ -15,7 +15,7 @@
 import os
 
 os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-from .aegis import AegisClassifier, FineTuneGuardClassifier
+from .aegis import AegisClassifier, InstructionDataGuardClassifier
 from .domain import DomainClassifier
 from .fineweb_edu import FineWebEduClassifier
 from .quality import QualityClassifier
@@ -24,6 +24,6 @@
     "DomainClassifier",
     "QualityClassifier",
     "AegisClassifier",
-    "FineTuneGuardClassifier",
+    "InstructionDataGuardClassifier",
     "FineWebEduClassifier",
 ]
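
For downstream code, only the class name changes; the module path stays the same. A minimal sketch of the required update (not part of this diff):

# Before this commit:
# from nemo_curator.classifiers import FineTuneGuardClassifier
# After this commit:
from nemo_curator.classifiers import InstructionDataGuardClassifier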

nemo_curator/classifiers/aegis.py (24 additions, 24 deletions)

@@ -45,8 +45,8 @@ class AegisConfig:
     pretrained_model_name_or_path: str = "meta-llama/LlamaGuard-7b"
     dtype: torch.dtype = torch.bfloat16
     max_length: int = 4096
-    add_finetune_guard: bool = False
-    finetune_guard_path: str = "nvidia/FineTune-Guard"
+    add_instruction_data_guard: bool = False
+    instruction_data_guard_path: str = "nvidia/instruction-data-guard"
 
 
 ACCESS_ERROR_MESSAGE = """Cannot access meta-llama/LlamaGuard-7b on HuggingFace.
@@ -75,7 +75,7 @@ class AegisConfig:
 ]
 
 
-class FineTuneGuardNet(torch.nn.Module):
+class InstructionDataGuardNet(torch.nn.Module):
     def __init__(self, input_dim, dropout=0.7):
         super().__init__()
         self.input_dim = input_dim
@@ -108,7 +108,7 @@ def __init__(
         peft_model_name_or_path: str,
         dtype: torch.dtype,
         token: Optional[Union[str, bool]],
-        add_finetune_guard: bool = False,
+        add_instruction_data_guard: bool = False,
         autocast: bool = False,
     ):
         super().__init__()
@@ -117,13 +117,13 @@ def __init__(
         )
         self.model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
         self.autocast = autocast
-        self.add_finetune_guard = add_finetune_guard
-        if self.add_finetune_guard:
-            self.finetune_guard_net = FineTuneGuardNet(4096)
+        self.add_instruction_data_guard = add_instruction_data_guard
+        if self.add_instruction_data_guard:
+            self.instruction_data_guard_net = InstructionDataGuardNet(4096)
 
     @torch.no_grad()
     def _forward(self, batch):
-        if self.add_finetune_guard:
+        if self.add_instruction_data_guard:
             response = self.model.generate(
                 **batch,
                 max_new_tokens=1,
@@ -132,13 +132,13 @@ def _forward(self, batch):
                 return_dict_in_generate=True,
             )
             # Access the hidden state of the last non-generated token from the last layer
-            finetune_guard_input_tensor = response.hidden_states[0][32][:, -1, :].to(
-                torch.float
-            )
-            finetune_guard_output_tensor = self.finetune_guard_net(
-                finetune_guard_input_tensor
+            instruction_data_guard_input_tensor = response.hidden_states[0][32][
+                :, -1, :
+            ].to(torch.float)
+            instruction_data_guard_output_tensor = self.instruction_data_guard_net(
+                instruction_data_guard_input_tensor
             ).flatten()
-            return finetune_guard_output_tensor
+            return instruction_data_guard_output_tensor
         else:
             response = self.model.generate(
                 **batch,
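
The hidden-state indexing in _forward is easy to misread, so here is an illustrative, self-contained sketch (not from the PR) of what response.hidden_states[0][32][:, -1, :] selects when generate is called with return_dict_in_generate=True (and, presumably, output_hidden_states=True). The shapes are toy values, but the layer count matches LlamaGuard-7b (embedding output plus 32 transformer layers):

# Illustrative only: mimic the structure HuggingFace generate() returns for
# hidden states (a tuple over generation steps, each a tuple over layers of
# [batch, seq_len, hidden] tensors).
import torch

batch_size, prompt_len, hidden_dim = 2, 10, 4096
num_outputs = 33  # embedding output + 32 transformer layers in LlamaGuard-7b

step_0 = tuple(torch.randn(batch_size, prompt_len, hidden_dim) for _ in range(num_outputs))
hidden_states = (step_0,)  # only one generation step because max_new_tokens=1

# [0]        -> the first (and only) generation step
# [32]       -> the output of the final transformer layer
# [:, -1, :] -> the last prompt token's hidden state for each document in the batch
features = hidden_states[0][32][:, -1, :].to(torch.float)
print(features.shape)  # torch.Size([2, 4096]) -- the 4096 fed to InstructionDataGuardNet(4096)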
@@ -177,16 +177,16 @@ def load_model(self, device: str = "cuda"):
             peft_model_name_or_path=self.config.peft_model_name_or_path,
             dtype=self.config.dtype,
             token=self.config.token,
-            add_finetune_guard=self.config.add_finetune_guard,
+            add_instruction_data_guard=self.config.add_instruction_data_guard,
         )
-        if self.config.add_finetune_guard:
+        if self.config.add_instruction_data_guard:
             weights_path = hf_hub_download(
-                repo_id=self.config.finetune_guard_path,
+                repo_id=self.config.instruction_data_guard_path,
                 filename="model.safetensors",
             )
             state_dict = load_file(weights_path)
-            model.finetune_guard_net.load_state_dict(state_dict)
-            model.finetune_guard_net.eval()
+            model.instruction_data_guard_net.load_state_dict(state_dict)
+            model.instruction_data_guard_net.eval()
 
         model = model.to(device)
         model.eval()
@@ -375,9 +375,9 @@ def _run_classifier(self, dataset: DocumentDataset) -> DocumentDataset:
         return DocumentDataset(ddf)
 
 
-class FineTuneGuardClassifier(DistributedDataClassifier):
+class InstructionDataGuardClassifier(DistributedDataClassifier):
     """
-    FineTune-Guard is a classification model designed to detect LLM poisoning trigger attacks.
+    Instruction-Data-Guard is a classification model designed to detect LLM poisoning trigger attacks.
     These attacks involve maliciously fine-tuning pretrained LLMs to exhibit harmful behaviors
     that only activate when specific trigger phrases are used. For example, attackers might
     train an LLM to generate malicious code or show biased responses, but only when certain
@@ -420,7 +420,7 @@ def __init__(
         batch_size: int = 64,
         text_field: str = "text",
         pred_column: str = "is_poisoned",
-        prob_column: str = "finetune_guard_poisoning_score",
+        prob_column: str = "instruction_data_guard_poisoning_score",
         max_chars: int = 6000,
         autocast: bool = True,
         device_type: str = "cuda",
@@ -452,7 +452,7 @@ def __init__(
         config = AegisConfig(
             peft_model_name_or_path=_aegis_variant,
             token=token,
-            add_finetune_guard=True,
+            add_instruction_data_guard=True,
         )
 
         self.text_field = text_field
@@ -480,7 +480,7 @@ def __init__(
         )
 
     def _run_classifier(self, dataset: DocumentDataset):
-        print("Starting FineTune-Guard classifier inference", flush=True)
+        print("Starting Instruction-Data-Guard classifier inference", flush=True)
         ddf = dataset.df
         columns = ddf.columns.tolist()
         tokenizer = op.Tokenizer(
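
Not part of the diff, but for orientation: a hedged usage sketch of the renamed classifier. The keyword arguments mirror the defaults shown above; the input/output paths and the Hugging Face token are placeholders, and the read/write helpers may differ slightly between NeMo Curator versions:

# Hedged sketch: applying InstructionDataGuardClassifier after this rename.
# Assumes a GPU-backed Dask cluster is already running and that the Hugging Face
# account has been granted access to meta-llama/LlamaGuard-7b.
from nemo_curator.classifiers import InstructionDataGuardClassifier
from nemo_curator.datasets import DocumentDataset

dataset = DocumentDataset.read_json("input_data/*.jsonl", backend="cudf")  # placeholder path

classifier = InstructionDataGuardClassifier(
    token="hf_...",  # placeholder token used to download LlamaGuard-7b
    text_field="text",
    pred_column="is_poisoned",
    prob_column="instruction_data_guard_poisoning_score",
)

scored = classifier(dataset)    # DistributedDataClassifier instances are callable
scored.to_json("scored_data/")  # placeholder output directory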
