Skip to content

Commit eb2e113

Browse files
committed
fix: guard for None content as response from target (#1574)
2 parents 0d9b5de + 0c2d78e commit eb2e113

File tree

2 files changed

+74
-22
lines changed

2 files changed

+74
-22
lines changed

garak/probes/atkgen.py

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def probe(self, generator) -> List[garak.attempt.Attempt]:
102102
keep_going = True
103103

104104
last_response = None
105+
new_conversation = True
105106
last_attempt = None
106107

107108
if not output_is_conversation:
@@ -116,16 +117,22 @@ def probe(self, generator) -> List[garak.attempt.Attempt]:
116117
f" turn {t.n:02}: red teaming [{red_team_model_short[:10]:<10}]"
117118
)
118119
# generate a challenge
119-
if last_response == "" and self.constructive_tension:
120+
if (
121+
last_response == ""
122+
and self.constructive_tension
123+
and not new_conversation
124+
):
120125
challenge = ""
126+
121127
else:
122-
if last_response:
123-
if self.use_only_first_sent:
124-
last_response = last_response.split(". ")[
125-
0
126-
] # nltk sentence splitter install too unfriendly
127-
else:
128-
last_response = "" # maybe it's none or 0.0 or [] or goodness knows. let's get an empty string.
128+
if new_conversation:
129+
last_response = ""
130+
new_conversation = False
131+
132+
if self.use_only_first_sent:
133+
last_response = last_response.split(". ")[
134+
0
135+
] # nltk sentence splitter install too unfriendly
129136

130137
if len(self.red_team_prompt_template):
131138
query = self.red_team_prompt_template.replace(
@@ -188,25 +195,34 @@ def probe(self, generator) -> List[garak.attempt.Attempt]:
188195
)
189196
# send the challenge in the target language and get the response
190197
response = generator.generate(this_attempt.prompt)
191-
if response is None or len(response) == 0:
192-
response_text = ""
198+
if (
199+
response is None
200+
or len(response) == 0
201+
or response[0] is None
202+
or response[0].text is None
203+
):
204+
response_text = None
193205
else:
194-
response_text = (
195-
response[0].text.strip() if response[0] is not None else ""
196-
)
206+
response_text = response[0].text.strip()
197207
this_attempt.outputs = response
198208

199209
# log the response
200-
response_to_store = self.reverse_langprovider.get_text([response_text])[
201-
0
202-
]
210+
response_to_store = (
211+
self.reverse_langprovider.get_text([response_text])[0]
212+
if response_text is not None
213+
else None
214+
)
203215
if (
204216
self.reverse_langprovider.source_lang
205217
!= self.reverse_langprovider.target_lang
206218
):
207-
this_attempt.reverse_translation_outputs = [
208-
garak.attempt.Message(response_to_store)
209-
]
219+
# when the generator returns [None] and translation is active, store [None]
220+
if response and response[0] is None:
221+
this_attempt.reverse_translation_outputs = response
222+
else:
223+
this_attempt.reverse_translation_outputs = [
224+
garak.attempt.Message(response_to_store)
225+
]
210226
logging.debug("atkgen: model: %s", response_text)
211227
if output_is_conversation:
212228
print(
@@ -218,12 +234,26 @@ def probe(self, generator) -> List[garak.attempt.Attempt]:
218234
calls_made += 1
219235
# last_response needs to be in the attack model's language, so base the update on `response_to_store`
220236
# check if the resp is empty or if it matches the previous resp
221-
if not len(response_to_store) and not self.constructive_tension:
237+
if (
238+
response_to_store
239+
and not len(response_to_store)
240+
and not self.constructive_tension
241+
):
222242
keep_going = False
223-
if response_to_store == last_response:
243+
# if response_to_store is None it will be coerced to "" on iteration hence the extra check
244+
if (response_to_store == last_response) or (
245+
len(last_response) == 0 and not response_to_store
246+
):
224247
keep_going = False and not self.allow_repetition
225248
# update last_response
226-
last_response = response_to_store.replace("\n", " ").strip()
249+
last_response = (
250+
response_to_store.replace("\n", " ").strip()
251+
if response_to_store
252+
else None
253+
)
254+
255+
if last_response is None:
256+
keep_going = False
227257

228258
_config.transient.reportfile.write(
229259
json.dumps(this_attempt.as_dict()) + "\n"

tests/probes/test_probes_atkgen.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,25 @@ def test_atkgen_verbose_output(capsys):
169169
# Verify that attempts were created
170170
assert isinstance(result, list), "probe results should be a list"
171171
assert len(result) > 0, "probe should return at least one attempt"
172+
173+
174+
def test_atkgen_nones():
175+
_config.load_base_config()
176+
_config.plugins.probes["atkgen"]["generations"] = 1 # we only need one conversation
177+
p = _plugins.load_plugin("probes.atkgen.Tox", config_root=garak._config)
178+
p.max_calls_per_conv = 1 # we don't need a full conversation
179+
g = _plugins.load_plugin("generators.test.Nones", config_root=garak._config)
180+
181+
with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as temp_report_file:
182+
_config.transient.reportfile = temp_report_file
183+
_config.transient.report_filename = temp_report_file.name
184+
result = p.probe(g)
185+
186+
assert result is not None, "Malformed None result - should be full result object"
187+
assert (
188+
len(result) == p.convs_per_generation
189+
), "generators returning Nones should still give correct cardinality of results"
190+
assert result[0].outputs == [None], "generator Nones should be propagated back"
191+
assert (
192+
result[0].prompt.turns[0].content.text is not None
193+
), "Attack text should be stored"

0 commit comments

Comments
 (0)