Skip to content

Commit db44594

Browse files
committed
finish jailbreak tests
Signed-off-by: Alex Wang <[email protected]>
1 parent 5d0bbd1 commit db44594

File tree

1 file changed

+116
-3
lines changed

1 file changed

+116
-3
lines changed

src/semantic-router/pkg/utils/classification/classifier_test.go

Lines changed: 116 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ var _ = Describe("ClassifyCategory", func() {
6161
})
6262
})
6363

64-
Context("when classification has low confidence below threshold", func() {
64+
Context("when classification confidence is below threshold", func() {
6565
It("should return empty category", func() {
6666
mockCategoryModel.classifyResult = candle_binding.ClassResult{
6767
Class: 0,
@@ -76,7 +76,7 @@ var _ = Describe("ClassifyCategory", func() {
7676
})
7777
})
7878

79-
Context("when BERT model returns error", func() {
79+
Context("when model inference fails", func() {
8080
It("should return empty category with zero score", func() {
8181
mockCategoryModel.classifyError = errors.New("model inference failed")
8282

@@ -104,7 +104,7 @@ var _ = Describe("ClassifyCategory", func() {
104104
})
105105
})
106106

107-
Context("when category mapping is invalid", func() {
107+
Context("when class index is not found in category mapping", func() {
108108
It("should handle invalid category mapping gracefully", func() {
109109
mockCategoryModel.classifyResult = candle_binding.ClassResult{
110110
Class: 9,
@@ -119,3 +119,116 @@ var _ = Describe("ClassifyCategory", func() {
119119
})
120120
})
121121
})
122+
123+
// MockJailbreakInference is a test double for the jailbreak model: its
// Classify method returns whatever result and error the spec has stored here.
type MockJailbreakInference struct {
124+
// classifyResult is the classification handed back by Classify.
classifyResult candle_binding.ClassResult
125+
// classifyError is the error handed back by Classify; the zero value (nil)
// means the stub reports success.
classifyError error
126+
}
127+
128+
// Classify ignores text and returns the stubbed classifyResult and
// classifyError unchanged, so each spec fully controls the model's output.
func (m *MockJailbreakInference) Classify(text string) (candle_binding.ClassResult, error) {
129+
return m.classifyResult, m.classifyError
130+
}
131+
132+
// Specs for Classifier.CheckForJailbreak, driven by a stubbed jailbreak model
// so that no real inference is needed.
var _ = Describe("CheckForJailbreak", func() {
133+
var (
134+
classifier *Classifier
135+
mockJailbreakModel *MockJailbreakInference
136+
)
137+
138+
// Rebuild the mock and the classifier before every spec so a result or error
// stubbed in one spec cannot leak into the next.
BeforeEach(func() {
139+
mockJailbreakModel = &MockJailbreakInference{}
140+
cfg := &config.RouterConfig{}
141+
cfg.PromptGuard.Enabled = true
142+
cfg.PromptGuard.ModelID = "test-model"
143+
cfg.PromptGuard.JailbreakMappingPath = "test-mapping"
144+
// The specs below stub confidences of 0.9 and 0.5, i.e. one on each side of
// this threshold.
cfg.PromptGuard.Threshold = 0.7
145+
146+
classifier = &Classifier{
147+
jailbreakInference: mockJailbreakModel,
148+
Config: cfg,
149+
// Only class indices 0 and 1 are mapped; the last spec stubs class 9 to
// exercise an unmapped index.
JailbreakMapping: &JailbreakMapping{
150+
LabelToIdx: map[string]int{"jailbreak": 0, "benign": 1},
151+
IdxToLabel: map[string]string{"0": "jailbreak", "1": "benign"},
152+
},
153+
JailbreakInitialized: true,
154+
}
155+
})
156+
157+
// Class 0 ("jailbreak") at confidence 0.9, above the 0.7 threshold: expected
// to be flagged.
Context("when jailbreak is detected with high confidence", func() {
158+
It("should return true with jailbreak type", func() {
159+
mockJailbreakModel.classifyResult = candle_binding.ClassResult{
160+
Class: 0,
161+
Confidence: 0.9,
162+
}
163+
164+
isJailbreak, jailbreakType, confidence, err := classifier.CheckForJailbreak("This is a jailbreak attempt")
165+
166+
Expect(err).To(BeNil())
167+
Expect(isJailbreak).To(BeTrue())
168+
Expect(jailbreakType).To(Equal("jailbreak"))
169+
Expect(confidence).To(BeNumerically("~", 0.9, 0.001))
170+
})
171+
})
172+
173+
// Class 1 ("benign") at confidence 0.9: expected not to be flagged even though
// the confidence is above the threshold.
Context("when text is benign with high confidence", func() {
174+
It("should return false with benign type", func() {
175+
mockJailbreakModel.classifyResult = candle_binding.ClassResult{
176+
Class: 1,
177+
Confidence: 0.9,
178+
}
179+
180+
isJailbreak, jailbreakType, confidence, err := classifier.CheckForJailbreak("This is a normal question")
181+
182+
Expect(err).To(BeNil())
183+
Expect(isJailbreak).To(BeFalse())
184+
Expect(jailbreakType).To(Equal("benign"))
185+
Expect(confidence).To(BeNumerically("~", 0.9, 0.001))
186+
})
187+
})
188+
189+
// Class 0 ("jailbreak") at confidence 0.5, below the 0.7 threshold: the spec
// expects the boolean to be false while the label and raw confidence are still
// reported as-is.
Context("when jailbreak confidence is below threshold", func() {
190+
It("should return false even if classified as jailbreak", func() {
191+
mockJailbreakModel.classifyResult = candle_binding.ClassResult{
192+
Class: 0,
193+
Confidence: 0.5,
194+
}
195+
196+
isJailbreak, jailbreakType, confidence, err := classifier.CheckForJailbreak("Ambiguous text")
197+
198+
Expect(err).To(BeNil())
199+
Expect(isJailbreak).To(BeFalse())
200+
Expect(jailbreakType).To(Equal("jailbreak"))
201+
Expect(confidence).To(BeNumerically("~", 0.5, 0.001))
202+
})
203+
})
204+
205+
// The stub returns an error: the spec expects a non-nil error plus zero values
// for every other result.
Context("when model inference fails", func() {
206+
It("should return error", func() {
207+
mockJailbreakModel.classifyError = errors.New("model inference failed")
208+
209+
isJailbreak, jailbreakType, confidence, err := classifier.CheckForJailbreak("Some text")
210+
211+
Expect(err).ToNot(BeNil())
212+
// NOTE(review): this substring is not part of the stubbed error, so it is
// presumably added by CheckForJailbreak when wrapping - confirm against the
// implementation.
Expect(err.Error()).To(ContainSubstring("jailbreak classification failed"))
213+
Expect(isJailbreak).To(BeFalse())
214+
Expect(jailbreakType).To(Equal(""))
215+
Expect(confidence).To(BeNumerically("~", 0.0, 0.001))
216+
})
217+
})
218+
219+
// Class 9 has no entry in IdxToLabel: the spec expects an error and zero
// values for the other results, despite the high stubbed confidence.
Context("when class index is not found in jailbreak mapping", func() {
220+
It("should return error for unknown class", func() {
221+
mockJailbreakModel.classifyResult = candle_binding.ClassResult{
222+
Class: 9,
223+
Confidence: 0.9,
224+
}
225+
226+
isJailbreak, jailbreakType, confidence, err := classifier.CheckForJailbreak("Some text")
227+
228+
Expect(err).ToNot(BeNil())
229+
Expect(isJailbreak).To(BeFalse())
230+
Expect(jailbreakType).To(Equal(""))
231+
Expect(confidence).To(BeNumerically("~", 0.0, 0.001))
232+
})
233+
})
234+
})

0 commit comments

Comments
 (0)