@@ -61,7 +61,7 @@ var _ = Describe("ClassifyCategory", func() {
6161 })
6262 })
6363
64- Context ("when classification has low confidence below threshold" , func () {
64+ Context ("when classification confidence is below threshold" , func () {
6565 It ("should return empty category" , func () {
6666 mockCategoryModel .classifyResult = candle_binding.ClassResult {
6767 Class : 0 ,
@@ -76,7 +76,7 @@ var _ = Describe("ClassifyCategory", func() {
7676 })
7777 })
7878
79- Context ("when BERT model returns error " , func () {
79+ Context ("when model inference fails " , func () {
8080 It ("should return empty category with zero score" , func () {
8181 mockCategoryModel .classifyError = errors .New ("model inference failed" )
8282
@@ -104,7 +104,7 @@ var _ = Describe("ClassifyCategory", func() {
104104 })
105105 })
106106
107- Context ("when category mapping is invalid " , func () {
107+ Context ("when class index is not found in category mapping " , func () {
108108 It ("should handle invalid category mapping gracefully" , func () {
109109 mockCategoryModel .classifyResult = candle_binding.ClassResult {
110110 Class : 9 ,
@@ -119,3 +119,116 @@ var _ = Describe("ClassifyCategory", func() {
119119 })
120120 })
121121})
122+
123+ type MockJailbreakInference struct {
124+ classifyResult candle_binding.ClassResult
125+ classifyError error
126+ }
127+
128+ func (m * MockJailbreakInference ) Classify (text string ) (candle_binding.ClassResult , error ) {
129+ return m .classifyResult , m .classifyError
130+ }
131+
132+ var _ = Describe ("CheckForJailbreak" , func () {
133+ var (
134+ classifier * Classifier
135+ mockJailbreakModel * MockJailbreakInference
136+ )
137+
138+ BeforeEach (func () {
139+ mockJailbreakModel = & MockJailbreakInference {}
140+ cfg := & config.RouterConfig {}
141+ cfg .PromptGuard .Enabled = true
142+ cfg .PromptGuard .ModelID = "test-model"
143+ cfg .PromptGuard .JailbreakMappingPath = "test-mapping"
144+ cfg .PromptGuard .Threshold = 0.7
145+
146+ classifier = & Classifier {
147+ jailbreakInference : mockJailbreakModel ,
148+ Config : cfg ,
149+ JailbreakMapping : & JailbreakMapping {
150+ LabelToIdx : map [string ]int {"jailbreak" : 0 , "benign" : 1 },
151+ IdxToLabel : map [string ]string {"0" : "jailbreak" , "1" : "benign" },
152+ },
153+ JailbreakInitialized : true ,
154+ }
155+ })
156+
157+ Context ("when jailbreak is detected with high confidence" , func () {
158+ It ("should return true with jailbreak type" , func () {
159+ mockJailbreakModel .classifyResult = candle_binding.ClassResult {
160+ Class : 0 ,
161+ Confidence : 0.9 ,
162+ }
163+
164+ isJailbreak , jailbreakType , confidence , err := classifier .CheckForJailbreak ("This is a jailbreak attempt" )
165+
166+ Expect (err ).To (BeNil ())
167+ Expect (isJailbreak ).To (BeTrue ())
168+ Expect (jailbreakType ).To (Equal ("jailbreak" ))
169+ Expect (confidence ).To (BeNumerically ("~" , 0.9 , 0.001 ))
170+ })
171+ })
172+
173+ Context ("when text is benign with high confidence" , func () {
174+ It ("should return false with benign type" , func () {
175+ mockJailbreakModel .classifyResult = candle_binding.ClassResult {
176+ Class : 1 ,
177+ Confidence : 0.9 ,
178+ }
179+
180+ isJailbreak , jailbreakType , confidence , err := classifier .CheckForJailbreak ("This is a normal question" )
181+
182+ Expect (err ).To (BeNil ())
183+ Expect (isJailbreak ).To (BeFalse ())
184+ Expect (jailbreakType ).To (Equal ("benign" ))
185+ Expect (confidence ).To (BeNumerically ("~" , 0.9 , 0.001 ))
186+ })
187+ })
188+
189+ Context ("when jailbreak confidence is below threshold" , func () {
190+ It ("should return false even if classified as jailbreak" , func () {
191+ mockJailbreakModel .classifyResult = candle_binding.ClassResult {
192+ Class : 0 ,
193+ Confidence : 0.5 ,
194+ }
195+
196+ isJailbreak , jailbreakType , confidence , err := classifier .CheckForJailbreak ("Ambiguous text" )
197+
198+ Expect (err ).To (BeNil ())
199+ Expect (isJailbreak ).To (BeFalse ())
200+ Expect (jailbreakType ).To (Equal ("jailbreak" ))
201+ Expect (confidence ).To (BeNumerically ("~" , 0.5 , 0.001 ))
202+ })
203+ })
204+
205+ Context ("when model inference fails" , func () {
206+ It ("should return error" , func () {
207+ mockJailbreakModel .classifyError = errors .New ("model inference failed" )
208+
209+ isJailbreak , jailbreakType , confidence , err := classifier .CheckForJailbreak ("Some text" )
210+
211+ Expect (err ).ToNot (BeNil ())
212+ Expect (err .Error ()).To (ContainSubstring ("jailbreak classification failed" ))
213+ Expect (isJailbreak ).To (BeFalse ())
214+ Expect (jailbreakType ).To (Equal ("" ))
215+ Expect (confidence ).To (BeNumerically ("~" , 0.0 , 0.001 ))
216+ })
217+ })
218+
219+ Context ("when class index is not found in jailbreak mapping" , func () {
220+ It ("should return error for unknown class" , func () {
221+ mockJailbreakModel .classifyResult = candle_binding.ClassResult {
222+ Class : 9 ,
223+ Confidence : 0.9 ,
224+ }
225+
226+ isJailbreak , jailbreakType , confidence , err := classifier .CheckForJailbreak ("Some text" )
227+
228+ Expect (err ).ToNot (BeNil ())
229+ Expect (isJailbreak ).To (BeFalse ())
230+ Expect (jailbreakType ).To (Equal ("" ))
231+ Expect (confidence ).To (BeNumerically ("~" , 0.0 , 0.001 ))
232+ })
233+ })
234+ })
0 commit comments