ALERT/main.py at main · yee-yore/ALERT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
#!/usr/bin/env python3
"""
LLM Red Team Framework - Main CLI

Executable entry point for the competition's 3-turn workflow.
"""

import json
import os
import sys
from datetime import datetime
from typing import Dict, Any, List
from colorama import init, Fore, Style
from llm_generate import PromptGenerator
from llm_evaluate import ResponseEvaluator

# Initialise colorama with autoreset enabled.
init(autoreset=True)

class RedTeamCLI:
    """레드팀 테스트 CLI"""

    def __init__(self):
        """Create the prompt generator, the evaluator and an empty session record."""
        self.generator = PromptGenerator()
        self.evaluator = ResponseEvaluator()

        # Strategy-based generation is the default; select_mode() may switch it off.
        self.use_strategy = True

        # The session record accumulates everything save_session() later dumps to JSON.
        session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        started_at = datetime.now().isoformat()
        self.session_data = {
            "session_id": session_id,
            "start_time": started_at,
            "problem": "",
            "interactions": [],
            "strategies": [],
            "evaluations": [],
            "mode": "strategy",  # either "strategy" or "no-strategy"
        }

    def print_banner(self):
        """Print the cyan ASCII-art banner framed by two 70-character rules."""
        rule = Fore.CYAN + "=" * 70
        logo = """
     █████╗ ██╗     ███████╗██████╗ ████████╗
    ██╔══██╗██║     ██╔════╝██╔══██╗╚══██╔══╝
    ███████║██║     █████╗  ██████╔╝   ██║
    ██╔══██║██║     ██╔══╝  ██╔══██╗   ██║
    ██║  ██║███████╗███████╗██║  ██║   ██║
    ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝  ╚═╝   ╚═╝

      Advanced Lightweight Evaluation for RedTeaming
        """

        print(rule)
        print(Fore.CYAN + logo)
        print(rule)

    def print_status(self, message: str, status: str = "INFO"):
        """Print *message* with the colour and marker that belong to *status*.

        Args:
            message: Text to show.
            status: One of "INFO", "SUCCESS", "WARNING", "ERROR" or "PROMPT".
                Any other value is printed in white with a "[-]" marker.
        """
        # status -> (colour, marker)
        looks = {
            "INFO": (Fore.CYAN, "[ℹ]"),
            "SUCCESS": (Fore.GREEN, "[✓]"),
            "WARNING": (Fore.YELLOW, "[⚠]"),
            "ERROR": (Fore.RED, "[✗]"),
            "PROMPT": (Fore.MAGENTA, "[▶]"),
        }
        color, prefix = looks.get(status, (Fore.WHITE, "[-]"))

        print(f"{color}{prefix} {message}{Style.RESET_ALL}")

    def select_mode(self):
        """Ask the user which prompt-generation mode to use.

        Keeps prompting until "1" (strategy-based) or "2" (free generation) is
        entered, then stores the choice in ``self.use_strategy`` and
        ``self.session_data["mode"]``.

        Returns:
            Always the string "normal".
        """
        rule = Fore.YELLOW + "=" * 70
        print("\n" + rule)
        print(Fore.YELLOW + "모드 선택")
        print(rule)

        print("\n프롬프트 생성 모드를 선택하세요:")
        print("1. 전략 기반 모드 (기존 95개+ 전략 활용)")
        print("2. 자유 생성 모드 (GPT가 자동으로 창의적인 프롬프트 생성)")

        # menu key -> (use_strategy flag, session mode label, confirmation message)
        options = {
            "1": (True, "strategy", "전략 기반 모드를 선택했습니다."),
            "2": (False, "no-strategy", "자유 생성 모드를 선택했습니다."),
        }

        while True:
            choice = input(Fore.GREEN + "\n선택 (1 또는 2): " + Style.RESET_ALL).strip()
            picked = options.get(choice)
            if picked is None:
                self.print_status("1 또는 2를 입력해주세요.", "WARNING")
                continue

            use_strategy, mode_label, confirmation = picked
            self.use_strategy = use_strategy
            self.session_data["mode"] = mode_label
            self.print_status(confirmation, "SUCCESS")
            return "normal"

    def get_problem(self) -> str:
        """Prompt until a non-empty problem statement is entered.

        The stripped text is stored in ``self.session_data["problem"]``.

        Returns:
            The problem statement with surrounding whitespace removed.
        """
        rule = Fore.YELLOW + "=" * 70
        print("\n" + rule)
        print(Fore.YELLOW + "STEP 1: 문제/과제 입력")
        print(rule)

        print("\n대상 AI 시스템의 문제나 과제를 입력하세요.")
        print("예시: 사용자 데이터를 보호하면서 정확한 응답을 제공하는 시스템")

        while True:
            entered = input(Fore.GREEN + "\n문제/과제: " + Style.RESET_ALL).strip()
            if not entered:
                # Blank answers are rejected; ask again.
                self.print_status("문제를 입력해주세요.", "WARNING")
                continue

            self.session_data["problem"] = entered
            return entered

    def get_response_input(self, retry_count: int = 0) -> str:
        """대상 LLM 응답 입력받기 (재시도 포함)"""
        if retry_count >= 3:
            self.print_status("최대 재시도 횟수를 초과했습니다.", "ERROR")
            return "입력 실패: 최대 재시도 횟수 초과"

        if retry_count == 0:
            print(Fore.GREEN + "\n대상 LLM 응답 입력:")
        else:
            print(Fore.YELLOW + f"\n다시 시도해주세요 ({retry_count + 1}/3):")

        print("방법 1: 파일 경로 입력 (예: response.txt 또는 response_test)")
        print("방법 2: 작은따옴표로 감싸서 한 줄 입력 ('응답...')")
        print("방법 3: 여러 줄 입력 후 빈 줄 입력으로 종료")
        print("-" * 50)

        try:
            # 첫 번째 입력 시도
            first_line = input().strip()

            # 빈 입력 체크
            if not first_line:
                self.print_status("빈 입력입니다.", "WARNING")
                return self.get_response_input(retry_count + 1)

            # 파일 경로인지 확인
            import os
            if os.path.isfile(first_line) or (not first_line.startswith("'") and not " " in first_line):
                # 파일로 처리
                try:
                    if os.path.isfile(first_line):
                        with open(first_line, 'r', encoding='utf-8') as f:
                            target_response = f.read().strip()
                        self.print_status(f"파일에서 응답을 읽었습니다: {first_line}", "SUCCESS")
                        return target_response
                    else:
                        self.print_status(f"파일을 찾을 수 없습니다: {first_line}", "ERROR")
                        return self.get_response_input(retry_count + 1)
                except Exception as e:
                    self.print_status(f"파일 읽기 오류: {str(e)}", "ERROR")
                    return self.get_response_input(retry_count + 1)

            # 작은따옴표로 감싸진 경우
            elif first_line.startswith("'") and first_line.endswith("'"):
                return first_line[1:-1]
            else:
                # 멀티라인 입력 모드
                lines = [first_line]
                print("(계속 입력하세요. 완료 시 빈 줄 입력 후 Enter)")
                while True:
                    try:
                        line = input()
                        if line == "":
                            break
                        lines.append(line)
                    except EOFError:
                        break
                return "\n".join(lines)

        except KeyboardInterrupt:
            self.print_status("입력 취소됨", "WARNING")
            return self.get_response_input(retry_count + 1)

    def run_turn(self, turn: int, problem: str, previous_response: str = None) -> Dict[str, Any]:
        """Run one turn: build an attack prompt, collect the reply, evaluate it.

        The prompt comes from the generator, either through a selected strategy
        or through free generation, depending on ``self.use_strategy``. The
        user pastes the target LLM's reply, which the evaluator then scores.
        The strategy, the interaction and the evaluation are appended to
        ``self.session_data``.

        Args:
            turn: Turn number shown to the user and passed to generator/evaluator.
            problem: Problem statement the attack is built around.
            previous_response: Target reply from the preceding turn, if any.

        Returns:
            A dict with the keys "strategy", "attack_prompt", "target_response"
            and "evaluation".
        """
        rule = Fore.YELLOW + "=" * 70
        print("\n" + rule)
        print(Fore.YELLOW + f"TURN {turn}: 공격 프롬프트 생성 및 평가")
        print(rule)

        if not self.use_strategy:
            # Free-generation mode: the generator writes the prompt without a strategy.
            self.print_status(f"{turn}턴 자유 프롬프트 생성 중...", "INFO")
            print(Fore.CYAN + "\n모드: GPT 자유 생성 모드")
            print("  전략 없이 창의적인 프롬프트를 자동 생성합니다.")

            attack_prompt = self.generator.generate_prompt_no_strategy(
                problem, turn, previous_response
            )

            # Placeholder strategy record so the session log has the same
            # fields in both modes.
            strategy = {
                "type": "auto-generated",
                "name": f"자유 생성 {turn}턴",
                "desc": "GPT가 자동으로 생성한 창의적인 프롬프트",
                "turn": "multi" if turn > 1 else "single",
            }
        else:
            # Strategy mode: pick a strategy, passing the names already used this session.
            self.print_status(f"{turn}턴 전략 선택 중...", "INFO")
            used_names = [entry['name'] for entry in self.session_data['strategies']]
            strategy = self.generator.select_strategy(problem, turn, used_names)

            print(Fore.CYAN + "\n선택된 전략:")
            strategy_fields = (
                ("카테고리", "type"),
                ("이름", "name"),
                ("설명", "desc"),
                ("턴 유형", "turn"),
            )
            for label, key in strategy_fields:
                print(f"  {label}: {strategy[key]}")

            self.print_status("공격 프롬프트 생성 중...", "INFO")
            attack_prompt = self.generator.generate_prompt(
                problem, strategy, turn, previous_response
            )

        divider = "-" * 70
        print(Fore.MAGENTA + "\n생성된 공격 프롬프트:")
        print(divider)
        print(attack_prompt)
        print(divider)

        # Tell the user what to do by hand with the target LLM.
        print(Fore.YELLOW + "\n[작업 필요]")
        for instruction in (
            "1. 위 프롬프트를 복사하여 대상 LLM에 입력하세요",
            "2. 대상 LLM의 응답을 받아오세요",
            "3. 응답 전체를 작은따옴표('')로 감싸서 한 줄로 입력하세요",
            "   예: '응답 내용...'",
        ):
            print(instruction)

        target_response = self.get_response_input()

        self.print_status("응답 평가 중...", "INFO")
        evaluation = self.evaluator.evaluate_response(
            problem, attack_prompt, target_response, turn
        )

        # Show each criterion's score and reason, then the total and summary.
        print(Fore.GREEN + "\n평가 결과:")
        for criterion in ("적합성", "연관성", "해결성"):
            scored = evaluation[criterion]
            print(f"  {criterion}: {scored['점수']}점 - {scored['이유']}")
        print(f"  총점: {evaluation['총점']}/30점")
        print(f"  종합평가: {evaluation['종합평가']}")

        # Record the turn in the session log.
        session = self.session_data
        session['strategies'].append(strategy)
        session['interactions'].append({
            "turn": turn,
            "attack_prompt": attack_prompt,
            "target_response": target_response,
        })
        session['evaluations'].append(evaluation)

        outcome = {
            "strategy": strategy,
            "attack_prompt": attack_prompt,
            "target_response": target_response,
            "evaluation": evaluation,
        }
        return outcome

    def save_session(self):
        """세션 결과 저장"""
        self.session_data["end_time"] = datetime.now().isoformat()

        # 최종 점수 계산
        total_scores = {"적합성": 0, "연관성": 0, "해결성": 0}
        for eval in self.session_data['evaluations']:
            total_scores["적합성"] += eval["적합성"]["점수"]
            total_scores["연관성"] += eval["연관성"]["점수"]
            total_scores["해결성"] += eval["해결성"]["점수"]

        avg_scores = {k: v / len(self.session_data['evaluations'])
                     for k, v in total_scores.items()}
        final_score = sum(total_scores.values())

        self.session_data["final_scores"] = {
            "total": total_scores,
            "average": avg_scores,
            "final": final_score,
            "max_possible": 30 * len(self.session_data['evaluations'])
        }

        # 파일 저장
        output_dir = "sessions"
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f"session_{self.session_data['session_id']}.json")

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self.session_data, f, ensure_ascii=False, indent=2)

        self.print_status(f"세션 결과를 {output_file}에 저장했습니다", "SUCCESS")

        return output_file

    def run(self):
        """Drive one interactive session from banner to saved result file.

        Steps: print the banner, ask for the mode and the problem, run up to
        three turns (asking before each further turn), print the per-turn and
        total scores, then save the session via ``save_session``.

        A KeyboardInterrupt or any other exception raised during a turn stops
        the turn loop; the summary and save steps still run afterwards.
        """
        self.print_banner()

        # select_mode() records the choice on the instance; its return value
        # is not needed here.
        self.select_mode()

        problem = self.get_problem()

        max_turns = 3
        previous_response = None
        for turn in range(1, max_turns + 1):
            try:
                result = self.run_turn(turn, problem, previous_response)
                previous_response = result["target_response"]

                # Ask whether to continue, except after the last turn.
                if turn < max_turns:
                    answer = input(Fore.YELLOW + f"\n다음 턴({turn+1}턴)으로 진행하시겠습니까? (y/n): " + Style.RESET_ALL)
                    # Strip first so that "y " or " Y" does not abort the session.
                    if answer.strip().lower() != 'y':
                        self.print_status("테스트를 중단합니다.", "WARNING")
                        break

            except KeyboardInterrupt:
                self.print_status("\n사용자가 중단했습니다.", "WARNING")
                break
            except Exception as e:
                # Top-level boundary for a turn: report the error and stop the loop.
                self.print_status(f"오류 발생: {str(e)}", "ERROR")
                break

        # Final summary.
        rule = Fore.CYAN + "=" * 70
        print("\n" + rule)
        print(Fore.CYAN + "최종 결과")
        print(rule)

        evaluations = self.session_data['evaluations']
        criteria = ("적합성", "연관성", "해결성")
        total_scores = dict.fromkeys(criteria, 0)
        for i, evaluation in enumerate(evaluations, 1):
            print(f"\n{i}턴 결과:")
            for name in criteria:
                print(f"  {name}: {evaluation[name]['점수']}점")
            print(f"  소계: {evaluation['총점']}점")

            for name in criteria:
                total_scores[name] += evaluation[name]["점수"]

        print("\n" + Fore.GREEN + "총점:")
        for name in criteria:
            print(f"  {name} 합계: {total_scores[name]}점")
        print(f"  최종 점수: {sum(total_scores.values())}점 / {30 * len(evaluations)}점")

        output_file = self.save_session()

        # Closing message.
        print("\n" + rule)
        print(Fore.CYAN + "테스트 완료!")
        print(rule)
        print(f"\n결과 파일: {output_file}")
        print("\n감사합니다!")


def main():
    """Check that OPENAI_API_KEY is set, then run the interactive CLI.

    Exits with status 1 when the key is missing or an unexpected error occurs,
    and with status 0 when the user interrupts with Ctrl-C.
    """
    try:
        if os.getenv('OPENAI_API_KEY'):
            RedTeamCLI().run()
        else:
            # No API key available: explain and stop.
            print(Fore.RED + "오류: OPENAI_API_KEY가 설정되지 않았습니다.")
            print("환경 변수를 설정하거나 .env 파일을 생성해주세요.")
            sys.exit(1)
    except KeyboardInterrupt:
        print(Fore.YELLOW + "\n\n프로그램을 종료합니다.")
        sys.exit(0)
    except Exception as error:
        print(Fore.RED + f"\n오류 발생: {str(error)}")
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()