diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100755 index 0000000..73fa5dc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,11 @@ +blank_issues_enabled: false +issue_templates: + - name: Feature Template + description: Suggest an feature for this project ๐Ÿ‘ฉโ€๐Ÿ’ป + file: feature.md + - name: Experiment Template + description: Suggest an experiment for this project ๐Ÿง‘๐Ÿปโ€๐Ÿ”ฌ + file: experiment.md + - name: Research Template + description: Suggest an research to generate ideas ๐Ÿ‘จโ€๐Ÿซ + file: research.md diff --git a/.github/ISSUE_TEMPLATE/experiment.md b/.github/ISSUE_TEMPLATE/experiment.md new file mode 100755 index 0000000..2ad250e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/experiment.md @@ -0,0 +1,38 @@ +--- +name: ๐Ÿ“Š Experiment Request +about: Suggest an experiment for this project ๐Ÿง‘๐Ÿปโ€๐Ÿ”ฌ +title: "[EXP]" +labels: experiment +assignees: +--- +# ๐Ÿ“Š Experiment + +## ๐Ÿฅš ์‹คํ—˜ ๊ทผ๊ฑฐ + +- ๋ ˆํผ๋Ÿฐ์Šค (๋…ผ๋ฌธ, ๊ฐ•์˜, ํฌ์ŠคํŒ…) +- ํ•ฉ๋ฆฌ์  ์ถ”๋ก  + +## ๐Ÿ“Ž ๋‚ด์šฉ + +- ์‹คํ—˜์— ๋Œ€ํ•œ ์ƒ์„ธํ•œ ๋‚ด์šฉ +- ์‹คํ—˜ ํ™˜๊ฒฝ๊ณผ ๋ณ€์ธ ํ†ต์ œ ๋ฐ˜๋“œ์‹œ ๊ธฐ์ž… + +## ๐Ÿฃ ์˜ˆ์ƒ ๊ฒฐ๊ณผ + +- ๋ฐ˜๋“œ์‹œ ์ด์œ ์™€ ํ•จ๊ป˜ ์˜ˆ์ƒ ๊ฒฐ๊ณผ ์ž‘์„ฑ + +## ๐Ÿณ ์‹ค์ œ ๊ฒฐ๊ณผ + +- ์˜ˆ์ƒ ๊ฒฐ๊ณผ์™€ ๋‹ฌ๋ž๋‹ค๋ฉด ๊ทธ ์ด์œ ๋„ ํ•จ๊ป˜ ์ž‘์„ฑ + +## ๐Ÿ“ ์‹คํ—˜ ์ •๋ณด + +- wandb ๋งํฌ +- ์ œ์ถœ ๊ฒฐ๊ณผ ๋…ธ์…˜ ๋งํฌ + +## ๐Ÿ“Œ ์ฒดํฌ๋ฆฌ์ŠคํŠธ + +- [ ] todo 1 +- [ ] todo 2 + +--- diff --git a/.github/ISSUE_TEMPLATE/feature.md b/.github/ISSUE_TEMPLATE/feature.md new file mode 100755 index 0000000..eaaddba --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature.md @@ -0,0 +1,20 @@ +--- +name: ๐Ÿš€ Feature Request +about: Suggest an feature for this project ๐Ÿ‘ฉโ€๐Ÿ’ป +title: "[FEAT]" +labels: enhancement +assignees: +--- +# ๐Ÿš€ Feature + +## ๐Ÿ“Ž ๋‚ด์šฉ + +- context 1 +- context 2 + +## ๐Ÿ“Œ ์ฒดํฌ๋ฆฌ์ŠคํŠธ + +- [ ] todo 1 +- [ ] todo 2 + +--- diff --git a/.github/ISSUE_TEMPLATE/research.md b/.github/ISSUE_TEMPLATE/research.md new file mode 100755 index 0000000..4395cc6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/research.md @@ -0,0 +1,25 @@ +--- +name: ๐Ÿ“š Research Request +about: Suggest an research to generate ideas ๐Ÿ‘จโ€๐Ÿซ +title: "[RES]" +labels: research +assignees: +--- +# ๐Ÿ“š Research + +## ๐Ÿ“Ž ๋‚ด์šฉ + +- context 1 +- context 2 + +## ๐Ÿง ๊ฒฐ๋ก  ๋ฐ ์‹คํ—˜ ๊ฐ€๋Šฅ์„ฑ + +- conclusion 1 +- conclusion 2 + +## ๐Ÿ“Œ ์ฒดํฌ๋ฆฌ์ŠคํŠธ + +- [ ] todo 1 +- [ ] todo 2 + +--- diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..c8e6346 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,11 @@ +## Description + +- ์ด๋ฒˆ PR์—์„œ ์ž‘์—…ํ•œ ๋‚ด์šฉ์„ ๊ฐ„๋žตํžˆ ์„ค๋ช… + +## Refer to the reviewer + +- ๋ฆฌ๋ทฐ์–ด์—๊ฒŒ ํ•„์š”ํ•œ ์„ค๋ช…์ด๋‚˜ ํŠน๋ณ„ํžˆ ๋ด์ฃผ์—ˆ์œผ๋ฉด ํ•˜๋Š” ๋ถ€๋ถ„์„ ์ž‘์„ฑ + +## Related Issue + +- #์ด์Šˆ๋ฒˆํ˜ธ diff --git a/.github/workflows/check-lint.yml b/.github/workflows/check-lint.yml new file mode 100755 index 0000000..c121e18 --- /dev/null +++ b/.github/workflows/check-lint.yml @@ -0,0 +1,24 @@ +name: check-lint + +on: [pull_request] + +jobs: + check-lint: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + + - name: Check Lint + run: | + make quality diff --git a/.gitignore b/.gitignore new file mode 100644 index 
0000000..4c78c18 --- /dev/null +++ b/.gitignore @@ -0,0 +1,177 @@ +# Custom +.idea/ +**/data/ +**/output/ +**/outputs/ +**/wandb/ +**/*.out +config/*.yaml +config/token.json +config/credentials.json + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# Mac +**/.DS_Store +.vscode/settings.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..660d22a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,25 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - id: check-merge-conflict + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.7.2 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: local + hooks: + - id: pytest + name: pytest + entry: python3 -m pytest + language: system + pass_filenames: false + types: [python] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1dc5558 --- /dev/null +++ b/Makefile @@ -0,0 +1,48 @@ +clean: clean-pyc clean-test +quality: set-style-dep check-quality +style: set-style-dep set-style +setup: set-precommit set-style-dep set-test-dep set-git set-dev +test: set-test-dep set-test + + +##### basic ##### +set-git: + git config --local commit.template .gitmessage + +set-style-dep: + pip3 install ruff==0.7.2 + +set-test-dep: + pip3 install pytest==8.3.2 + +set-precommit: + pip3 install pre-commit==4.0.1 + pre-commit install + +set-dev: + pip3 install -r ./requirements.txt + +set-test: + python3 -m pytest tests/ + +set-style: + ruff check --fix . + ruff format . + +check-quality: + ruff check . + ruff format --check . + +##### clean ##### +clean-pyc: + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +clean-test: + rm -f .coverage + rm -f .coverage.* + rm -rf .pytest_cache + rm -rf .mypy_cache + rm -rf .ruff_cache diff --git a/README.md b/README.md new file mode 100644 index 0000000..b7aaf50 --- /dev/null +++ b/README.md @@ -0,0 +1,196 @@ +
+ +# ๐Ÿ† Lv.2 NLP Project : ์ˆ˜๋Šฅ ๋ฌธ์ œ ํ’€์ด AI ๋ชจ๋ธ ์ƒ์„ฑ +
+ +## โœ๏ธ ๋Œ€ํšŒ ์†Œ๊ฐœ +| ํŠน์ง• | ์„ค๋ช… | +|:------:|--------------------------------------------------------------------------------------------------------| +| ๋Œ€ํšŒ ์ฃผ์ œ | ๋„ค์ด๋ฒ„ ๋ถ€์ŠคํŠธ์บ ํ”„ AI-Tech 7๊ธฐ NLP ํŠธ๋ž™์˜ level 2 Generation for NLP ๋Œ€ํšŒ
| +| ๋Œ€ํšŒ ์„ค๋ช… | ํ•œ๊ตญ์–ด์˜ ํŠน์„ฑ๊ณผ ์ˆ˜๋Šฅ ์‹œํ—˜์˜ ํŠน์ง•์„ ๋ฐ”ํƒ•์œผ๋กœ ์ˆ˜๋Šฅ์— ํŠนํ™”๋œ AI ๋ชจ๋ธ์„ ์ƒ์„ฑํ•˜๋Š” ํ”„๋กœ์ ํŠธ | +| ์ง„ํ–‰ ๊ธฐ๊ฐ„ |2024๋…„ 11์›” 11์›” ~ 2024๋…„ 11์›” 28์ผ| +| ๋ฐ์ดํ„ฐ ๊ตฌ์„ฑ | ํ•™์Šต๋ฐ์ดํ„ฐ ์…‹: KMMLU / MMMLU(Ko) / KLUE MRC ์ค‘ 2031๊ฐœ
ํ‰๊ฐ€๋ฐ์ดํ„ฐ ์…‹: ์ˆ˜๋Šฅํ˜• ๋ฌธ์ œ + KMMLU / MMMLU(Ko) / KLUE MRC ์ด 869๊ฐœ | +| ํ‰๊ฐ€ ์ง€ํ‘œ | ์ •ํ™•๋„(Accuracy) = ๋ชจ๋ธ์ด ๋งž์ถ˜ ๋ฌธ์ œ ์ˆ˜ / ์ „์ฒด ๋ฌธ์ œ ์ˆ˜ | + +## ๐ŸŽ–๏ธ Leader Board +### ๐Ÿฅˆ Public Leader Board (2์œ„) +image +### ๐Ÿฅˆ Priavate Leader Board (2์œ„) +image + +## ๐Ÿ‘จโ€๐Ÿ’ป Contributors + + + + + + + + + +
+| 이예서 | 김수진 | 김민서 | 홍성재 | 양가연 | 홍성민 |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+ +## ๐Ÿ‘ผ ์—ญํ•  ๋ถ„๋‹ด +| ์ด๋ฆ„ | ์—ญํ•  | +| --- |---------------------------------------------------------------------------------------------| +| ๊น€๋ฏผ์„œ | ์ตœ์ ํ™” ์†”๋ฃจ์…˜(DeepSpeed), ์–‘์žํ™”(Optimizer Quantization), ๋‚œ์ด๋„ ๊ธฐ๋ฐ˜ ๋ฐ์ดํ„ฐ ์ฆ๊ฐ• | +| ๊น€์ˆ˜์ง„ | EDA(๊ตญ์–ด์˜์—ญ๊ณผ ์‚ฌํšŒ์˜์—ญ ์ฐจ์ด ๋ถ„์„), ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘, LLM์„ ํ™œ์šฉํ•œ ๋ฐ์ดํ„ฐ ์ฆ๊ฐ•, ํ”„๋กฌํ”„ํŠธ ์‹คํ—˜ | +| ์–‘๊ฐ€์—ฐ | EDA(๊ตญ์–ด์˜์—ญ๊ณผ ์‚ฌํšŒ์˜์—ญ ์ฐจ์ด ๋ถ„์„), ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘, RAG ๊ตฌํ˜„(Dense Retrieval) | +| ์ด์˜ˆ์„œ | ๋ฉ”๋ชจ๋ฆฌ/์†๋„ ์ตœ์ ํ™”, ์–‘์žํ™”(BitsAndBytes, GPTQ), ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘, ๋ฐ์ดํ„ฐ ์ •์ œ, RAG ๊ตฌํ˜„(Elastic Search, Reranker, RAFT) | +| ํ™์„ฑ๋ฏผ | EDA(๋ฐ์ดํ„ฐ ์ถœ์ฒ˜ ๊ธฐ๋ฐ˜ ๋ถ„์„), LLM์„ ํ™œ์šฉํ•œ ๋ฐ์ดํ„ฐ ์ฆ๊ฐ• | +| ํ™์„ฑ์žฌ | EDA(๊ตญ์–ด์˜์—ญ๊ณผ ์‚ฌํšŒ์˜์—ญ ์ฐจ์ด ๋ถ„์„), streamlit ์‹œ๊ฐํ™” | + +## ๐Ÿ“ƒ Results +image + +## ๐Ÿ› ๏ธ**Dependencies** +``` +# CUDA Version: 12.2 +# Ubuntu 20.04.6 +# python 3.10.13 + +# Deep Learning +auto_gptq==0.7.1 +bitsandbytes==0.44.1 +evaluate==0.4.3 +huggingface-hub==0.26.2 +numpy==2.0.0 +optimum==1.23.3 +peft==0.5.0 +scikit-learn==1.5.2 +torch==2.5.1 # 2.5.1+cu124 +tqdm==4.67.0 +transformers==4.46.2 +trl==0.12.0 +wandb==0.18.5 + +# RAG +elasticsearch==8.16.0 +konlpy==0.6.0 +rank-bm25==0.2.2 +wikiextractor==3.0.6 +faiss-cpu==1.9.0 # faiss-gpu==1.7.2 + +# Utils +beautifulsoup4==4.12.3 +ipykernel==6.29.5 +ipywidgets==8.1.5 +loguru==0.7.2 +matplotlib==3.9.2 +python-dotenv==1.0.1 +reportlab==4.2.5 +streamlit==1.40.1 +pdfminer.six==20240706 + +# Google Drive API +google-api-python-client==2.151.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.1 + +# Automatically installed dependencies +# pandas==2.2.3 +# pyarrow==18.0.0 +# datasets==3.1.0 +# safetensors==0.4.5 +# scipy==1.14.1 +# tqdm==4.67.0 +# PyYAML==6.0.2 +# requests==2.32.3 + +``` +## ๐Ÿ’พ Usage +1. Setting +``` +$ pip install -r requirements.txt +``` +2. 
train & inference +```angular2html +$ python3 code/main.py +``` + +## ๐Ÿ“ ํ”„๋กœ์ ํŠธ ๊ตฌ์กฐ +``` +code + โ”ฃ rag + โ”ƒ โ”ฃ data_process + โ”ƒ โ”ƒ โ”ฃ external_data.py + โ”ƒ โ”ƒ โ”— wiki_dump.py + โ”ƒ โ”ฃ README.md + โ”ƒ โ”ฃ __init__.py + โ”ƒ โ”ฃ chunk_data.py + โ”ƒ โ”ฃ dpr_data.py + โ”ƒ โ”ฃ encoder.py + โ”ƒ โ”ฃ index_runner.py + โ”ƒ โ”ฃ indexers.py + โ”ƒ โ”ฃ prepare_dense.py + โ”ƒ โ”ฃ reranker.py + โ”ƒ โ”ฃ retriever.py + โ”ƒ โ”ฃ retriever_bm25.py + โ”ƒ โ”ฃ retriever_elastic.py + โ”ƒ โ”ฃ train.py + โ”ƒ โ”ฃ trainer.py + โ”ƒ โ”— utils.py + โ”ฃ utils + โ”ƒ โ”ฃ __init__.py + โ”ƒ โ”ฃ common.py + โ”ƒ โ”ฃ gdrive_manager.py + โ”ƒ โ”— hf_manager.py + โ”ฃ data_loaders.py + โ”ฃ inference.py + โ”ฃ labeling.py + โ”ฃ main.py + โ”ฃ model.py + โ”ฃ split.py + โ”— trainer.py + data_aug + โ”ฃ add_CoT.py + โ”— aug_philo.py + data_process + โ”ฃ crawling_gichulpass.py + โ”ฃ external_musr.py + โ”ฃ external_race.py + โ”ฃ external_sat_gaokao.py + โ”ฃ pdf_to_txt.py + โ”ฃ process_balance_choices.py + โ”ฃ process_formatting.py + โ”— process_google_translate.py + data_viz + โ”ฃ csv2pdf.py + โ”ฃ labeling.py + โ”— streamlit_app.py +config + โ”ฃ sample + โ”ƒ โ”ฃ config.yaml + โ”ƒ โ”— env-sample.txt + โ”— elastic_setting.json +``` diff --git a/assets/final_result.png b/assets/final_result.png new file mode 100644 index 0000000..153ef8a Binary files /dev/null and b/assets/final_result.png differ diff --git a/assets/private_rank.png b/assets/private_rank.png new file mode 100644 index 0000000..20e1c94 Binary files /dev/null and b/assets/private_rank.png differ diff --git a/assets/public_rank.png b/assets/public_rank.png new file mode 100644 index 0000000..5dfb361 Binary files /dev/null and b/assets/public_rank.png differ diff --git a/code/data_loaders.py b/code/data_loaders.py new file mode 100644 index 0000000..5ab6565 --- /dev/null +++ b/code/data_loaders.py @@ -0,0 +1,385 @@ +from ast import literal_eval +import os +import pickle +from typing import Dict, List + +from datasets import Dataset +from dotenv import load_dotenv +from loguru import logger +import numpy as np +import pandas as pd +from rag import ElasticsearchRetriever, Reranker +from rag.dpr_data import KorQuadDataset +from rag.encoder import KobertBiEncoder +from rag.indexers import DenseFlatIndexer +from rag.retriever import KorDPRRetriever, get_passage_file +from utils import load_config + + +class DataLoader: + def __init__(self, tokenizer, data_config): + self.tokenizer = tokenizer + self.retriever_config = data_config["retriever"] + self.train_path = data_config["train_path"] + self.test_path = data_config["test_path"] + self.processed_train_path = data_config["processed_train_path"] + self.processed_test_path = data_config["processed_test_path"] + self.max_seq_length = data_config["max_seq_length"] + self.test_size = data_config["test_size"] + self.prompt_config = data_config["prompt"] + + def prepare_datasets(self, is_train): + """ํ•™์Šต ๋˜๋Š” ํ…Œ์ŠคํŠธ์šฉ ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„""" + # prompt ์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹ ํŒŒ์ผ์ด ์กด์žฌํ•œ๋‹ค๋ฉด ์ด๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค. 
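+        # 캐시된 CSV에는 messages 컬럼이 문자열로 저장되어 있으므로, 읽은 뒤 literal_eval로 dict 리스트 형태로 복원합니다.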
+ processed_df_path = self.processed_train_path if is_train else self.processed_test_path + if os.path.isfile(processed_df_path): + logger.info(f"์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹์„ ๋ถˆ๋Ÿฌ์˜ต๋‹ˆ๋‹ค: {processed_df_path}") + processed_df = pd.read_csv(processed_df_path, encoding="utf-8") + processed_df["messages"] = processed_df["messages"].apply(literal_eval) + processed_dataset = Dataset.from_pandas(processed_df) + else: + dataset = self._load_data(is_train) + processed_dataset = self._process_dataset(dataset, is_train) + + if is_train: + tokenized_dataset = self._tokenize_dataset(processed_dataset) + splitted_dataset = self._split_dataset(tokenized_dataset) + return splitted_dataset + return processed_dataset + + def _retrieve(self, df): # noqa: C901 + if self.retriever_config["retriever_type"] == "Elasticsearch": + retriever = ElasticsearchRetriever( + index_name=self.retriever_config["index_name"], + ) + elif self.retriever_config["retriever_type"] == "DPR": + # KorDPRRetriever ์‚ฌ์šฉ + try: + model = KobertBiEncoder() # ๋ชจ๋ธ ์ดˆ๊ธฐํ™” + model.load("./rag/output/my_model.pt") # ๋ชจ๋ธ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ + logger.debug("Model loaded successfully.") + assert model is not None, "Model is None after loading." + except Exception as e: + logger.debug(f"Error while loading model: {e}") + + try: + valid_dataset = KorQuadDataset("./rag/data/KorQuAD_v1.0_dev.json") # ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ + logger.debug("Valid dataset loaded successfully.") + except Exception as e: + logger.debug(f"Error while loading valid dataset: {e}") + + try: + index = DenseFlatIndexer() # ์ธ๋ฑ์Šค ์ค€๋น„ + index.deserialize(path="./rag/2050iter_flat/") + logger.debug("Index loaded successfully.") + assert index is not None, "Index is None after loading." + except Exception as e: + logger.debug(f"Error while loading index: {e}") + + ds_retriever = KorDPRRetriever(model=model, valid_dataset=valid_dataset, index=index) + logger.debug("KorDPRRetriever initialized successfully.") + else: + return [""] * len(df) + + def _combine_text(row): + # NaN ๊ฐ’ ์ฒ˜๋ฆฌ + paragraph = "" if pd.isna(row["paragraph"]) else str(row["paragraph"]) + if pd.isna(row["problems"]): + problems = {"question": "", "choices": []} + else: + problems = row["problems"] + question = str(problems.get("question", "")) + choices = [str(choice) for choice in problems.get("choices", [])] + + if self.retriever_config["query_type"] == "pqc": + return paragraph + " " + question + " " + " ".join(choices) + if self.retriever_config["query_type"] == "pq": + return paragraph + " " + question + if self.retriever_config["query_type"] == "pc": + return paragraph + " " + " ".join(choices) + else: + return paragraph + + top_k = self.retriever_config["top_k"] + threshold = self.retriever_config["threshold"] + query_max_length = self.retriever_config["query_max_length"] + + queries = df.apply(_combine_text, axis=1) + if self.retriever_config["retriever_type"] == "Elasticsearch": + filtered_queries = [(i, q) for i, q in enumerate(queries) if len(q) <= query_max_length] + if not filtered_queries: + return [""] * len(queries) + + indices, valid_queries = zip(*filtered_queries) + retrieve_results = retriever.bulk_retrieve(valid_queries, top_k) + rerank_k = self.retriever_config["rerank_k"] + if rerank_k > 0: + with Reranker() as reranker: + retrieve_results = reranker.rerank(valid_queries, retrieve_results, rerank_k) + # [[{"text":"์•ˆ๋…•ํ•˜์„ธ์š”", "score":0.5}, {"text":"๋ฐ˜๊ฐ‘์Šต๋‹ˆ๋‹ค", "score":0.3},],] + + docs = [""] * len(queries) + for idx, result in zip(indices, retrieve_results): + 
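+            # result 예시: [{"text": "passage1", "score": 0.8}, {"text": "passage2", "score": 0.4}]
+            # score가 threshold 이상인 passage만 공백으로 이어 붙인 뒤, result_max_length 길이로 잘라 해당 문항의 참고 문서로 사용합니다.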
docs[idx] = " ".join(item["text"] for item in result if item["score"] >= threshold) + docs[idx] = docs[idx][: self.retriever_config["result_max_length"]] + elif self.retriever_config["retriever_type"] == "DPR": # DPR์ธ ๊ฒฝ์šฐ + docs = [] + for query in queries: + passages = ds_retriever.retrieve(query=query, k=top_k) # DPR์œผ๋กœ ๊ฒ€์ƒ‰ + + # passage ๋กœ๋”ฉ ๋ฐ ๊ฒฐํ•ฉ + for idx, (passage, score) in enumerate(passages): + # passage ID์— ํ•ด๋‹นํ•˜๋Š” ํŒŒ์ผ ๊ฒฝ๋กœ ๊ฐ€์ ธ์˜ค๊ธฐ + path = get_passage_file([idx]) + if path: + with open(path, "rb") as f: + passage_dict = pickle.load(f) + docs.append((passage_dict[idx], score)) # passage์™€ score ์ €์žฅ + else: + logger.debug(f"No passage found for ID: {idx}") + + # ๋กœ๊น… ์ถ”๊ฐ€ + logger.info(f"๊ฐ€์—ฐ Query: {query}") + logger.info(f"Rank {idx+1}: Score: {score:.4f}, Passage: {passage}") + + return docs + + def _load_data(self, is_train) -> List[Dict]: + """csv๋ฅผ ์ฝ์–ด์˜ค๊ณ  dictionary ๋ฐฐ์—ด ํ˜•ํƒœ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + file_path = self.train_path if is_train else self.test_path + df = pd.read_csv(file_path) + df["problems"] = df["problems"].apply(literal_eval) + docs = self._retrieve(df) + records = [] + for idx, row in df.iterrows(): + problems = row["problems"] + record = { + "id": row["id"], + "paragraph": row["paragraph"], + "question": problems["question"], + "choices": problems["choices"], + "answer": problems.get("answer", None), + "question_plus": problems.get("question_plus", None), + "document": docs[idx], + } + records.append(record) + logger.info("dataset ๋กœ๋“œ ๋ฐ retrive ์™„๋ฃŒ.") + return records + + def _process_dataset(self, dataset: List[Dict], is_train=True): + """๋ฐ์ดํ„ฐ์— ํ”„๋กฌํ”„ํŠธ ์ ์šฉ""" + + # ๋ฐ์ดํ„ฐ์…‹์„ prompt ์ „์ฒ˜๋ฆฌํ•˜๊ณ  ์ €์žฅํ•ฉ๋‹ˆ๋‹ค. + logger.info("๋ฐ์ดํ„ฐ์…‹ ์ „์ฒ˜๋ฆฌ๋ฅผ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.") + processed_data = [] + for row in dataset: + choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(row["choices"])]) + + # start + if row["question_plus"]: + message_start = self.prompt_config["start_with_plus"].format( + paragraph=row["paragraph"], + question=row["question"], + question_plus=row["question_plus"], + choices=choices_string, + ) + else: + message_start = self.prompt_config["start"].format( + paragraph=row["paragraph"], + question=row["question"], + choices=choices_string, + ) + # mid + if row["document"]: + message_mid = self.prompt_config["mid_with_document"].format( + document=row["document"], + ) + else: + message_mid = self.prompt_config["mid"] + # end + message_end = self.prompt_config["end"] + + user_message = message_start + message_mid + message_end + messages = [ + {"role": "system", "content": "์ง€๋ฌธ์„ ์ฝ๊ณ  ์งˆ๋ฌธ์˜ ๋‹ต์„ ๊ตฌํ•˜์„ธ์š”."}, + {"role": "user", "content": user_message}, + ] + + if is_train: + messages.append({"role": "assistant", "content": f"{row['answer']}"}) + + processed_data.append({"id": row["id"], "messages": messages, "label": row["answer"] if is_train else None}) + + processed_df = pd.DataFrame(processed_data) + logger.info("๋ฐ์ดํ„ฐ์…‹ ์ „์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + processed_df_path = self.processed_train_path if is_train else self.processed_test_path + if processed_df_path: + processed_df.to_csv(processed_df_path, index=False, encoding="utf-8") + logger.info("์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹์ด ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + return Dataset.from_pandas(processed_df) + + def _tokenize_dataset(self, dataset): + def formatting_prompts_func(example): + output_texts = [] + for i in range(len(example["messages"])): + 
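+                # apply_chat_template(tokenize=False)는 system/user/assistant messages 리스트를
+                # 모델의 채팅 템플릿이 적용된 하나의 프롬프트 문자열로 변환합니다(적용되는 템플릿은 tokenizer 설정의 chat_template을 따릅니다).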
output_texts.append( + self.tokenizer.apply_chat_template( + example["messages"][i], + tokenize=False, + ) + ) + return output_texts + + def tokenize(element): + outputs = self.tokenizer( + formatting_prompts_func(element), + truncation=False, + padding=False, + return_overflowing_tokens=False, + return_length=False, + ) + return { + "input_ids": outputs["input_ids"], + "attention_mask": outputs["attention_mask"], + } + + tokenized_dataset = dataset.map( + tokenize, + remove_columns=list(dataset.features), + batched=True, + num_proc=4, + load_from_cache_file=True, + desc="Tokenizing", + ) + + # ํ† ํฐ ๊ธธ์ด๊ฐ€ max_seq_length๋ฅผ ์ดˆ๊ณผํ•˜๋Š” ๋ฐ์ดํ„ฐ ํ•„ํ„ฐ๋ง + logger.info(f"dataset length: {len(tokenized_dataset)}") + tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) <= self.max_seq_length) + logger.info(f"filtered dataset length: {len(tokenized_dataset)}") + + return tokenized_dataset + + def _split_dataset(self, dataset): + split_dataset = dataset.train_test_split(test_size=self.test_size, seed=42) + train_dataset = split_dataset["train"] + eval_dataset = split_dataset["test"] + + logger.debug(self.tokenizer.decode(train_dataset[0]["input_ids"], skip_special_tokens=True)) + train_dataset_token_lengths = [len(train_dataset[i]["input_ids"]) for i in range(len(train_dataset))] + logger.info(f"max token length: {max(train_dataset_token_lengths)}") + logger.info(f"min token length: {min(train_dataset_token_lengths)}") + logger.info(f"avg token length: {np.mean(train_dataset_token_lengths)}") + + return train_dataset, eval_dataset + + +if __name__ == "__main__": + config_folder = os.path.join(os.path.dirname(__file__), "..", "config/") + load_dotenv(os.path.join(config_folder, ".env")) + config = load_config() + data_config = config["data"] + + def _retrieve(retriever_config, df): # noqa: C901 + if retriever_config["retriever_type"] == "Elasticsearch": + retriever = ElasticsearchRetriever( + index_name=retriever_config["index_name"], + ) + elif retriever_config["retriever_type"] == "BM25": + raise NotImplementedError("BM25๋Š” ๋” ์ด์ƒ ์ง€์›ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. Elasticsearch๋ฅผ ์‚ฌ์šฉํ•ด์ฃผ์„ธ์š”...") + + elif retriever_config["retriever_type"] == "DPR": + # KorDPRRetriever ์‚ฌ์šฉ + try: + model = KobertBiEncoder() # ๋ชจ๋ธ ์ดˆ๊ธฐํ™” + model.load("./rag/output/my_model.pt") # ๋ชจ๋ธ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ + logger.debug("Model loaded successfully.") + assert model is not None, "Model is None after loading." + except Exception as e: + logger.debug(f"Error while loading model: {e}") + + try: + valid_dataset = KorQuadDataset("./rag/data/KorQuAD_v1.0_dev.json") # ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ + logger.debug("Valid dataset loaded successfully.") + except Exception as e: + logger.debug(f"Error while loading valid dataset: {e}") + + try: + index = DenseFlatIndexer() # ์ธ๋ฑ์Šค ์ค€๋น„ + index.deserialize(path="./rag/2050iter_flat/") + logger.debug("Index loaded successfully.") + assert index is not None, "Index is None after loading." 
+ except Exception as e: + logger.debug(f"Error while loading index: {e}") + + ds_retriever = KorDPRRetriever(model=model, valid_dataset=valid_dataset, index=index) + logger.debug("KorDPRRetriever initialized successfully.") + + else: + return [""] * len(df) + + def _combine_text(row): + if retriever_config["query_type"] == "pqc": + return row["paragraph"] + " " + row["problems"]["question"] + " " + " ".join(row["problems"]["choices"]) + if retriever_config["query_type"] == "pq": + return row["paragraph"] + " " + row["problems"]["question"] + if retriever_config["query_type"] == "pc": + return row["paragraph"] + " " + " ".join(row["problems"]["choices"]) + else: + return row["paragraph"] + + top_k = retriever_config["top_k"] + threshold = retriever_config["threshold"] + query_max_length = retriever_config["query_max_length"] + + queries = df.apply(_combine_text, axis=1) + if retriever_config["retriever_type"] == "Elasticsearch": + filtered_queries = [(i, q) for i, q in enumerate(queries) if len(q) <= query_max_length] + if not filtered_queries: + return [""] * len(queries) + + indices, valid_queries = zip(*filtered_queries) + retrieve_results = retriever.bulk_retrieve(valid_queries, top_k) + rerank_k = retriever_config["rerank_k"] + if rerank_k > 0: + with Reranker() as reranker: + retrieve_results = reranker.rerank(valid_queries, retrieve_results, rerank_k) + # [[{"text":"์•ˆ๋…•ํ•˜์„ธ์š”", "score":0.5}, {"text":"๋ฐ˜๊ฐ‘์Šต๋‹ˆ๋‹ค", "score":0.3},],] + + docs = [""] * len(queries) + for idx, result in zip(indices, retrieve_results): + docs[idx] = " ".join( + f"[{item['score']}]: {item['text']}" for item in result if item["score"] >= threshold + ) + + elif retriever_config["retriever_type"] == "DPR": # DPR์ธ ๊ฒฝ์šฐ + docs = [] + for query in queries: + passages = ds_retriever.retrieve(query=query, k=top_k) # DPR์œผ๋กœ ๊ฒ€์ƒ‰ + + # passage ๋กœ๋”ฉ ๋ฐ ๊ฒฐํ•ฉ + for idx, (passage, score) in enumerate(passages): + # passage ID์— ํ•ด๋‹นํ•˜๋Š” ํŒŒ์ผ ๊ฒฝ๋กœ ๊ฐ€์ ธ์˜ค๊ธฐ + path = get_passage_file([idx]) + if path: + with open(path, "rb") as f: + passage_dict = pickle.load(f) + docs.append((passage_dict[idx], score)) # passage์™€ score ์ €์žฅ + else: + logger.debug(f"No passage found for ID: {idx}") + + # ๋กœ๊น… ์ถ”๊ฐ€ + logger.info(f"Query: {query}") + logger.info(f"Rank {idx+1}: Score: {score:.4f}, Passage: {passage}") + return docs + + def load_and_save(retriever_config, file_path) -> List[Dict]: + """csv๋ฅผ ์ฝ์–ด์˜ค๊ณ  dictionary ๋ฐฐ์—ด ํ˜•ํƒœ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + df = pd.read_csv(file_path) + df["problems"] = df["problems"].apply(literal_eval) + docs = _retrieve(retriever_config, df) + df["documents"] = docs + df.to_csv(file_path.replace(".csv", "_retrieve.csv"), index=False) + logger.debug("retrieve ๊ฒฐ๊ณผ๊ฐ€ csv๋กœ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + load_and_save(data_config["retriever"], data_config["train_path"]) + load_and_save(data_config["retriever"], data_config["test_path"]) diff --git a/code/inference.py b/code/inference.py new file mode 100644 index 0000000..58767f5 --- /dev/null +++ b/code/inference.py @@ -0,0 +1,48 @@ +from loguru import logger +import numpy as np +import pandas as pd +import torch +from tqdm import tqdm + + +class InferenceModel: + def __init__(self, inference_config, model, tokenizer, test_dataset): + self.inference_config = inference_config + self.model = model + self.tokenizer = tokenizer + self.test_dataset = test_dataset + self.pred_choices_map = {0: "1", 1: "2", 2: "3", 3: "4", 4: "5"} + + def run_inference(self): + if not self.inference_config["do_test"]: + 
logger.info("์ถ”๋ก  ๋‹จ๊ณ„๋ฅผ ์ƒ๋žตํ•ฉ๋‹ˆ๋‹ค. inference do_test ์„ค์ •์„ ํ™•์ธํ•˜์„ธ์š”.") + return + + results = self._inference(self.test_dataset) + return self._save_results(results) + + def _inference(self, test_dataset): + infer_results = [] + self.model.config.use_cache = True + self.model.eval() + + with torch.inference_mode(): + for example in tqdm(test_dataset): + outputs = self.model( + self.tokenizer.apply_chat_template( + example["messages"], tokenize=True, add_generation_prompt=True, return_tensors="pt" + ).to("cuda") + ) + + logits = outputs.logits[:, -1].flatten().cpu() + target_logits = [logits[self.tokenizer.vocab[str(i + 1)]] for i in range(5)] # ์„ ํƒ์ง€๋Š” ํ•ญ์ƒ 5๊ฐœ + probs = torch.nn.functional.softmax(torch.tensor(target_logits, dtype=torch.float32), dim=-1) + predict_value = self.pred_choices_map[np.argmax(probs.detach().cpu().numpy())] + + infer_results.append({"id": example["id"], "answer": predict_value}) + + return infer_results + + def _save_results(self, results): + logger.info(self.inference_config["output_path"]) + pd.DataFrame(results).to_csv(self.inference_config["output_path"], index=False) diff --git a/code/labeling.py b/code/labeling.py new file mode 100644 index 0000000..e9956c7 --- /dev/null +++ b/code/labeling.py @@ -0,0 +1,119 @@ +from cleanlab.classification import CleanLearning +from cleanlab.filter import find_label_issues +from loguru import logger +import numpy as np +import pandas as pd +from sklearn.cluster import KMeans +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import StratifiedKFold +from xgboost import XGBClassifier + + +# Pandas ์ถœ๋ ฅ ์„ค์ • +pd.set_option("display.max_columns", None) +pd.set_option("display.max_rows", None) +pd.set_option("display.max_colwidth", None) + + +def create_initial_labels(input_file, output_file, num_clusters=2): + """TF-IDF์™€ K-means๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์ดˆ๊ธฐ ๋ผ๋ฒจ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.""" + df = pd.read_csv(input_file) + df.dropna(subset=["paragraph", "problems"], inplace=True) + df["combined_text"] = df["paragraph"] + " " + df["problems"] + + vectorizer = TfidfVectorizer() + X = vectorizer.fit_transform(df["combined_text"]) + + kmeans = KMeans(n_clusters=num_clusters, random_state=42) + kmeans.fit(X) + + df["target"] = kmeans.labels_ + final_columns = ["id", "paragraph", "problems", "question_plus", "target"] + df[final_columns].to_csv(output_file, index=False) + logger.info(f"์ดˆ๊ธฐ ๋ผ๋ฒจ๋ง์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. 
๊ฒฐ๊ณผ๊ฐ€ {output_file}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + +def load_and_preprocess_data(file_path): + """๋ฐ์ดํ„ฐ๋ฅผ ๋กœ๋“œํ•˜๊ณ  ์ „์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค.""" + df = pd.read_csv(file_path) + if "target" not in df.columns: + raise ValueError("๋ฐ์ดํ„ฐ์…‹์— 'target' ์—ด์ด ์—†์Šต๋‹ˆ๋‹ค.") + + X = df[["paragraph", "problems"]].astype(str).agg(" ".join, axis=1) + y = df["target"].astype(int) + return df, X, y + + +def vectorize_text(X, max_features=5000): + """ํ…์ŠคํŠธ ๋ฐ์ดํ„ฐ๋ฅผ ๋ฒกํ„ฐํ™”ํ•ฉ๋‹ˆ๋‹ค.""" + vectorizer = TfidfVectorizer(max_features=max_features) + return vectorizer.fit_transform(X) + + +def train_and_predict(X_vectorized, y, n_splits=5): + """๋ชจ๋ธ์„ ํ›ˆ๋ จํ•˜๊ณ  ์˜ˆ์ธก ํ™•๋ฅ ์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + base_model = XGBClassifier(eval_metric="mlogloss", n_estimators=100) + model = CleanLearning(base_model) + skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) + pred_probs = np.zeros((len(y), len(np.unique(y)))) + + for fold, (train_index, val_index) in enumerate(skf.split(X_vectorized, y), 1): + logger.info(f"Fold {fold}/{n_splits}") + X_train, X_val = X_vectorized[train_index], X_vectorized[val_index] + y_train, _ = y[train_index], y[val_index] + model.fit(X_train, y_train) + pred_probs[val_index] = model.predict_proba(X_val) + + return pred_probs + + +def find_and_update_label_issues(df, y, pred_probs): + """๋ ˆ์ด๋ธ” ์ด์Šˆ๋ฅผ ์ฐพ๊ณ  ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์„ ์—…๋ฐ์ดํŠธํ•ฉ๋‹ˆ๋‹ค.""" + label_issues = find_label_issues(labels=y, pred_probs=pred_probs, return_indices_ranked_by="self_confidence") + df["is_label_issue"] = False + df.loc[label_issues, "is_label_issue"] = True + df["suggested_label"] = np.argmax(pred_probs, axis=1) + return df + + +def save_and_print_results(df, output_file): + """๊ฒฐ๊ณผ๋ฅผ ์ €์žฅํ•˜๊ณ  ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.""" + final_columns = ["id", "paragraph", "problems", "question_plus", "target", "suggested_label", "is_label_issue"] + df[final_columns].to_csv(output_file, index=False) + + logger.info("\nID์™€ ์ œ์•ˆ๋œ ๋ ˆ์ด๋ธ”:") + logger.info(df[["id", "suggested_label"]].to_string(index=False)) + + logger.info("\n๋ ˆ์ด๋ธ” ์ด์Šˆ ํ†ต๊ณ„:") + logger.info(df["is_label_issue"].value_counts(normalize=True)) + + logger.info("\n์›๋ž˜ ๋ ˆ์ด๋ธ”๊ณผ ์ œ์•ˆ๋œ ๋ ˆ์ด๋ธ” ๋น„๊ต:") + logger.info(pd.crosstab(df["target"], df["suggested_label"])) + + +def main(): + initial_input_file = "../data/train.csv" + initial_output_file = "../data/output_with_labels.csv" + final_output_file = "../data/cleaned_output_with_labels_CL.csv" + + # ์ดˆ๊ธฐ ๋ผ๋ฒจ๋ง ์ˆ˜ํ–‰ + create_initial_labels(initial_input_file, initial_output_file) + + # ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ + df, X, y = load_and_preprocess_data(initial_output_file) + + # ํ…์ŠคํŠธ ๋ฒกํ„ฐํ™” + X_vectorized = vectorize_text(X) + + # ๋ชจ๋ธ ํ›ˆ๋ จ ๋ฐ ์˜ˆ์ธก + pred_probs = train_and_predict(X_vectorized, y) + + # ๋ ˆ์ด๋ธ” ์ด์Šˆ ์ฐพ๊ธฐ ๋ฐ ์—…๋ฐ์ดํŠธ + df = find_and_update_label_issues(df, y, pred_probs) + + # ๊ฒฐ๊ณผ ์ €์žฅ ๋ฐ ์ถœ๋ ฅ + save_and_print_results(df, final_output_file) + + +if __name__ == "__main__": + main() diff --git a/code/main.py b/code/main.py new file mode 100644 index 0000000..a8733ec --- /dev/null +++ b/code/main.py @@ -0,0 +1,84 @@ +import os + +from data_loaders import DataLoader +from inference import InferenceModel +from loguru import logger +from model import ModelHandler +from trainer import CustomTrainer +from utils import ( + GoogleDriveManager, + create_experiment_filename, + load_config, + load_env_file, + log_config, + set_logger, + set_seed, +) +import wandb + + +def main(): + 
# env, config, log, seed ์„ค์ • + load_env_file() + config = load_config() + set_logger(log_file=config["log"]["file"], log_level=config["log"]["level"]) + set_seed() + + # wandb ์„ค์ • + exp_name = create_experiment_filename(config) + wandb.init( + config=config, + project=config["wandb"]["project"], + entity=config["wandb"]["entity"], + name=exp_name, + ) + + # wandb ์‹คํ—˜๋ช…์œผ๋กœ config ๊ฐฑ์‹  + config["training"]["run_name"] = exp_name + config["inference"]["output_path"] = os.path.join(config["inference"]["output_path"], exp_name + "_output.csv") + log_config(config) + + try: + # ๋ชจ๋ธ ๋ฐ ํ† ํฌ๋‚˜์ด์ € ์„ค์ • + model_handler = ModelHandler(config["model"]) + model, tokenizer = model_handler.setup() + + # ํ•™์Šต์šฉ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ + data_processor = DataLoader(tokenizer, config["data"]) + train_dataset, eval_dataset = data_processor.prepare_datasets(is_train=True) + test_dataset = data_processor.prepare_datasets(is_train=False) + + # ํ•™์Šต + trainer = CustomTrainer( + training_config=config["training"], + model=model, + tokenizer=tokenizer, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + trained_model = trainer.train() + + # ์ถ”๋ก  + inferencer = InferenceModel( + inference_config=config["inference"], + model=trained_model, + tokenizer=tokenizer, + test_dataset=test_dataset, + ) + inferencer.run_inference() + + except Exception as e: + logger.exception(f"Error occurred: {e}") + wandb.finish(exit_code=1) + else: + logger.info("Upload output & config to GDrive...") + gdrive_manager = GoogleDriveManager() + gdrive_manager.upload_exp( + config["exp"]["username"], + config["inference"]["output_path"], + ) + wandb.finish() + + +if __name__ == "__main__": + main() diff --git a/code/model.py b/code/model.py new file mode 100644 index 0000000..6d7fd69 --- /dev/null +++ b/code/model.py @@ -0,0 +1,60 @@ +from loguru import logger +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + + +class ModelHandler: + def __init__(self, model_config): + self.base_model = model_config["base_model"] + self.model_config = model_config["model"] + self.tokenizer_config = model_config["tokenizer"] + + def setup(self): + model = self._load_model() + tokenizer = self._load_tokenizer() + return model, tokenizer + + def _load_model(self): + torch_dtype = getattr(torch, self.model_config["torch_dtype"]) + base_kwargs = {"trust_remote_code": True, "low_cpu_mem_usage": self.model_config["low_cpu_mem_usage"]} + + if self.model_config["quantization"] == "BitsAndBytes": + bits = self.model_config["bits"] + if bits == 8: + quantization_config = BitsAndBytesConfig( + load_in_8bit=True, + bnb_8bit_use_double_quant=self.model_config["use_double_quant"], + bnb_8bit_compute_dtype=torch_dtype, + ) + elif bits == 4: + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=self.model_config["use_double_quant"], + bnb_4bit_compute_dtype=torch_dtype, + ) + else: + raise ValueError(f"Unsupported bits value: {bits}") + + base_kwargs["quantization_config"] = quantization_config + elif self.model_config["quantization"] == "auto": + base_kwargs["torch_dtype"] = "auto" + base_kwargs["device_map"] = "auto" + else: + base_kwargs["torch_dtype"] = torch_dtype + + logger.debug(f"base_kwargs: {base_kwargs}") + model = AutoModelForCausalLM.from_pretrained(self.base_model, **base_kwargs) + model.config.use_cache = self.model_config["use_cache"] + return model + + def _load_tokenizer(self): + tokenizer = 
AutoTokenizer.from_pretrained(self.base_model, trust_remote_code=True) + self._setup_tokenizer(tokenizer) + return tokenizer + + def _setup_tokenizer(self, tokenizer): + tokenizer.chat_template = self.tokenizer_config["chat_template"] + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer.padding_side = self.tokenizer_config["padding_side"] diff --git a/code/rag/README.md b/code/rag/README.md new file mode 100644 index 0000000..4a54cc8 --- /dev/null +++ b/code/rag/README.md @@ -0,0 +1,34 @@ +# Dense Retriever ์‚ฌ์šฉ ๊ฐ€์ด๋“œ + +์ด ๊ฐ€์ด๋“œ๋Š” Dense Retriever๋ฅผ ์„ค์ •ํ•˜๊ณ  ์‚ฌ์šฉํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ์„ค๋ช…ํ•ฉ๋‹ˆ๋‹ค. + +## ์ค€๋น„ ๋‹จ๊ณ„ + +1. **์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„ ํŒŒ์ผ ์ค€๋น„** + - `rag` ํด๋” ๋‚ด์— ์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„ ํŒŒ์ผ์„ ๋‹ค์šด๋กœ๋“œํ•˜๊ณ  ์••์ถ•์„ ํ•ด์ œํ•ฉ๋‹ˆ๋‹ค. + - `text` ํด๋” ๋‚ด์— `AA`, `AB`, `AC` ํด๋”๊ฐ€ ์กด์žฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. + - ๊ฐ ํด๋” ์•ˆ์— `wiki_`๋กœ ์‹œ์ž‘ํ•˜๋Š” ํŒŒ์ผ๋“ค์ด ์žˆ์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +2. **KorQuAD_v1.0 ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„** + - `data` ํด๋” ๋‚ด์— KorQuAD_v1.0_dev, KorQuAD_v1.0_train ํŒŒ์ผ์„ ์ค€๋น„ํ•ด์•ผํ•ฉ๋‹ˆ๋‹ค. + - https://korquad.github.io/category/1.0_KOR.html +3. **๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ** + - `prepare_dense.py` ์Šคํฌ๋ฆฝํŠธ๋ฅผ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค. + - ์‹คํ–‰ ํ›„ ๋‹ค์Œ ํŒŒ์ผ๋“ค์ด ์ƒ์„ฑ๋˜์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค: + - `preproccessed_passages/0-XXXX.p,XXXX-XXXX.p...` + - `titled_passage_map.p` + - `2050iter_flat/index_meta.dpr,index.dpr` + - ์ƒ์„ฑ๋œ ํŒŒ์ผ์„ ์‚ฌ์šฉํ•˜์—ฌ ์˜ˆ์‹œ ์ฟผ๋ฆฌ์— ๋Œ€ํ•œ ์ ์ ˆํ•œ ๋ฌธ์„œ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•ฉ๋‹ˆ๋‹ค. +## ์‚ฌ์šฉ ๋ฐฉ๋ฒ• + +1. **์„ค์ • ํŒŒ์ผ ์ˆ˜์ •** + - `config` ํŒŒ์ผ์—์„œ `retriever_type`์„ `"DPR"`๋กœ ์„ค์ •ํ•ฉ๋‹ˆ๋‹ค. + +2. **์‹คํ–‰** + - ๋‹ค์Œ ๋ช…๋ น์–ด๋ฅผ ์‹คํ–‰ํ•˜์—ฌ Dense Retriever๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค: + ```bash + python main.py + ``` + +## ์ฃผ์˜์‚ฌํ•ญ +- ์ด ์ฝ”๋“œ๋Š” https://github.com/TmaxEdu/KorDPR๋ฅผ ์ฐธ๊ณ ํ•˜์—ฌ ์ž‘์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. +- ์œ„์˜ ๋‹จ๊ณ„๋“ค์„ ์ˆœ์„œ๋Œ€๋กœ ์ง„ํ–‰ํ•ด์•ผ Dense Retriever๊ฐ€ ์ •์ƒ์ ์œผ๋กœ ์ž‘๋™ํ•ฉ๋‹ˆ๋‹ค. diff --git a/code/rag/__init__.py b/code/rag/__init__.py new file mode 100644 index 0000000..ca0ab1e --- /dev/null +++ b/code/rag/__init__.py @@ -0,0 +1,18 @@ +# # __init__.py ํŒŒ์ผ ๋‚ด์—์„œ ํ•„์š”ํ•œ ๋ชจ๋“ˆ๋“ค์„ ์ž„ํฌํŠธ +from .chunk_data import DataChunk, save_orig_passage, save_title_index_map +from .dpr_data import KorQuadDataset, KorQuadSampler, korquad_collator +from .encoder import KobertBiEncoder + +# from .retriever_dense import DenseRetriever # ์ด ๋ถ€๋ถ„๋„ ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค. 
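+# 현재 노출되는 것은 chunk/dpr 데이터 유틸, KobertBiEncoder, Reranker, BM25/Elasticsearch retriever, Trainer이며,
+# IndexRunner, KorDPRRetriever, DenseFlatIndexer 등 DPR 인덱싱 관련 import는 필요할 때 주석을 해제해 사용합니다.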
+from .reranker import Reranker + +# #from .utils import get_wiki_filepath, wiki_worker_init # ๋ณ€๊ฒฝ ์—†์Œ +# # import transformers +# # # ์™ธ๋ถ€ ์Šคํฌ๋ฆฝํŠธ์—์„œ IndexRunner ์ž„ํฌํŠธ +# from .index_runner import IndexRunner +# from .retriever import KorDPRRetriever # retriever.py์—์„œ ๊ฐ€์ ธ์˜ค๊ธฐ +# from .indexers import DenseFlatIndexer +# # # ์ถ”๊ฐ€๋œ ๋ถ€๋ถ„ +from .retriever_bm25 import BM25Retriever +from .retriever_elastic import ElasticsearchRetriever +from .trainer import Trainer diff --git a/code/rag/chunk_data.py b/code/rag/chunk_data.py new file mode 100644 index 0000000..3bfe86f --- /dev/null +++ b/code/rag/chunk_data.py @@ -0,0 +1,111 @@ +from collections import defaultdict +from glob import glob +import logging +import os +import pickle + +from tqdm import tqdm +from transformers import AutoTokenizer + + +os.makedirs("logs", exist_ok=True) +logging.basicConfig( + filename="logs/log.log", + level=logging.DEBUG, + format="[%(asctime)s | %(funcName)s @ %(pathname)s] %(message)s", +) +logger = logging.getLogger() + + +class DataChunk: + """์ธํ’‹ text๋ฅผ tokenizingํ•œ ๋’ค์— ์ฃผ์–ด์ง„ ๊ธธ์ด๋กœ chunking ํ•ด์„œ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค. + ์ด๋•Œ ํ•˜๋‚˜์˜ chunk(context, index ๋‹จ์œ„)๋Š” ํ•˜๋‚˜์˜ article์—๋งŒ ์†ํ•ด์žˆ์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.""" + + def __init__(self, chunk_size=100): + self.chunk_size = chunk_size + self.tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True) + + def chunk(self, input_file): + logger.info(f"Processing file: {input_file}") + with open(input_file, "rt", encoding="utf8") as f: + input_txt = f.read().strip() + input_txt = input_txt.split("") + chunk_list = [] + orig_text = [] + for art in input_txt: + art = art.strip() + if not art: + logger.debug("Article is empty, passing") + continue + title = art.split("\n")[0].strip(">").split("title=")[1].strip('"') + text = "\n".join(art.split("\n")[2:]).strip() + + logger.debug(f"Processing article: {title}") + + encoded_title = self.tokenizer.encode(title, add_special_tokens=True) + encoded_txt = self.tokenizer.encode(text, add_special_tokens=True) + if len(encoded_txt) < 5: + logger.debug(f"Title {title} has <5 subwords in its article, passing") + continue + + for start_idx in range(0, len(encoded_txt), self.chunk_size): + end_idx = min(len(encoded_txt), start_idx + self.chunk_size) + chunk = encoded_title + encoded_txt[start_idx:end_idx] + orig_text.append(self.tokenizer.decode(chunk)) + chunk_list.append(chunk) + + logger.info(f"Processed {len(orig_text)} chunks from {input_file}.") + return orig_text, chunk_list + + +def save_orig_passage(input_path="text", passage_path="processed_passages", chunk_size=100): + os.makedirs(passage_path, exist_ok=True) + app = DataChunk(chunk_size=chunk_size) + idx = 0 + for path in tqdm(glob(f"{input_path}/*/wiki_*")): + ret, _ = app.chunk(path) + logger.info(f"Processed {len(ret)} chunks from {path}.") # ์ถ”๊ฐ€๋œ ๋กœ๊ทธ + if len(ret) > 0: # ์ฒญํฌ๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ์—๋งŒ ์ €์žฅ + to_save = {idx + i: ret[i] for i in range(len(ret))} + with open(f"{passage_path}/{idx}-{idx+len(ret)-1}.p", "wb") as f: + pickle.dump(to_save, f) + idx += len(ret) + + +def save_title_index_map(index_path="title_passage_map.p", source_passage_path="processed_passages"): + logging.getLogger() + logger.debug(f"Looking for files in {source_passage_path}") + files = glob(f"{source_passage_path}/*") + logger.debug(f"Found {len(files)} files") + + title_id_map = defaultdict(list) + for f in tqdm(files): + logger.debug(f"Processing file: {f}") + with open(f, "rb") as _f: + 
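+            # 각 pickle 파일은 {passage_id: "[CLS] 제목 [SEP] 본문 ..."} 형태의 dict이며,
+            # 아래에서 passage 앞부분의 제목을 파싱해 title -> passage_id 리스트 매핑을 만듭니다.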
id_passage_map = pickle.load(_f) + + # ๋กœ๊ทธ ์ถ”๊ฐ€: id_passage_map์˜ ํ˜•์‹ ๋ฐ ๋‚ด์šฉ ํ™•์ธ + logger.debug(f"Loaded {len(id_passage_map)} passages from {f}") + logger.debug(f"Sample passage: {list(id_passage_map.items())[:5]}") # ์ฒซ 5๊ฐœ ํ•ญ๋ชฉ ์ถœ๋ ฅ + + for id, passage in id_passage_map.items(): + parts = passage.split("[SEP]") + if len(parts) > 1: + title = parts[0].split("[CLS]")[1].strip() + title_id_map[title].append(id) + else: + logger.debug(f"Unexpected passage format in file {f}, id {id}") + + logger.debug(f"Processed {len(id_passage_map)} passages from {f}...") + + logger.debug(f"Total unique titles: {len(title_id_map)}") + + with open(index_path, "wb") as f: + pickle.dump(title_id_map, f) + + logger.debug(f"Finished saving title_index_mapping at {index_path}!") + + +# if __name__ == "__main__": +# save_orig_passage() +# save_title_index_map() diff --git a/code/rag/data_process/external_data.py b/code/rag/data_process/external_data.py new file mode 100644 index 0000000..e393476 --- /dev/null +++ b/code/rag/data_process/external_data.py @@ -0,0 +1,198 @@ +import json +import os +from pathlib import Path +import re +import urllib.request + +from loguru import logger + + +def preprocess_text(text): + # ํ•œ๊ธ€, ์ˆซ์ž, ํŠน์ˆ˜๋ฌธ์ž, ๊ณต๋ฐฑ๋งŒ ๋‚จ๊ธฐ๊ณ  ๋‚˜๋จธ์ง€ ์ œ๊ฑฐ + text = re.sub(r"\n", " ", text) + text = re.sub(r"\\n", " ", text) + text = re.sub(r"#", " ", text) + text = re.sub(r"\s+", " ", text).strip() + text = re.sub(r'[^ใ„ฑ-ใ…Ž๊ฐ€-ํžฃ0-9!"#%&\'(),-./:;<=>?@[\]^_`{|}~\s]', "", text) + + # ๋‚ด์šฉ์ด ๋นˆ ๊ด„ํ˜ธ ์ œ๊ฑฐ + pattern = r"\(\s*\)" + while re.search(pattern, text): + text = re.sub(pattern, "", text) + + return text + + +def process_json_array(json_data): + # text ํ•„๋“œ ์ „์ฒ˜๋ฆฌ + if "text" in json_data: + json_data["text"] = preprocess_text(json_data["text"]) + + # title ํ•„๋“œ ์ „์ฒ˜๋ฆฌ + if "title" in json_data: + json_data["title"] = preprocess_text(json_data["title"]) + + return json_data + + +def process_json_file(json_filename): + with open(json_filename, "r", encoding="utf-8") as f: + docs = json.load(f) + + processed_docs = [process_json_array(item) for item in docs] + + # ๋””๋ ‰ํ† ๋ฆฌ์™€ ํŒŒ์ผ๋ช… ๋ถ„๋ฆฌ ํ›„ ํŒŒ์ผ๋ช…์—๋งŒ 'processed_' ์ถ”๊ฐ€ + directory, filename = os.path.split(json_filename) + output_path = os.path.join(directory, "processed_" + filename) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(processed_docs, f, ensure_ascii=False, indent=2) + + +def _dump_wiki(data_path: str = "../data"): + """ + ์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„๋ฅผ ๋‹ค์šด๋กœ๋“œํ•˜๊ณ  ์ถ”์ถœํ•˜๋Š” ํ•จ์ˆ˜ + """ + dump_filename = "kowiki-latest-pages-articles.xml.bz2" + dump_path = os.path.join(data_path, dump_filename) + wiki_url = f"https://dumps.wikimedia.org/kowiki/latest/{dump_filename}" + + # wget https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2 + if not os.path.exists(dump_path): + logger.debug(f"์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„๋ฅผ ๋‹ค์šด๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค: {wiki_url}") + urllib.request.urlretrieve(wiki_url, dump_path) + logger.debug(f"๋‹ค์šด๋กœ๋“œ ์™„๋ฃŒ: {dump_path}") + + # python -m wikiextractor.WikiExtractor kowiki-latest-pages-articles.xml.bz2 + extract_dir = os.path.join(data_path, "text") + if not os.path.exists(extract_dir): + logger.debug("WikiExtractor๋กœ ๋คํ”„ ํŒŒ์ผ์„ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค...") + os.system(f"python -m wikiextractor.WikiExtractor {dump_path} -o {extract_dir}") + logger.debug("์ถ”์ถœ ์™„๋ฃŒ") + + def _get_filename_list(dirname): + filepaths = [] + for root, dirs, files in os.walk(dirname): + for file in files: + 
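+                # WikiExtractor가 생성한 wiki_00, wiki_01 같은 `wiki_` + 숫자 두 자리 형식의 파일만 수집합니다.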
filepath = os.path.join(root, file) + if re.match(r"wiki_[0-9][0-9]", file): + filepaths.append(filepath) + return sorted(filepaths) + + filepaths = _get_filename_list(extract_dir) + output_path = os.path.join(data_path, "wiki_dump.txt") + + # ํŒŒ์ผ ๋‚ด์šฉ ์ฝ๊ธฐ + all_text = "" + for filepath in filepaths: + with open(filepath, "r", encoding="utf-8") as f: + all_text += f.read() + "\n" + + # ์ „์ฒด ํ…์ŠคํŠธ๋ฅผ ํ•˜๋‚˜์˜ ํŒŒ์ผ๋กœ ์ €์žฅ + with open(output_path, "w", encoding="utf-8") as f: + f.write(all_text) + + logger.debug(f"์ด {len(filepaths)}๊ฐœ์˜ ํŒŒ์ผ์„ ์ฒ˜๋ฆฌํ–ˆ์Šต๋‹ˆ๋‹ค.") + logger.debug(f"๋ชจ๋“  ๋‚ด์šฉ์ด {output_path} ํŒŒ์ผ์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + +def _parse_wiki_dump(file_path: str = "../data/wiki_dump.txt"): + """ + ์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„ ํŒŒ์ผ์„ JSON ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ํ•จ์ˆ˜ + """ + documents = [] + current_doc = "" + doc_id = None + title = None + + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + # ์ƒˆ๋กœ์šด ๋ฌธ์„œ ์‹œ์ž‘ + if line.startswith(""): + if current_doc.strip(): # ๋นˆ ๋ฌธ์„œ๊ฐ€ ์•„๋‹Œ ๊ฒฝ์šฐ๋งŒ ์ถ”๊ฐ€ + documents.append({"id": doc_id, "title": title, "text": current_doc.strip()}) + # ๋ฌธ์„œ ๋‚ด์šฉ + else: + current_doc += line + + # JSON ํŒŒ์ผ๋กœ ์ €์žฅ + output_path = file_path.replace(".txt", ".json") + with open(output_path, "w", encoding="utf-8") as json_file: + json.dump(documents, json_file, ensure_ascii=False, indent=4) + + logger.debug(f"JSON ํŒŒ์ผ์ด ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {output_path}") + logger.debug(f"์ด {len(documents)}๊ฐœ์˜ ๋ฌธ์„œ๊ฐ€ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + return documents + + +def wikipedia(): + """ + ์œ„ํ‚คํ”ผ๋””์•„ ํ•œ๊ตญ์–ด ๋คํ”„ ๋ฌธ์„œ๋ฅผ ๊ฐ€์ ธ์˜ค๊ณ  ํŒŒ์‹ฑํ•˜์—ฌ ํ•˜๋‚˜์˜ JSON ํŒŒ์ผ ์ƒ์„ฑ + """ + _dump_wiki() + _parse_wiki_dump() + + +def ai_hub_news_corpus(input_dir: str, output_file: str): + """ + ๋Œ€๊ทœ๋ชจ ์›น๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ๋ง๋ญ‰์น˜ ๋ฐ์ดํ„ฐ + \n https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&dataSetSn=624 + \n ์ง€์ •๋œ ๋””๋ ‰ํ† ๋ฆฌ์˜ ๋ชจ๋“  JSON ํŒŒ์ผ์„ ์ฒ˜๋ฆฌํ•˜์—ฌ ํ•˜๋‚˜์˜ JSON ํŒŒ์ผ๋กœ ํ†ตํ•ฉ + Args: + input_dir: ์ž…๋ ฅ JSON ํŒŒ์ผ๋“ค์ด ์žˆ๋Š” ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ + output_file: ์ถœ๋ ฅ๋  ํ†ตํ•ฉ JSON ํŒŒ์ผ ๊ฒฝ๋กœ + """ + all_documents = [] + input_path = Path(input_dir) + + try: + # ์ž…๋ ฅ ๋””๋ ‰ํ† ๋ฆฌ ๋‚ด์˜ ๋ชจ๋“  JSON ํŒŒ์ผ ์ฒ˜๋ฆฌ + for json_file in input_path.glob("**/*.json"): + logger.info(f"์ฒ˜๋ฆฌ ์ค‘์ธ ํŒŒ์ผ: {json_file}") + + try: + with open(json_file, "r", encoding="utf-8") as f: + data = json.load(f) + + # SJML ๊ตฌ์กฐ ํ™•์ธ ๋ฐ ๋ฐ์ดํ„ฐ ์ถ”์ถœ + if "SJML" in data and "text" in data["SJML"]: + for doc in data["SJML"]["text"]: + processed_doc = {"title": doc["title"], "text": doc["content"]} + all_documents.append(processed_doc) + else: + logger.warning(f"์ž˜๋ชป๋œ JSON ๊ตฌ์กฐ: {json_file}") + + except json.JSONDecodeError: + logger.error(f"JSON ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {json_file}") + except Exception as e: + logger.error(f"ํŒŒ์ผ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {json_file}, ์˜ค๋ฅ˜: {str(e)}") + + # ์ตœ์ข… ๊ฒฐ๊ณผ๋ฅผ ๋‹จ์ผ JSON ํŒŒ์ผ๋กœ ์ €์žฅ + if all_documents: + with open(output_file, "w", encoding="utf-8") as f: + json.dump(all_documents, f, ensure_ascii=False, indent=2) + logger.info(f"์ฒ˜๋ฆฌ ์™„๋ฃŒ: ์ด {len(all_documents)}๊ฐœ ๋ฌธ์„œ๊ฐ€ {output_file}์— ์ €์žฅ๋จ") + else: + logger.warning("์ฒ˜๋ฆฌ๋œ ๋ฌธ์„œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.") + + except Exception as e: + logger.error(f"์ „์ฒด ์ฒ˜๋ฆฌ ๊ณผ์ • ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}") + + +if __name__ == "__main__": + os.chdir("../../") + + PROCESS_JSON_FILE = False + WIKIPEDIA = False + 
AI_HUB_NEWS_CORPUS = False + + if PROCESS_JSON_FILE: + process_json_file("../data/documents.json") + if WIKIPEDIA: + wikipedia() + if AI_HUB_NEWS_CORPUS: + ai_hub_news_corpus("../data/ai_hub_news_corpus", "../data/ai_hub_news_corpus.json") diff --git a/code/rag/data_process/wiki_dump.py b/code/rag/data_process/wiki_dump.py new file mode 100644 index 0000000..a5ad05e --- /dev/null +++ b/code/rag/data_process/wiki_dump.py @@ -0,0 +1,93 @@ +import json +import os +import re +import urllib.request + +from loguru import logger + + +def dump_wiki(data_path: str = "../data"): + """ + ์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„๋ฅผ ๋‹ค์šด๋กœ๋“œํ•˜๊ณ  ์ถ”์ถœํ•˜๋Š” ํ•จ์ˆ˜ + """ + dump_filename = "kowiki-latest-pages-articles.xml.bz2" + dump_path = os.path.join(data_path, dump_filename) + wiki_url = f"https://dumps.wikimedia.org/kowiki/latest/{dump_filename}" + + # wget https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2 + if not os.path.exists(dump_path): + logger.debug(f"์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„๋ฅผ ๋‹ค์šด๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค: {wiki_url}") + urllib.request.urlretrieve(wiki_url, dump_path) + logger.debug(f"๋‹ค์šด๋กœ๋“œ ์™„๋ฃŒ: {dump_path}") + + # python -m wikiextractor.WikiExtractor kowiki-latest-pages-articles.xml.bz2 + extract_dir = os.path.join(data_path, "text") + if not os.path.exists(extract_dir): + logger.debug("WikiExtractor๋กœ ๋คํ”„ ํŒŒ์ผ์„ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค...") + os.system(f"python -m wikiextractor.WikiExtractor {dump_path} -o {extract_dir}") + logger.debug("์ถ”์ถœ ์™„๋ฃŒ") + + def _get_filename_list(dirname): + filepaths = [] + for root, dirs, files in os.walk(dirname): + for file in files: + filepath = os.path.join(root, file) + if re.match(r"wiki_[0-9][0-9]", file): + filepaths.append(filepath) + return sorted(filepaths) + + filepaths = _get_filename_list(extract_dir) + output_path = os.path.join(data_path, "wiki_dump.txt") + + # ํŒŒ์ผ ๋‚ด์šฉ ์ฝ๊ธฐ + all_text = "" + for filepath in filepaths: + with open(filepath, "r", encoding="utf-8") as f: + all_text += f.read() + "\n" + + # ์ „์ฒด ํ…์ŠคํŠธ๋ฅผ ํ•˜๋‚˜์˜ ํŒŒ์ผ๋กœ ์ €์žฅ + with open(output_path, "w", encoding="utf-8") as f: + f.write(all_text) + + logger.debug(f"์ด {len(filepaths)}๊ฐœ์˜ ํŒŒ์ผ์„ ์ฒ˜๋ฆฌํ–ˆ์Šต๋‹ˆ๋‹ค.") + logger.debug(f"๋ชจ๋“  ๋‚ด์šฉ์ด {output_path} ํŒŒ์ผ์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + +def parse_wiki_dump(file_path: str = "../data/wiki_dump.txt"): + """ + ์œ„ํ‚คํ”ผ๋””์•„ ๋คํ”„ ํŒŒ์ผ์„ JSON ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ํ•จ์ˆ˜ + """ + documents = [] + current_doc = "" + doc_id = None + title = None + + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + # ์ƒˆ๋กœ์šด ๋ฌธ์„œ ์‹œ์ž‘ + if line.startswith(""): + if current_doc.strip(): # ๋นˆ ๋ฌธ์„œ๊ฐ€ ์•„๋‹Œ ๊ฒฝ์šฐ๋งŒ ์ถ”๊ฐ€ + documents.append({"id": doc_id, "title": title, "text": current_doc.strip()}) + # ๋ฌธ์„œ ๋‚ด์šฉ + else: + current_doc += line + + # JSON ํŒŒ์ผ๋กœ ์ €์žฅ + output_path = file_path.replace(".txt", ".json") + with open(output_path, "w", encoding="utf-8") as json_file: + json.dump(documents, json_file, ensure_ascii=False, indent=4) + + logger.debug(f"JSON ํŒŒ์ผ์ด ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {output_path}") + logger.debug(f"์ด {len(documents)}๊ฐœ์˜ ๋ฌธ์„œ๊ฐ€ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + return documents + + +if __name__ == "__main__": + os.chdir("../../") + dump_wiki() + parse_wiki_dump() diff --git a/code/rag/dpr_data.py b/code/rag/dpr_data.py new file mode 100644 index 0000000..b13155f --- /dev/null +++ b/code/rag/dpr_data.py @@ -0,0 +1,226 @@ +# from utils import get_passage_file +from glob import glob +import json +import logging 
+import math +import os +import pickle +import re +import typing +from typing import Iterator, List, Sized, Tuple + +import torch +from torch import tensor as T +from torch.nn.utils.rnn import pad_sequence +from tqdm import tqdm +from transformers import AutoTokenizer + + +def get_wiki_filepath(data_dir): + return glob(f"{data_dir}/*/wiki_*") + + +def wiki_worker_init(worker_id): + worker_info = torch.utils.data.get_worker_info() + dataset = worker_info.dataset + # logger.debug(dataset) + # dataset = + overall_start = dataset.start + overall_end = dataset.end + split_size = int(math.ceil((overall_end - overall_start) / float(worker_info.num_workers))) + worker_id = worker_info.id + # end_idx = min((worker_id+1) * split_size, len(dataset.data)) + dataset.start = overall_start + worker_id * split_size + dataset.end = min(dataset.start + split_size, overall_end) # index error ๋ฐฉ์ง€ + + +def get_passage_file(p_id_list: typing.List[int]) -> str: + """passage id๋ฅผ ๋ฐ›์•„์„œ ํ•ด๋‹น๋˜๋Š” ํŒŒ์ผ ์ด๋ฆ„์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + target_file = None + p_id_max = max(p_id_list) + p_id_min = min(p_id_list) + for f in glob("processed_passages/*.p"): + s, e = f.split("/")[1].split(".")[0].split("-") + s, e = int(s), int(e) + if p_id_min >= s and p_id_max <= e: + target_file = f + return target_file + + +# set logger +os.makedirs("logs", exist_ok=True) +logging.basicConfig( + filename="logs/log.log", + level=logging.DEBUG, + format="[%(asctime)s | %(funcName)s @ %(pathname)s] %(message)s", +) +logger = logging.getLogger() + + +def korquad_collator(batch: List[Tuple], padding_value: int) -> Tuple[torch.Tensor]: + """query, p_id, gold_passage๋ฅผ batch๋กœ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + batch_q = pad_sequence([T(e[0]) for e in batch], batch_first=True, padding_value=padding_value) + # logger.debug(batch_q.shape) + batch_q_attn_mask = (batch_q != padding_value).long() + # logger.debug(batch_q_attn_mask.shape) + batch_p_id = T([e[1] for e in batch])[:, None] + # logger.debug(batch_p_id.shape) + batch_p = pad_sequence([T(e[2]) for e in batch], batch_first=True, padding_value=padding_value) + # logger.debug(batch_p.shape) + batch_p_attn_mask = (batch_p != padding_value).long() + return (batch_q, batch_q_attn_mask, batch_p_id, batch_p, batch_p_attn_mask) + + +class KorQuadSampler(torch.utils.data.BatchSampler): + """in-batch negativeํ•™์Šต์„ ์œ„ํ•ด batch ๋‚ด์— ์ค‘๋ณต answer๋ฅผ ๊ฐ–์ง€ ์•Š๋„๋ก batch๋ฅผ ๊ตฌ์„ฑํ•ฉ๋‹ˆ๋‹ค. 
+ sample ์ผ๋ถ€๋ฅผ passํ•˜๊ธฐ ๋•Œ๋ฌธ์— ์ „์ฒด data ์ˆ˜๋ณด๋‹ค iteration์„ ํ†ตํ•ด ๋‚˜์˜ค๋Š” ๋ฐ์ดํ„ฐ ์ˆ˜๊ฐ€ ๋ช‡์‹ญ๊ฐœ ์ •๋„ ์ ์Šต๋‹ˆ๋‹ค.""" + + def __init__( + self, + data_source: Sized, + batch_size: int, + drop_last: bool = False, + shuffle: bool = True, + generator=None, + ) -> None: + if shuffle: + sampler = torch.utils.data.RandomSampler(data_source, replacement=False, generator=generator) + else: + sampler = torch.utils.data.SequentialSampler(data_source) + super(KorQuadSampler, self).__init__(sampler=sampler, batch_size=batch_size, drop_last=drop_last) + + def __iter__(self) -> Iterator[List[int]]: + sampled_p_id = [] + sampled_idx = [] + for idx in self.sampler: + item = self.sampler.data_source[idx] + if item[1] in sampled_p_id: + continue # ๋งŒ์ผ ๊ฐ™์€ answer passage๊ฐ€ ์ด๋ฏธ ๋ฝ‘ํ˜”๋‹ค๋ฉด pass + sampled_idx.append(idx) + sampled_p_id.append(item[1]) + if len(sampled_idx) >= self.batch_size: + yield sampled_idx + sampled_p_id = [] + sampled_idx = [] + if len(sampled_idx) > 0 and not self.drop_last: + yield sampled_idx + + +class KorQuadDataset: + def __init__(self, korquad_path: str, title_passage_map_path="title_passage_map.p"): + self.korquad_path = korquad_path + self.data_tuples = [] + self.tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True) + self.pad_token_id = self.tokenizer.get_vocab()["[PAD]"] + self.load() + + @property + def dataset(self) -> List[Tuple]: + return self.tokenized_tuples + + def stat(self): + """korquad ๋ฐ์ดํ„ฐ์…‹์˜ ์Šคํƒฏ์„ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.""" + raise NotImplementedError() + + def load(self): + """๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋˜์—ˆ๋‹ค๋ฉด loadํ•˜๊ณ  ๊ทธ๋ ‡์ง€ ์•Š์œผ๋ฉด ์ „์ฒ˜๋ฆฌ๋ฅผ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.""" + self.korquad_processed_path = f"{self.korquad_path.split('.json')[0]}_processed.p" + if os.path.exists(self.korquad_processed_path): + logger.debug("preprocessed file already exists, loading...") + with open(self.korquad_processed_path, "rb") as f: + self.tokenized_tuples = pickle.load(f) + logger.debug("successfully loaded tokenized_tuples into self.tokenized_tuples") + + else: + self._load_data() + self._match_passage() + logger.debug("successfully loaded data_tuples into self.data_tuples") + # tokenizing raw dataset + self.tokenized_tuples = [ + (self.tokenizer.encode(q), id, self.tokenizer.encode(p)) + for q, id, p in tqdm(self.data_tuples, desc="tokenize") + ] + self._save_processed_dataset() + logger.debug("finished tokenization") + + def _load_data(self): + with open(self.korquad_path, "rt", encoding="utf8") as f: + data = json.load(f) + self.raw_json = data["data"] + logger.debug("data loaded into self.raw_json") + with open("title_passage_map.p", "rb") as f: + self.title_passage_map = pickle.load(f) + logger.debug("title passage mapping loaded into self.title_passage_map") + + def _get_cand_ids(self, title): + """๋ฏธ๋ฆฌ ๊ตฌ์ถ•ํ•œ ko-wiki ๋ฐ์ดํ„ฐ์—์„œ ํ•ด๋‹น title์— ๋งž๋Š” id๋“ค์„ ๊ฐ€์ง€๊ณ  ์˜ต๋‹ˆ๋‹ค.""" + refined_title = None + ret = self.title_passage_map.get(title, None) + if not ret: + refined_title = re.sub(r"\(.*\)", "", title).strip() + ret = self.title_passage_map.get(refined_title, None) + return ret, refined_title + + def _match_passage(self): + """๋ฏธ๋ฆฌ ๊ตฌ์ถ•ํ•œ ko-wiki ๋ฐ์ดํ„ฐ์™€ korQuad์˜ answer๋ฅผ ๋งค์นญํ•˜์—ฌ + (query, passage_id, passage)์˜ tuple์„ ๊ตฌ์„ฑํ•ฉ๋‹ˆ๋‹ค.""" + for item in tqdm(self.raw_json, desc="matching silver passages"): + title = item["title"].replace("_", " ") # _๋ฅผ ๊ณต๋ฐฑ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝ + para = item["paragraphs"] + cand_ids, refined_title = 
self._get_cand_ids(title) + if refined_title is not None and cand_ids: + logger.debug(f"refined the title and proceed : {title} -> {refined_title}") + if cand_ids is None: + logger.debug(f"No such title as {title} or {refined_title}. passing this title") + continue + target_file_p = get_passage_file(cand_ids) + if target_file_p is None: + logger.debug(f"No single target file for {title}, got passage ids {cand_ids}. passing this title") + continue + with open(target_file_p, "rb") as f: + target_file = pickle.load(f) + contexts = {cand_id: target_file[cand_id] for cand_id in cand_ids} + + for p in para: + qas = p["qas"] + for qa in qas: + answer = qa["answers"][0]["text"] # ์•„๋ฌด ์ •๋‹ต์ด๋‚˜ ๋ฝ‘์Šต๋‹ˆ๋‹ค. + answer_pos = qa["answers"][0]["answer_start"] + answer_clue_start = max(0, answer_pos - 5) + answer_clue_end = min(len(p["context"]), answer_pos + len(answer) + 5) + answer_clue = p["context"][ + answer_clue_start:answer_clue_end + ] # gold passage๋ฅผ ์ฐพ๊ธฐ ์œ„ํ•ด์„œ +-5์นธ์˜ ์ฃผ๋ณ€ text ํ™œ์šฉ + question = qa["question"] + answer_p = [ + (p_id, c) for p_id, c in contexts.items() if answer_clue in c + ] # answer๊ฐ€ ๋‹จ์ˆœํžˆ ๋“ค์–ด์žˆ๋Š” ๋ฌธ์„œ๋ฅผ ๋ฝ‘๋Š”๋‹ค. + if not answer_p: + answer_p = [(p_id, c) for p_id, c in contexts.items() if answer in c] + + self.data_tuples.extend([(question, p_id, c) for p_id, c in answer_p]) + + def _save_processed_dataset(self): + """์ „์ฒ˜๋ฆฌํ•œ ๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.""" + with open(self.korquad_processed_path, "wb") as f: + pickle.dump(self.tokenized_tuples, f) + logger.debug(f"successfully saved self.tokenized_tuples into {self.korquad_processed_path}") + + +# if __name__ == "__main__": +# ds = KorQuadDataset(korquad_path="./data/KorQuAD_v1.0_train.json") + +# loader = torch.utils.data.DataLoader( +# dataset=ds.dataset, +# batch_sampler=KorQuadSampler(ds.dataset, batch_size=16, drop_last=False), +# collate_fn=lambda x: korquad_collator(x, padding_value=ds.pad_token_id), +# num_workers=4, +# ) +# # logger.debug(len(_dataset.tokenized_tuples)) +# torch.manual_seed(123412341235) +# cnt = 0 +# for batch in tqdm(loader): +# #logger.debug(len(batch)) +# cnt += batch[0].size(0) +# # break +# logger.debug(cnt) diff --git a/code/rag/encoder.py b/code/rag/encoder.py new file mode 100644 index 0000000..8bb1593 --- /dev/null +++ b/code/rag/encoder.py @@ -0,0 +1,62 @@ +from copy import deepcopy +import logging +import os + +import torch +from transformers import BertModel + + +# ๋‘ ๊ฐœ์˜ BertModel์„ ์‚ฌ์šฉํ•˜์—ฌ passage์™€ query๋ฅผ encoding์„ ์‹คํ–‰ +# ํ† ํฌ๋‚˜์ด์ง• ํ›„์— ํ† ํฐ์„ ๊ณ ์ •๋œ ํฌ๊ธฐ์˜ ๋ฒกํ„ฐ๋กœ ๋ณ€๊ฒฝ + +# ๋กœ๊ทธ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ (์—†์œผ๋ฉด ์ƒˆ๋กœ ์ƒ์„ฑ) +os.makedirs("logs", exist_ok=True) + +# ๋กœ๊น… ์„ค์ •: ๋กœ๊ทธ๋ฅผ ํŒŒ์ผ๋กœ ์ €์žฅํ•˜๊ณ  ๋””๋ฒ„๊น… ๋ ˆ๋ฒจ๋กœ ์„ค์ • +logging.basicConfig( + filename="logs/log.log", + level=logging.DEBUG, + format="[%(asctime)s | %(funcName)s @ %(pathname)s] %(message)s", +) +logger = logging.getLogger() + + +# KobertBiEncoder ํด๋ž˜์Šค ์ •์˜ +class KobertBiEncoder(torch.nn.Module): + def __init__(self): + # torch.nn.Module์˜ ์ดˆ๊ธฐํ™” ํ•จ์ˆ˜ ํ˜ธ์ถœ + super(KobertBiEncoder, self).__init__() + # passage(๋ฌธ์„œ)๋ฅผ ์ฒ˜๋ฆฌํ•˜๋Š” BERT ๋ชจ๋ธ + self.passage_encoder = BertModel.from_pretrained("monologg/kobert", trust_remote_code=True) + # query(์งˆ์˜)๋ฅผ ์ฒ˜๋ฆฌํ•˜๋Š” BERT ๋ชจ๋ธ + self.query_encoder = BertModel.from_pretrained("monologg/kobert", trust_remote_code=True) + # BERT ๋ชจ๋ธ์˜ pooler output(์ž„๋ฒ ๋”ฉ ํฌ๊ธฐ) ์„ค์ • + self.emb_sz = self.passage_encoder.pooler.dense.out_features # get cls 
token dim + + def forward(self, x: torch.LongTensor, attn_mask: torch.LongTensor, type: str = "passage") -> torch.FloatTensor: + """passage ๋˜๋Š” query๋ฅผ BERT๋กœ ์ธ์ฝ”๋”ฉํ•ฉ๋‹ˆ๋‹ค.""" + # type์ด 'passage' ๋˜๋Š” 'query'์ธ์ง€ ํ™•์ธ + assert type in ( + "passage", + "query", + ), "type should be either 'passage' or 'query'" + # type์— ๋”ฐ๋ผ ๋‹ค๋ฅธ ์ธ์ฝ”๋” ์‚ฌ์šฉ + if type == "passage": + # ๋ฌธ์„œ(passage) ์ธ์ฝ”๋”ฉ + return self.passage_encoder(input_ids=x, attention_mask=attn_mask).pooler_output + else: + # ์งˆ์˜(query) ์ธ์ฝ”๋”ฉ + return self.query_encoder(input_ids=x, attention_mask=attn_mask).pooler_output + + def checkpoint(self, model_ckpt_path): + # ๋ชจ๋ธ์˜ ๊ฐ€์ค‘์น˜๋ฅผ ํŒŒ์ผ๋กœ ์ €์žฅ + torch.save(deepcopy(self.state_dict()), model_ckpt_path) + logger.debug(f"model self.state_dict saved to {model_ckpt_path}") + + def load(self, model_ckpt_path): + # ์ €์žฅ๋œ ๊ฐ€์ค‘์น˜๋ฅผ ํŒŒ์ผ์—์„œ ๋กœ๋“œ + with open(model_ckpt_path, "rb") as f: + state_dict = torch.load(f) + # ๋ชจ๋ธ์— ๋กœ๋“œ๋œ ๊ฐ€์ค‘์น˜ ์ ์šฉ + self.load_state_dict(state_dict) + logger.debug(f"model self.state_dict loaded from {model_ckpt_path}") diff --git a/code/rag/index_runner.py b/code/rag/index_runner.py new file mode 100644 index 0000000..6a28f73 --- /dev/null +++ b/code/rag/index_runner.py @@ -0,0 +1,188 @@ +from glob import glob +import logging +import math +import os +import typing +from typing import List, Tuple + +from chunk_data import DataChunk +from encoder import KobertBiEncoder +import indexers +import torch +from torch import tensor as T +from torch.nn.utils.rnn import pad_sequence +from tqdm import tqdm +import transformers + + +# from utils import get_wiki_filepath, wiki_worker_init +transformers.logging.set_verbosity_error() # ํ† ํฌ๋‚˜์ด์ € ์ดˆ๊ธฐํ™” ๊ด€๋ จ warning suppress + + +def get_wiki_filepath(data_dir): + return glob(f"{data_dir}/*/wiki_*") + + +def wiki_worker_init(worker_id): + worker_info = torch.utils.data.get_worker_info() + dataset = worker_info.dataset + # print(dataset) + # dataset = + overall_start = dataset.start + overall_end = dataset.end + split_size = int(math.ceil((overall_end - overall_start) / float(worker_info.num_workers))) + worker_id = worker_info.id + # end_idx = min((worker_id+1) * split_size, len(dataset.data)) + dataset.start = overall_start + worker_id * split_size + dataset.end = min(dataset.start + split_size, overall_end) # index error ๋ฐฉ์ง€ + + +def get_passage_file(p_id_list: typing.List[int]) -> str: + """passage id๋ฅผ ๋ฐ›์•„์„œ ํ•ด๋‹น๋˜๋Š” ํŒŒ์ผ ์ด๋ฆ„์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + target_file = None + p_id_max = max(p_id_list) + p_id_min = min(p_id_list) + for f in glob("processed_passages/*.p"): + s, e = f.split("/")[1].split(".")[0].split("-") + s, e = int(s), int(e) + if p_id_min >= s and p_id_max <= e: + target_file = f + return target_file + + +# logger basic config +os.makedirs("logs", exist_ok=True) +logging.basicConfig( + filename="logs/log.log", + level=logging.DEBUG, + format="[%(asctime)s | %(funcName)s @ %(pathname)s] %(message)s", +) +logger = logging.getLogger() + + +def wiki_collator(batch: List, padding_value: int) -> Tuple[torch.Tensor]: + """passage๋ฅผ batch๋กœ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + batch_p = pad_sequence([T(e) for e in batch], batch_first=True, padding_value=padding_value) + batch_p_attn_mask = (batch_p != padding_value).long() + return (batch_p, batch_p_attn_mask) + + +class WikiArticleStream(torch.utils.data.IterableDataset): + """ + Indexing์„ ์œ„ํ•ด random access๊ฐ€ ํ•„์š”ํ•˜์ง€ ์•Š๊ณ  large corpus๋ฅผ ๋‹ค๋ฃจ๊ธฐ ์œ„ํ•ด stream 
dataset์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. + """ + + def __init__(self, wiki_path, chunker): + # self.chunk_size = chunk_size + super(WikiArticleStream, self).__init__() + self.chunker = chunker + self.pad_token_id = self.chunker.tokenizer.get_vocab()["[PAD]"] + self.wiki_path = wiki_path + self.max_length = 168 # maximum length for kowiki passage + # self.start = 0 + # self.end = len(self.passages) + + def __iter__(self): + # max_length๊ฐ€ ๋˜๋„๋ก padding ์ˆ˜ํ–‰ + + _, passages = self.chunker.chunk(self.wiki_path) + logger.debug(f"chunked file {self.wiki_path}") + for passage in passages: + # if len(passage) > self.max_length: + # continue # ์ง€์ •๋œ max_length๋ณด๋‹ค ๊ธด passage์˜ ๊ฒฝ์šฐ pass + # padded_passage = T( + # passage + # + [self.pad_token_id for _ in range(self.max_length - len(passage))] + # ) + yield passage + + +class IndexRunner: + """์ฝ”ํผ์Šค์— ๋Œ€ํ•œ ์ธ๋ฑ์‹ฑ์„ ์ˆ˜ํ–‰ํ•˜๋Š” ๋ฉ”์ธํด๋ž˜์Šค์ž…๋‹ˆ๋‹ค. + passage encoder์™€ data loader, FAISS indexer๋กœ ๊ตฌ์„ฑ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.""" + + def __init__( + self, + data_dir: str, + model_ckpt_path: str, + indexer_type: str = "DenseFlatIndexer", + chunk_size: int = 100, + batch_size: int = 64, + buffer_size: int = 50000, + index_output: str = "", + device: str = "cuda:0", + ): + """ + data_dir : ์ธ๋ฑ์‹ฑํ•  ํ•œ๊ตญ์–ด wiki ๋ฐ์ดํ„ฐ๊ฐ€ ๋“ค์–ด์žˆ๋Š” ๋””๋ ‰ํ† ๋ฆฌ์ž…๋‹ˆ๋‹ค. ํ•˜์œ„์— AA, AB์™€ ๊ฐ™์€ ๋””๋ ‰ํ† ๋ฆฌ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค. + indexer_type : ์‚ฌ์šฉํ•  FAISS indexer ์ข…๋ฅ˜๋กœ + DPR ๋ฆฌํฌ์— ์žˆ๋Š” ๋Œ€๋กœ Flat, HNSWFlat, HNSWSQ ์„ธ ์ข…๋ฅ˜ ์ค‘์— ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. + chunk_size : indexing๊ณผ searching์˜ ๋‹จ์œ„๊ฐ€ ๋˜๋Š” passage์˜ ๊ธธ์ด์ž…๋‹ˆ๋‹ค. + DPR ๋…ผ๋ฌธ์—์„œ๋Š” 100๊ฐœ token ๊ธธ์ด + title๋กœ ํ•˜๋‚˜์˜ passage๋ฅผ ์ •์˜ํ•˜์˜€์Šต๋‹ˆ๋‹ค. + """ + if "=" in data_dir: + self.data_dir, self.to_this_page = data_dir.split("=") + self.to_this_page = int(self.to_this_page) + self.wiki_files = get_wiki_filepath(self.data_dir) + else: + self.data_dir = data_dir + self.wiki_files = get_wiki_filepath(self.data_dir) + self.to_this_page = len(self.wiki_files) + + self.device = torch.device(device) + self.encoder = KobertBiEncoder().to(self.device) + self.encoder.load(model_ckpt_path) # loading model + self.indexer = getattr(indexers, indexer_type)() + self.chunk_size = chunk_size + self.batch_size = batch_size + self.buffer_size = buffer_size + self.loader = self.get_loader( + self.wiki_files[: self.to_this_page], + chunk_size, + batch_size, + worker_init_fn=None, + ) + self.indexer.init_index(self.encoder.emb_sz) + self.index_output = index_output if index_output else indexer_type + + @staticmethod + def get_loader(wiki_files, chunk_size, batch_size, worker_init_fn=None): + chunker = DataChunk(chunk_size=chunk_size) + ds = torch.utils.data.ChainDataset(tuple(WikiArticleStream(path, chunker) for path in wiki_files)) + loader = torch.utils.data.DataLoader( + ds, + batch_size=batch_size, + collate_fn=lambda x: wiki_collator(x, padding_value=chunker.tokenizer.get_vocab()["[PAD]"]), + num_workers=1, + worker_init_fn=worker_init_fn, + ) # TODO : chain dataset์—์„œ worker 1 ์ดˆ๊ณผ์ธ ๊ฒฝ์šฐ ํ™•์ธํ•˜๊ธฐ + return loader + + def run(self): + _to_index = [] + cur = 0 + for batch in tqdm(self.loader, desc="indexing"): + p, p_mask = batch + p, p_mask = p.to(self.device), p_mask.to(self.device) + with torch.no_grad(): + p_emb = self.encoder(p, p_mask, "passage") + _to_index += [(cur + i, _emb) for i, _emb in enumerate(p_emb.cpu().numpy())] + cur += p_emb.size(0) + if len(_to_index) > self.buffer_size - self.batch_size: + logger.debug(f"perform 
indexing... {len(_to_index)} added") + self.indexer.index_data(_to_index) + _to_index = [] + if _to_index: + logger.debug(f"perform indexing... {len(_to_index)} added") + self.indexer.index_data(_to_index) + _to_index = [] + os.makedirs(self.index_output, exist_ok=True) + self.indexer.serialize(self.index_output) + + +# if __name__ == "__main__": +# IndexRunner( +# data_dir="./dataset/text", +# model_ckpt_path="./my_model.pt", +# index_output="2050iter_flat", +# ).run() +# # test_loader() diff --git a/code/rag/indexers.py b/code/rag/indexers.py new file mode 100644 index 0000000..e503a28 --- /dev/null +++ b/code/rag/indexers.py @@ -0,0 +1,233 @@ +# Credit : facebookresearch/DPR + +""" +FAISS-based index components for dense retriever +""" + +import logging +import os +import pickle +from typing import List, Tuple + +import faiss +import numpy as np + + +logger = logging.getLogger() + + +class DenseIndexer(object): + def __init__(self, buffer_size: int = 50000): + """ + ์ธ๋ฑ์„œ๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค. + - buffer_size: ํ•œ ๋ฒˆ์— ์ฒ˜๋ฆฌํ•  ๋ฒกํ„ฐ์˜ ์ตœ๋Œ€ ๊ฐœ์ˆ˜ (๊ธฐ๋ณธ๊ฐ’์€ 50000). + - index_id_to_db_id: FAISS์—์„œ ์ธ๋ฑ์Šค ID์™€ ์‹ค์ œ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค ID๋ฅผ ๋งคํ•‘ํ•˜๊ธฐ ์œ„ํ•œ ๋ฆฌ์ŠคํŠธ. + - index: FAISS ์ธ๋ฑ์Šค ๊ฐ์ฒด. + """ + self.buffer_size = buffer_size + self.index_id_to_db_id = [] # ์ธ๋ฑ์Šค ID๋ฅผ ์‹ค์ œ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค ID์™€ ๋งคํ•‘ํ•˜๋Š” ๋ฆฌ์ŠคํŠธ. + self.index = None # FAISS ์ธ๋ฑ์Šค ๊ฐ์ฒด. + + def init_index(self, vector_sz: int): + raise NotImplementedError + + def index_data(self, data: List[Tuple[object, np.array]]): + raise NotImplementedError + + def get_index_name(self): + raise NotImplementedError + + def search_knn(self, query_vectors: np.array, top_docs: int) -> List[Tuple[List[object], List[float]]]: + raise NotImplementedError + + def serialize(self, file: str): + logger.info("Serializing index to %s", file) + + if os.path.isdir(file): + index_file = os.path.join(file, "index.dpr") + meta_file = os.path.join(file, "index_meta.dpr") + else: + index_file = file + ".index.dpr" + meta_file = file + ".index_meta.dpr" + + faiss.write_index(self.index, index_file) # FAISS ์ธ๋ฑ์Šค๋ฅผ ํŒŒ์ผ์— ์ €์žฅ. + with open(meta_file, mode="wb") as f: + pickle.dump(self.index_id_to_db_id, f) # ID ๋งคํ•‘ ์ •๋ณด๋ฅผ ์ €์žฅ. + + def get_files(self, path: str): + if os.path.isdir(path): + index_file = os.path.join(path, "index.dpr") # FAISS ์ธ๋ฑ์Šค๋ฅผ ํŒŒ์ผ์—์„œ ๋กœ๋“œ. 
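The comments in `DenseIndexer` above explain why `index_id_to_db_id` is kept alongside the FAISS index and serialized with it: FAISS only returns sequential internal positions, so an external-id mapping is needed to get back to passage ids. A small self-contained sketch of that pattern with toy vectors and hypothetical passage ids:

```python
# Toy sketch (not from this repo) of the id-mapping pattern DenseFlatIndexer implements.
import faiss
import numpy as np

emb_sz = 4
index = faiss.IndexFlatIP(emb_sz)               # inner-product flat index, as in DenseFlatIndexer
index_id_to_db_id = []

passage_ids = [101, 205, 999]                   # hypothetical external (db) passage ids
vectors = np.random.rand(3, emb_sz).astype("float32")
index.add(vectors)                              # FAISS assigns internal ids 0, 1, 2
index_id_to_db_id.extend(passage_ids)           # keep the parallel mapping

query = np.random.rand(1, emb_sz).astype("float32")
scores, internal_ids = index.search(query, 2)
db_ids = [index_id_to_db_id[i] for i in internal_ids[0]]  # convert back to passage ids
print(db_ids, scores[0])
```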
+ meta_file = os.path.join(path, "index_meta.dpr") + else: + index_file = path + ".{}.dpr".format(self.get_index_name()) + meta_file = path + ".{}_meta.dpr".format(self.get_index_name()) + return index_file, meta_file + + def index_exists(self, path: str): + index_file, meta_file = self.get_files(path) + return os.path.isfile(index_file) and os.path.isfile(meta_file) + + def deserialize(self, path: str): + logger.info("Loading index from %s", path) + index_file, meta_file = self.get_files(path) + + self.index = faiss.read_index(index_file) + logger.info("Loaded index of type %s and size %d", type(self.index), self.index.ntotal) + + with open(meta_file, "rb") as reader: + self.index_id_to_db_id = pickle.load(reader) + assert ( + len(self.index_id_to_db_id) == self.index.ntotal + ), "Deserialized index_id_to_db_id should match faiss index size" + + def _update_id_mapping(self, db_ids: List) -> int: + self.index_id_to_db_id.extend(db_ids) + return len(self.index_id_to_db_id) + + +class DenseFlatIndexer(DenseIndexer): + def __init__(self, buffer_size: int = 50000): + super(DenseFlatIndexer, self).__init__(buffer_size=buffer_size) + + def init_index(self, vector_sz: int): + self.index = faiss.IndexFlatIP(vector_sz) # Inner Product๋ฅผ ์‚ฌ์šฉํ•˜๋Š” ๊ธฐ๋ณธ ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™”. + + def index_data(self, data: List[Tuple[object, np.array]]): + n = len(data) + # indexing in batches is beneficial for many faiss index types + for i in range(0, n, self.buffer_size): + db_ids = [t[0] for t in data[i : i + self.buffer_size]] + vectors = [np.reshape(t[1], (1, -1)) for t in data[i : i + self.buffer_size]] + vectors = np.concatenate(vectors, axis=0) + total_data = self._update_id_mapping(db_ids) + self.index.add(vectors) + logger.info("data indexed %d", total_data) + + indexed_cnt = len(self.index_id_to_db_id) + logger.info("Total data indexed %d", indexed_cnt) + + def search_knn(self, query_vectors: np.array, top_docs: int) -> List[Tuple[List[object], List[float]]]: + scores, indexes = self.index.search(query_vectors, top_docs) # ์ฟผ๋ฆฌ ๋ฒกํ„ฐ์™€ ๊ฐ€์žฅ ์œ ์‚ฌํ•œ ๋ฒกํ„ฐ ๊ฒ€์ƒ‰. + # convert to external ids + db_ids = [[self.index_id_to_db_id[i] for i in query_top_idxs] for query_top_idxs in indexes] + result = [(db_ids[i], scores[i]) for i in range(len(db_ids))] + return result + + def get_index_name(self): + return "flat_index" + + +class DenseHNSWFlatIndexer(DenseIndexer): + """ + Efficient index for retrieval. Note: default settings are for hugh accuracy but also high RAM usage + """ + + def __init__( + self, + buffer_size: int = 1e9, + store_n: int = 512, + ef_search: int = 128, + ef_construction: int = 200, + ): + super(DenseHNSWFlatIndexer, self).__init__(buffer_size=buffer_size) + self.store_n = store_n + self.ef_search = ef_search + self.ef_construction = ef_construction + self.phi = 0 + + def init_index(self, vector_sz: int): + # IndexHNSWFlat supports L2 similarity only + # so we have to apply DOT -> L2 similairy space conversion with the help of an extra dimension + index = faiss.IndexHNSWFlat(vector_sz + 1, self.store_n) # L2 ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ˜ HNSW ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™”. + index.hnsw.efSearch = self.ef_search + index.hnsw.efConstruction = self.ef_construction + self.index = index + + def index_data(self, data: List[Tuple[object, np.array]]): + n = len(data) + + # max norm is required before putting all vectors in the index to convert inner product similarity to L2 + if self.phi > 0: + raise RuntimeError( + "DPR HNSWF index needs to index all data at once," "results will be unpredictable otherwise." 
+ ) + phi = 0 + for i, item in enumerate(data): + id, doc_vector = item[0:2] + norms = (doc_vector**2).sum() + phi = max(phi, norms) + logger.info("HNSWF DotProduct -> L2 space phi={}".format(phi)) + self.phi = phi + + # indexing in batches is beneficial for many faiss index types + bs = int(self.buffer_size) + for i in range(0, n, bs): + db_ids = [t[0] for t in data[i : i + bs]] + vectors = [np.reshape(t[1], (1, -1)) for t in data[i : i + bs]] + + norms = [(doc_vector**2).sum() for doc_vector in vectors] + aux_dims = [np.sqrt(phi - norm) for norm in norms] + hnsw_vectors = [np.hstack((doc_vector, aux_dims[i].reshape(-1, 1))) for i, doc_vector in enumerate(vectors)] + hnsw_vectors = np.concatenate(hnsw_vectors, axis=0) + self.train(hnsw_vectors) + + self._update_id_mapping(db_ids) + self.index.add(hnsw_vectors) + logger.info("data indexed %d", len(self.index_id_to_db_id)) + indexed_cnt = len(self.index_id_to_db_id) + logger.info("Total data indexed %d", indexed_cnt) + + def train(self, vectors: np.array): + pass + + def search_knn(self, query_vectors: np.array, top_docs: int) -> List[Tuple[List[object], List[float]]]: + aux_dim = np.zeros(len(query_vectors), dtype="float32") + query_nhsw_vectors = np.hstack((query_vectors, aux_dim.reshape(-1, 1))) + logger.info("query_hnsw_vectors %s", query_nhsw_vectors.shape) + scores, indexes = self.index.search(query_nhsw_vectors, top_docs) + # convert to external ids + db_ids = [[self.index_id_to_db_id[i] for i in query_top_idxs] for query_top_idxs in indexes] + result = [(db_ids[i], scores[i]) for i in range(len(db_ids))] + return result + + def deserialize(self, file: str): + super(DenseHNSWFlatIndexer, self).deserialize(file) + # to trigger exception on subsequent indexing + self.phi = 1 + + def get_index_name(self): + return "hnsw_index" + + +class DenseHNSWSQIndexer(DenseHNSWFlatIndexer): + """ + Efficient index for retrieval. 
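The `index_data` method above applies the DPR trick for running inner-product search on an L2-only HNSW index: with phi set to the maximum squared document norm, each document d is augmented to [d, sqrt(phi - ||d||^2)] and each query q to [q, 0], so that ||[q,0] - [d,aux]||^2 = ||q||^2 + phi - 2(q . d). Since ||q||^2 and phi are constant per query, ranking by L2 distance over the augmented vectors matches ranking by inner product over the originals. A numeric sketch with toy vectors (not from this repo):

```python
# Verifies the DOT -> L2 conversion used by DenseHNSWFlatIndexer on toy data.
import numpy as np

docs = np.array([[0.2, 0.9], [0.8, 0.1], [0.5, 0.5]], dtype="float32")
query = np.array([0.7, 0.3], dtype="float32")

norms = (docs ** 2).sum(axis=1)
phi = norms.max()
aux = np.sqrt(phi - norms)                      # auxiliary dimension per document
docs_aug = np.hstack([docs, aux[:, None]])
query_aug = np.hstack([query, [0.0]])

ip_rank = np.argsort(-docs @ query)             # rank by inner product (descending)
l2_rank = np.argsort(((docs_aug - query_aug) ** 2).sum(axis=1))  # rank by L2 (ascending)
assert (ip_rank == l2_rank).all()               # both orderings agree
```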
Note: default settings are for hugh accuracy but also high RAM usage + """ + + def __init__( + self, + buffer_size: int = 1e10, + store_n: int = 128, + ef_search: int = 128, + ef_construction: int = 200, + ): + super(DenseHNSWSQIndexer, self).__init__( + buffer_size=buffer_size, + store_n=store_n, + ef_search=ef_search, + ef_construction=ef_construction, + ) + + def init_index(self, vector_sz: int): + # IndexHNSWFlat supports L2 similarity only + # so we have to apply DOT -> L2 similairy space conversion with the help of an extra dimension + index = faiss.IndexHNSWSQ(vector_sz + 1, faiss.ScalarQuantizer.QT_8bit, self.store_n) + index.hnsw.efSearch = self.ef_search + index.hnsw.efConstruction = self.ef_construction + self.index = index + + def train(self, vectors: np.array): + self.index.train(vectors) + + def get_index_name(self): + return "hnswsq_index" diff --git a/code/rag/prepare_dense.py b/code/rag/prepare_dense.py new file mode 100644 index 0000000..3f81dfe --- /dev/null +++ b/code/rag/prepare_dense.py @@ -0,0 +1,135 @@ +import logging +import os + +from chunk_data import save_orig_passage, save_title_index_map +from dpr_data import KorQuadDataset +from encoder import KobertBiEncoder + +# ์™ธ๋ถ€ ์Šคํฌ๋ฆฝํŠธ์—์„œ IndexRunner ์ž„ํฌํŠธ +from index_runner import IndexRunner +from indexers import DenseFlatIndexer # index ๊ด€๋ จ +from retriever import KorDPRRetriever # retriever.py์—์„œ ๊ฐ€์ ธ์˜ค๊ธฐ +import torch +from trainer import Trainer +import transformers + + +transformers.logging.set_verbosity_error() # ํ† ํฌ๋‚˜์ด์ € ์ดˆ๊ธฐํ™” ๊ด€๋ จ ๊ฒฝ๊ณ  ์–ต์ œ + +# ๋กœ๊น… ์„ค์ • +os.makedirs("logs", exist_ok=True) +logging.basicConfig( + filename="logs/log.log", + level=logging.DEBUG, + format="[%(asctime)s | %(funcName)s @ %(pathname)s] %(message)s", +) +logger = logging.getLogger() + + +# ๋ชจ๋ธ ์กด์žฌ ์—ฌ๋ถ€ ํ™•์ธ ํ•จ์ˆ˜ +def check_if_model_exists(model_path: str): + """๋ชจ๋ธ ์ฒดํฌํฌ์ธํŠธ๊ฐ€ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธํ•˜๋Š” ํ•จ์ˆ˜""" + return os.path.exists(model_path) + + +# ์œ„ํ‚ค ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜ +def process_wiki_data(): + """ + ์œ„ํ‚ค ๋ฐ์ดํ„ฐ๋ฅผ ์ฒ˜๋ฆฌํ•˜๊ณ  ํ•„์š”ํ•œ ํ”ผํด ํŒŒ์ผ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. + """ + processed_passages_path = "processed_passages" + + if not os.path.exists(processed_passages_path): + logger.debug(f"'{processed_passages_path}' ํด๋”๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค.") + + # 1. chunk ๋ฐ์ดํ„ฐ๋ฅผ ํ”ผํด ํŒŒ์ผ๋กœ ๋ณ€ํ™˜ํ•˜์—ฌ processed_passages ํด๋”์— ์ €์žฅ (์•ฝ 10๋ถ„ ์†Œ์š”) + save_orig_passage() + + # 2. ์ œ๋ชฉ๊ณผ ์ธ๋ฑ์Šค ๋งคํ•‘ ์ €์žฅ + save_title_index_map() + + logger.debug("๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + else: + logger.debug(f"'{processed_passages_path}' ํด๋”๊ฐ€ ์ด๋ฏธ ์กด์žฌํ•ฉ๋‹ˆ๋‹ค. ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ๋ฅผ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.") + + +if __name__ == "__main__": + # ์œ„ํ‚ค ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ + # processed_passage ํด๋” ๋‚ด์— ํ”ผํดํ™”๋œ ๋ฐ์ดํ„ฐ๊ฐ€ ์ €์žฅ๋ฉ๋‹ˆ๋‹ค. 10๋ถ„ ์†Œ์š” + process_wiki_data() + + # ๋ชจ๋ธ ๊ฒฝ๋กœ ์„ค์ • + model_path = "./output/my_model.pt" + + # korquad ๋ฐ์ดํ„ฐ๋กœ ๋ชจ๋ธ์„ ํ•™์Šต์‹œ์ผœ์ค๋‹ˆ๋‹ค. + # ๋ชจ๋ธ์ด ์ด๋ฏธ ์กด์žฌํ•˜๋ฉด ํ•™์Šต์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค + if check_if_model_exists(model_path): + logger.debug(f"์ด๋ฏธ ํ•™์Šต๋œ ๋ชจ๋ธ์ด {model_path}์— ์กด์žฌํ•ฉ๋‹ˆ๋‹ค. ํ•™์Šต์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.") + else: + logger.debug("ํ•™์Šต๋œ ๋ชจ๋ธ์ด ์—†์Šต๋‹ˆ๋‹ค. 
ํ•™์Šต์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค.") + + # ๋ชจ๋ธ๊ณผ ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ + device = torch.device("cuda:0") + model = KobertBiEncoder() + train_dataset = KorQuadDataset("./data/KorQuAD_v1.0_train.json") + valid_dataset = KorQuadDataset("./data/KorQuAD_v1.0_dev.json") + + # Trainer ๊ฐ์ฒด ์ƒ์„ฑ ๋ฐ ํ•™์Šต ์‹œ์ž‘ + my_trainer = Trainer( + model=model, + device=device, + train_dataset=train_dataset, + valid_dataset=valid_dataset, + num_epoch=10, + batch_size=128 - 32, + lr=1e-5, + betas=(0.9, 0.99), + num_warmup_steps=1000, + num_training_steps=100000, + valid_every=30, + best_val_ckpt_path=model_path, + ) + + # ํ•™์Šต ์ƒํƒœ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ + # my_trainer.load_training_state() + + # ํ•™์Šต ์‹œ์ž‘ + my_trainer.fit() + + # Indexing์„ ์‹คํ–‰ํ•˜๋Š” ์ฝ”๋“œ (IndexRunner ์‚ฌ์šฉ) + index_path = "./2050iter_flat" # ์ธ๋ฑ์Šค ํŒŒ์ผ ๊ฒฝ๋กœ ์„ค์ • + + # ์ธ๋ฑ์Šค๊ฐ€ ์ด๋ฏธ ์กด์žฌํ•˜๋ฉด ์ธ๋ฑ์‹ฑ์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค + if not os.path.exists(index_path): + logger.info("์ธ๋ฑ์Šค๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ์ธ๋ฑ์‹ฑ์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค.") + index_runner = IndexRunner( + data_dir="./text", + model_ckpt_path="./output/my_model.pt", + index_output=index_path, + ) + index_runner.run() + else: + logger.info(f"์ธ๋ฑ์Šค ํŒŒ์ผ '{index_path}'๊ฐ€ ์ด๋ฏธ ์กด์žฌํ•ฉ๋‹ˆ๋‹ค. ์ธ๋ฑ์‹ฑ์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.") + + # index ํŒŒ์ผ ๋กœ๋”ฉ + index = DenseFlatIndexer() + index.deserialize(path=index_path) # ์ด๋ฏธ ์ƒ์„ฑ๋œ ์ธ๋ฑ์Šค ํŒŒ์ผ์„ ๋กœ๋“œ + + # retriever.py๋กœ๋ถ€ํ„ฐ KorDPRRetriever ๊ฐ์ฒด๋ฅผ ์ƒ์„ฑํ•˜์—ฌ ์ฟผ๋ฆฌ ์‹คํ–‰ + model = KobertBiEncoder() + model.load("./output/my_model.pt") + model.eval() + + valid_dataset = KorQuadDataset("./data/KorQuAD_v1.0_dev.json") + retriever = KorDPRRetriever(model=model, valid_dataset=valid_dataset, index=index) + + # 'query'์™€ 'k' ๊ฐ’์„ ์„ค์ •ํ•ฉ๋‹ˆ๋‹ค. + query = "์ค‘๊ตญ์˜ ์ฒœ์•ˆ๋ฌธ ์‚ฌํƒœ๊ฐ€ ์ผ์–ด๋‚œ ๋…„๋„๋Š”?" + k = 10 # ์ƒ์œ„ 10๊ฐœ ์œ ์‚ฌํ•œ passage๋ฅผ ์ถœ๋ ฅ + + # retrieve ๋ฉ”์„œ๋“œ๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๊ฐ€์žฅ ์œ ์‚ฌ๋„๊ฐ€ ๋†’์€ k๊ฐœ์˜ passage๋ฅผ ์ฐพ์Šต๋‹ˆ๋‹ค. + passages = retriever.retrieve(query=query, k=k) + + # ์ถœ๋ ฅ: ์œ ์‚ฌ๋„ ๋†’์€ passage์™€ ๊ทธ ์œ ์‚ฌ๋„๋ฅผ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค. 
+ for idx, (passage, sim) in enumerate(passages): + logger.debug(f"Rank {idx + 1} | Similarity: {sim:.4f} | Passage: {passage}") diff --git a/code/rag/reranker.py b/code/rag/reranker.py new file mode 100644 index 0000000..2716a23 --- /dev/null +++ b/code/rag/reranker.py @@ -0,0 +1,105 @@ +import gc +import os +from typing import Dict, List + +from dotenv import load_dotenv +from loguru import logger +import numpy as np +import torch +from tqdm import tqdm +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +from .retriever_elastic import ElasticsearchRetriever + + +class Reranker: + def __init__( + self, + model_path: str = "Dongjin-kr/ko-reranker", + batch_size: int = 128, + max_length: int = 512, + ): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = AutoModelForSequenceClassification.from_pretrained(model_path) + self.model.to(self.device) + self.model.eval() + self.tokenizer = AutoTokenizer.from_pretrained(model_path) + self.batch_size = batch_size + self.max_length = max_length + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # GPU ๋ฉ”๋ชจ๋ฆฌ ์ •๋ฆฌ + del self.model + del self.tokenizer + torch.cuda.empty_cache() + gc.collect() + + def _exp_normalize(self, x): + y = np.exp(x - x.max(axis=1, keepdims=True)) + return y / y.sum(axis=1, keepdims=True) + + def rerank(self, queries: List[str], retrieve_results: List[List[Dict]], topk: int = 5) -> List[List[Dict]]: + # ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ์ค€๋น„ + all_pairs = [] + for query, results in zip(queries, retrieve_results): + for result in results: + all_pairs.append([query, result["text"]]) + + # ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ + all_scores = [] + for i in tqdm(range(0, len(all_pairs), self.batch_size), desc="Reranking"): + batch_pairs = all_pairs[i : i + self.batch_size] + + with torch.no_grad(): + inputs = self.tokenizer( + batch_pairs, + padding=True, + truncation=True, + return_tensors="pt", + max_length=self.max_length, + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + batch_scores = self.model(**inputs, return_dict=True).logits.view(-1).float().cpu().numpy() + all_scores.extend(batch_scores) + + all_scores = np.array(all_scores) + + reranked_results = [] + start = 0 + for results in retrieve_results: + end = start + len(results) + scores = all_scores[start:end] + scores = self._exp_normalize(scores.reshape(1, -1)).flatten() + top_indices = np.argsort(scores)[-topk:][::-1] + + reranked_batch = [{"text": results[i]["text"], "score": float(scores[i])} for i in top_indices] + reranked_results.append(reranked_batch) + start = end + + return reranked_results + + +if __name__ == "__main__": + config_folder = os.path.join(os.path.dirname(__file__), "..", "..", "config") + load_dotenv(os.path.join(config_folder, ".env")) + + reranker = Reranker( + model_path="Dongjin-kr/ko-reranker", + batch_size=128, + max_length=512, + ) + retriever = ElasticsearchRetriever( + index_name="two-wiki-index", + ) + + query = "์„ ๋น„๋“ค ์ˆ˜๋งŒ ๋ช…์ด ๋Œ€๊ถ ์•ž์— ๋ชจ์—ฌ ๋งŒ ๋™๋ฌ˜์™€ ์„œ์›์„ ๋‹ค์‹œ ์„ค๋ฆฝํ•  ๊ฒƒ์„ ์ฒญํ•˜๋‹ˆ, (๊ฐ€)์ด/๊ฐ€ ํฌ๊ฒŒ ๋…ธํ•˜์—ฌ ํ•œ์„ฑ๋ถ€์˜ ์กฐ๋ก€(็š‚้šท)์™€ ๋ณ‘์กธ๋กœ ํ•˜์—ฌ ๊ธˆ ํ•œ ๊ฐ• ๋ฐ–์œผ๋กœ ๋ชฐ์•„๋‚ด๊ฒŒ ํ•˜๊ณ  ๋“œ๋””์–ด ์ฒœ์—ฌ ๊ณณ์˜ ์„œ์›์„ ์ฒ ํํ•˜๊ณ  ๊ทธ ํ† ์ง€๋ฅผ ๋ชฐ์ˆ˜ํ•˜์—ฌ ๊ด€์— ์†ํ•˜๊ฒŒ ํ•˜์˜€๋‹ค.๏ผ๋Œ€ํ•œ๊ณ„๋…„์‚ฌ" # noqa: E501 + retriever_result = retriever.retrieve(query, top_k=5) + logger.debug("Elasticsearch Retriever") + logger.debug(f"{retriever_result[:5]}") + + reranked_results = 
reranker.rerank(queries=[query], retrieve_results=[retriever_result], topk=3) + logger.debug("Reranker") + logger.debug(f"{reranked_results}") diff --git a/code/rag/retriever.py b/code/rag/retriever.py new file mode 100644 index 0000000..b32d5b6 --- /dev/null +++ b/code/rag/retriever.py @@ -0,0 +1,187 @@ +from collections import defaultdict +from glob import glob +import math +import os +import pickle +import typing + +# from utils import get_passage_file +from typing import List + +from dpr_data import KorQuadDataset, KorQuadSampler, korquad_collator +from encoder import KobertBiEncoder +from indexers import DenseFlatIndexer +from loguru import logger +import torch +from torch import tensor as T +from tqdm import tqdm + + +def get_wiki_filepath(data_dir): + return glob(f"{data_dir}/*/wiki_*") + + +def wiki_worker_init(worker_id): + worker_info = torch.utils.data.get_worker_info() + dataset = worker_info.dataset + # logger.debug(dataset) + # dataset = + overall_start = dataset.start + overall_end = dataset.end + split_size = int(math.ceil((overall_end - overall_start) / float(worker_info.num_workers))) + worker_id = worker_info.id + # end_idx = min((worker_id+1) * split_size, len(dataset.data)) + dataset.start = overall_start + worker_id * split_size + dataset.end = min(dataset.start + split_size, overall_end) # index error ๋ฐฉ์ง€ + + +def get_passage_file(p_id_list: typing.List[int]) -> str: + """passage id๋ฅผ ๋ฐ›์•„์„œ ํ•ด๋‹น๋˜๋Š” ํŒŒ์ผ ์ด๋ฆ„์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + target_file = None + p_id_max = max(p_id_list) + p_id_min = min(p_id_list) + + # ํ˜„์žฌ ํŒŒ์ผ์˜ ๊ฒฝ๋กœ๋ฅผ ๊ธฐ์ค€์œผ๋กœ 'processed_passages' ๊ฒฝ๋กœ ์„ค์ • + current_dir = os.path.dirname(os.path.abspath(__file__)) + passages_dir = os.path.join(current_dir, "processed_passages") + + # 'processed_passages' ๋””๋ ‰ํ„ฐ๋ฆฌ์—์„œ ํŒŒ์ผ์„ ์ฐพ์Œ + for f in glob(f"{passages_dir}/*.p"): + file_name = os.path.basename(f) + s, e = file_name.split(".")[0].split("-") + s, e = int(s), int(e) + + if p_id_min >= s and p_id_max <= e: + target_file = f + break + + if target_file is None: + logger.debug(f"No file found for passage IDs: {p_id_list}") + + return target_file + + +class KorDPRRetriever: + def __init__(self, model, valid_dataset, index, val_batch_size: int = 64, device="cuda:0"): + # ๋ชจ๋ธ์ด ๊ฒฝ๋กœ๋กœ ์ฃผ์–ด์ง„ ๊ฒฝ์šฐ ๋กœ๋“œ + if isinstance(model, str): + self.model = KobertBiEncoder() + self.model.load(model) + else: + self.model = model + + # ๋ชจ๋ธ์„ ํ•ด๋‹น ๋””๋ฐ”์ด์Šค๋กœ ์ด๋™ + self.model = self.model.to(device) + self.model.eval() + + # ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ + self.valid_dataset = valid_dataset + + # ์ธ๋ฑ์Šค๊ฐ€ ๊ฒฝ๋กœ๋กœ ์ฃผ์–ด์ง„ ๊ฒฝ์šฐ ๋กœ๋“œ + if isinstance(index, str): + self.index = DenseFlatIndexer() + self.index.deserialize(path=index) + else: + self.index = index + self.model = model.to(device) + self.device = device + self.tokenizer = valid_dataset.tokenizer + self.val_batch_size = val_batch_size + self.valid_loader = torch.utils.data.DataLoader( + dataset=valid_dataset.dataset, + batch_sampler=KorQuadSampler(valid_dataset.dataset, batch_size=val_batch_size, drop_last=False), + collate_fn=lambda x: korquad_collator(x, padding_value=valid_dataset.pad_token_id), + num_workers=4, + ) + self.index = index + + def val_top_k_acc(self, k: List[int] = [5] + list(range(10, 101, 10))): + """validation set์—์„œ top k ์ •ํ™•๋„๋ฅผ ๊ณ„์‚ฐํ•ฉ๋‹ˆ๋‹ค.""" + + self.model.eval() # ํ‰๊ฐ€ ๋ชจ๋“œ + k_max = max(k) + sample_cnt = 0 + retr_cnt = defaultdict(int) + with torch.no_grad(): + for batch in tqdm(self.valid_loader, 
desc="valid"): + # batch_q, batch_q_attn_mask, batch_p_id, batch_p, batch_p_attn_mask + q, q_mask, p_id, a, a_mask = batch + q, q_mask = ( + q.to(self.device), + q_mask.to(self.device), + ) + q_emb = self.model(q, q_mask, "query") # bsz x bert_dim + result = self.index.search_knn(query_vectors=q_emb.cpu().numpy(), top_docs=k_max) + + for (pred_idx_lst, _), true_idx, _a, _a_mask in zip(result, p_id, a, a_mask): + a_len = _a_mask.sum() + _a = _a[:a_len] + _a = _a[1:-1] + _a_txt = self.tokenizer.decode(_a).strip() + docs = [pickle.load(open(get_passage_file([idx]), "rb"))[idx] for idx in pred_idx_lst] + + for _k in k: + if _a_txt in " ".join(docs[:_k]): + retr_cnt[_k] += 1 + + bsz = q.size(0) + sample_cnt += bsz + retr_acc = {_k: float(v) / float(sample_cnt) for _k, v in retr_cnt.items()} + return retr_acc + + def retrieve(self, query: str, k: int = 100): + """์ฃผ์–ด์ง„ ์ฟผ๋ฆฌ์— ๋Œ€ํ•ด ๊ฐ€์žฅ ์œ ์‚ฌ๋„๊ฐ€ ๋†’์€ passage๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + self.model.eval() # ํ‰๊ฐ€ ๋ชจ๋“œ + tok = self.tokenizer.batch_encode_plus([query], truncation=True, padding=True, max_length=512) + + # Ensure tensors are moved to the same device as the model (cuda:0) + input_ids = T(tok["input_ids"]).to(self.device) + attention_mask = T(tok["attention_mask"]).to(self.device) + + with torch.no_grad(): + out = self.model(input_ids, attention_mask, "query") + + result = self.index.search_knn(query_vectors=out.cpu().numpy(), top_docs=k) + # logger.debug(result) + # ์›๋ฌธ ๊ฐ€์ ธ์˜ค๊ธฐ + passages = [] + for idx, sim in zip(*result[0]): + # logger.debug(idx) + path = get_passage_file([idx]) + if not path: + logger.debug(f"์˜ฌ๋ฐ”๋ฅธ ๊ฒฝ๋กœ์— ํ”ผํดํ™”๋œ ์œ„ํ‚คํ”ผ๋””์•„๊ฐ€ ์žˆ๋Š”์ง€ ํ™•์ธํ•˜์„ธ์š”.No single passage path for {idx}") + continue + with open(path, "rb") as f: + passage_dict = pickle.load(f) + # logger.debug(f"passage : {passage_dict[idx]}, sim : {sim}") + passages.append((passage_dict[idx], sim)) + # logger.debug("์„ฑ๊ณต!!!!!!") + return passages + + +if __name__ == "__main__": + # parser = argparse.ArgumentParser() + # parser.add_argument("--query", "-q", type=str, required=True) + # parser.add_argument("--k", "-k", type=int, required=True) + # args = parser.parse_args() + + model = KobertBiEncoder() + model.load("./output/my_model.pt") + model.eval() + valid_dataset = KorQuadDataset("./data/KorQuAD_v1.0_dev.json") + index = DenseFlatIndexer() + index.deserialize(path="./2050iter_flat") + + retriever = KorDPRRetriever(model=model, valid_dataset=valid_dataset, index=index) + + # 'query'์™€ 'k' ๊ฐ’์„ ์„ค์ •ํ•ฉ๋‹ˆ๋‹ค. + query = "(๊ฐ€)์ด/๊ฐ€ ํฌ๊ฒŒ ๋…ธํ•˜์—ฌ ํ•œ์„ฑ๋ถ€์˜ ์กฐ๋ก€(็š‚้šท)์™€ ๋ณ‘์กธ๋กœ ํ•˜์—ฌ ๊ธˆ ํ•œ ๊ฐ• ๋ฐ–์œผ๋กœ ๋ชฐ์•„๋‚ด๊ฒŒ ํ•˜๊ณ  ๋“œ๋””์–ด ์ฒœ์—ฌ ๊ณณ์˜ ์„œ์›์„ ์ฒ ํํ•˜๊ณ  ๊ทธ ํ† ์ง€๋ฅผ ๋ชฐ์ˆ˜ํ•˜์—ฌ ๊ด€์— ์†ํ•˜๊ฒŒ ํ•˜์˜€๋‹ค.๏ผ๋Œ€ํ•œ๊ณ„๋…„์‚ฌ ๏ผ" # noqa: E501 + # query = "์„ฑํ•™์ง‘์š”์˜ ์ €์ž๋Š”?" + k = 10 # ์ƒ์œ„ 20๊ฐœ ์œ ์‚ฌํ•œ passage๋ฅผ ์ถœ๋ ฅํ•˜๋ ค๋ฉด k๋ฅผ 20์œผ๋กœ ์„ค์ • + + # retrieve ๋ฉ”์„œ๋“œ๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๊ฐ€์žฅ ์œ ์‚ฌ๋„๊ฐ€ ๋†’์€ k๊ฐœ์˜ passage๋ฅผ ์ฐพ์Šต๋‹ˆ๋‹ค. + passages = retriever.retrieve(query=query, k=k) + + # ์ถœ๋ ฅ: ์œ ์‚ฌ๋„ ๋†’์€ passage์™€ ๊ทธ ์œ ์‚ฌ๋„๋ฅผ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค. 
+ for idx, (passage, sim) in enumerate(passages): + logger.debug(f"Rank {idx + 1} | Similarity: {sim:.4f} | Passage: {passage}") diff --git a/code/rag/retriever_bm25.py b/code/rag/retriever_bm25.py new file mode 100644 index 0000000..532fcb9 --- /dev/null +++ b/code/rag/retriever_bm25.py @@ -0,0 +1,140 @@ +import json +import os +import pickle +from typing import Dict, List, Optional + +from loguru import logger +import numpy as np +from rank_bm25 import BM25Okapi + + +# from konlpy.tag import Okt +# okt = Okt() +# def okt_specific_pos_tokenizer(text, stem=True, norm=True): +# # pos ํƒœ๊น… ์ˆ˜ํ–‰ +# pos_tagged = okt.pos(text, stem=stem, norm=norm) +# # ๋ช…์‚ฌ(Noun), ํ˜•์šฉ์‚ฌ(Adjective), ๋™์‚ฌ(Verb)๋งŒ ํ•„ํ„ฐ๋ง +# filtered_words = [word for word, pos in pos_tagged if pos in ["Noun", "Adjective", "Verb"]] +# return filtered_words + + +# Deprecated: ๋„ˆ๋ฌด ๋А๋ ค์„œ ๋” ์ด์ƒ ์‚ฌ์šฉํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. +class BM25Retriever: + def __init__( + self, + tokenize_fn=None, + data_path: Optional[str] = "../data/", + pickle_filename: str = "wiki_bm25.pkl", + doc_filename: Optional[str] = "wiki_document.json", + ) -> None: + self.tokenize_fn = tokenize_fn if tokenize_fn else lambda x: x.split() + self.pickle_path = os.path.join(data_path, pickle_filename) + self.bm25 = None + self.corpus = [] + + # ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ + self._load_dataset(os.path.join(data_path, doc_filename)) + + # ๊ธฐ์กด ์ธ๋ฑ์Šค ๋กœ๋“œ + if os.path.exists(self.pickle_path): + self._load_pickle() + return + + # ์ธ๋ฑ์Šค ์ƒ์„ฑ + self._initialize_retriever() + + def _load_dataset(self, json_path): + logger.info("๋ฌธ์„œ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ") + with open(json_path, "r", encoding="utf-8") as f: + docs = json.load(f) + self.corpus = [f"{doc['title']}: {doc['text']}" for doc in docs] + + def _load_pickle(self): + logger.info("๊ธฐ์กด BM25 ์ธ๋ฑ์Šค ๋กœ๋“œ") + with open(self.pickle_path, "rb") as f: + data = pickle.load(f) + self.bm25 = data["bm25"] + + def _initialize_retriever(self): + logger.info("์ƒˆ๋กœ์šด BM25 ์ธ๋ฑ์Šค ์ƒ์„ฑ") + + tokenized_corpus = [self.tokenize_fn(doc) for doc in self.corpus] + self.bm25 = BM25Okapi(tokenized_corpus) + + with open(self.pickle_path, "wb") as f: + pickle.dump( + { + "bm25": self.bm25, + }, + f, + ) + logger.info("์ธ๋ฑ์Šค ์ƒ์„ฑ ์™„๋ฃŒ") + + def retrieve(self, query: str, top_k: int = 3) -> List[Dict]: + """ + ์ฃผ์–ด์ง„ ์ฟผ๋ฆฌ์— ๋Œ€ํ•ด ์ƒ์œ„ k๊ฐœ์˜ ๋ฌธ์„œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค. + """ + if not self.bm25: + raise Exception("BM25 ๋ชจ๋ธ์ด ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.") + + tokenized_query = self.tokenize_fn(query) + doc_scores = self.bm25.get_scores(tokenized_query) + top_indices = np.argsort(doc_scores)[-top_k:][::-1] + + results = [] + for idx in top_indices: + results.append( + { + "text": self.corpus[idx], + "score": float(doc_scores[idx]), + } + ) + return results + + def bulk_retrieve(self, queries: List[str], top_k: int = 3) -> List[List[Dict]]: + """ + ์—ฌ๋Ÿฌ ์ฟผ๋ฆฌ์— ๋Œ€ํ•ด ์ผ๊ด„์ ์œผ๋กœ ๊ฒ€์ƒ‰์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค. 
+ """ + if not self.bm25: + raise Exception("BM25 ๋ชจ๋ธ์ด ์ดˆ๊ธฐํ™”๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.") + + results = [] + logger.info(f"{len(queries)}๊ฐœ ์ฟผ๋ฆฌ ์ผ๊ด„ ๊ฒ€์ƒ‰") + # ๋ชจ๋“  ์ฟผ๋ฆฌ๋ฅผ ํ•œ ๋ฒˆ์— ํ† ํฌ๋‚˜์ด์ง• + tokenized_queries = [self.tokenize_fn(query) for query in queries] + + # ๊ฐ ์ฟผ๋ฆฌ๋ณ„๋กœ ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ + for tokenized_query in tokenized_queries: + doc_scores = self.bm25.get_scores(tokenized_query) + top_indices = np.argsort(doc_scores)[-top_k:][::-1] + + query_results = [] + for idx in top_indices: + query_results.append( + { + "text": self.corpus[idx], + "score": float(doc_scores[idx]), + } + ) + results.append(query_results) + + logger.info(f"{len(queries)}๊ฐœ ์ฟผ๋ฆฌ ์ผ๊ด„ ๊ฒ€์ƒ‰ ์™„๋ฃŒ") + return results + + +if __name__ == "__main__": + os.chdir("..") + retriever = BM25Retriever( + tokenize_fn=None, + data_path="../data/", + pickle_filename="wiki_bm25.pkl", + doc_filename="wiki.json", + ) + + query = "์„ ๋น„๋“ค ์ˆ˜๋งŒ ๋ช…์ด ๋Œ€๊ถ ์•ž์— ๋ชจ์—ฌ ๋งŒ ๋™๋ฌ˜์™€ ์„œ์›์„ ๋‹ค์‹œ ์„ค๋ฆฝํ•  ๊ฒƒ์„ ์ฒญํ•˜๋‹ˆ, (๊ฐ€)์ด/๊ฐ€ ํฌ๊ฒŒ ๋…ธํ•˜์—ฌ ํ•œ์„ฑ๋ถ€์˜ ์กฐ๋ก€(็š‚้šท)์™€ ๋ณ‘์กธ๋กœ ํ•˜์—ฌ ๊ธˆ ํ•œ ๊ฐ• ๋ฐ–์œผ๋กœ ๋ชฐ์•„๋‚ด๊ฒŒ ํ•˜๊ณ  ๋“œ๋””์–ด ์ฒœ์—ฌ ๊ณณ์˜ ์„œ์›์„ ์ฒ ํํ•˜๊ณ  ๊ทธ ํ† ์ง€๋ฅผ ๋ชฐ์ˆ˜ํ•˜์—ฌ ๊ด€์— ์†ํ•˜๊ฒŒ ํ•˜์˜€๋‹ค.๏ผ๋Œ€ํ•œ๊ณ„๋…„์‚ฌ" # noqa: E501 + results = retriever.retrieve(query, top_k=5) + + for i, result in enumerate(results, 1): + logger.debug(f"\n๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ {i}") + logger.debug(f"์ ์ˆ˜: {result['score']:.4f}") + logger.debug(f"๋‚ด์šฉ: {result['text'][:200]}...") diff --git a/code/rag/retriever_elastic.py b/code/rag/retriever_elastic.py new file mode 100644 index 0000000..597a4d2 --- /dev/null +++ b/code/rag/retriever_elastic.py @@ -0,0 +1,190 @@ +import json +import os +import re +from typing import Dict, List, Optional +import warnings + +from dotenv import load_dotenv +from elasticsearch import Elasticsearch, ElasticsearchWarning +from loguru import logger + + +# ElasticsearchWarning ๋ฌด์‹œ +warnings.filterwarnings("ignore", category=ElasticsearchWarning) + + +class ElasticsearchRetriever: + def __init__( + self, + index_name: str = "wiki-index", + data_path: Optional[str] = None, + setting_path: Optional[str] = None, + doc_filename: Optional[str] = None, + ) -> None: + self.index_name = index_name + self.client = self._connect_elasticsearch( + os.getenv("ELASTICSEARCH_URL"), os.getenv("ELASTICSEARCH_ID"), os.getenv("ELASTICSEARCH_PW") + ) + + # ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ๋ฐ ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™” + if not self.client.indices.exists(index=self.index_name): + if data_path and setting_path and doc_filename: + docs = self._load_dataset(os.path.join(data_path, doc_filename)) + self._initialize_index(setting_path) + self._insert_documents(docs) + else: + raise ValueError(f"์กด์žฌํ•˜์ง€ ์•Š๋Š” ์ธ๋ฑ์Šค: {index_name}") + + def _connect_elasticsearch(self, url: str, id: str, pw: str) -> Elasticsearch: + """ElasticSearch ํด๋ผ์ด์–ธํŠธ ์—ฐ๊ฒฐ""" + es = Elasticsearch( + url, + basic_auth=(id, pw), + request_timeout=30, + max_retries=10, + retry_on_timeout=True, + verify_certs=False, + ) + logger.info(f"Elasticsearch ์—ฐ๊ฒฐ ์ƒํƒœ: {es.ping()}") + return es + + def _load_dataset(self, doc_filename) -> Dict: + """๋ฌธ์„œ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ""" + with open(doc_filename, "r", encoding="utf-8") as f: + return json.load(f) + + def _initialize_index(self, setting_path) -> None: + """์ธ๋ฑ์Šค ์ƒ์„ฑ ๋ฐ ์„ค์ •""" + with open(setting_path, "r") as f: + setting = json.load(f) + self.client.indices.create(index=self.index_name, body=setting) + logger.info("์ธ๋ฑ์Šค ์ƒ์„ฑ 
์™„๋ฃŒ") + + def _delete_index(self): + if not self.client.indices.exists(index=self.index_name): + logger.info("Index doesn't exist.") + return + + self.client.indices.delete(index=self.index_name) + logger.info("Index deletion has been completed") + + def _insert_documents(self, docs) -> None: + """๋ฌธ์„œ ๋ฐ์ดํ„ฐ bulk ์‚ฝ์ž…""" + + def _preprocess(text): + text = re.sub(r"\n", " ", text) + text = re.sub(r"\\n", " ", text) + text = re.sub(r"#", " ", text) + text = re.sub(r"\s+", " ", text).strip() # ๋‘ ๊ฐœ ์ด์ƒ์˜ ์—ฐ์†๋œ ๊ณต๋ฐฑ์„ ํ•˜๋‚˜๋กœ ์น˜ํ™˜ + return text + + bulk_data = [] + for i, doc in enumerate(docs): + # bulk ์ž‘์—…์„ ์œ„ํ•œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ + bulk_data.append({"index": {"_index": self.index_name, "_id": i}}) + # ์‹ค์ œ ๋ฌธ์„œ ๋ฐ์ดํ„ฐ + bulk_data.append({"title": doc["title"], "text": _preprocess(doc["text"])}) + + # 1000๊ฐœ ๋‹จ์œ„๋กœ ๋ฒŒํฌ ์‚ฝ์ž… ์ˆ˜ํ–‰ + if (i + 1) % 1000 == 0: + try: + response = self.client.bulk(body=bulk_data) + if response["errors"]: + logger.warning(f"{i+1}๋ฒˆ์งธ ๋ฒŒํฌ ์‚ฝ์ž… ์ค‘ ์ผ๋ถ€ ์˜ค๋ฅ˜ ๋ฐœ์ƒ") + bulk_data = [] # ๋ฒŒํฌ ๋ฐ์ดํ„ฐ ์ดˆ๊ธฐํ™” + logger.info(f"{i+1}๊ฐœ ๋ฌธ์„œ ๋ฒŒํฌ ์‚ฝ์ž… ์™„๋ฃŒ") + except Exception as e: + logger.error(f"๋ฒŒํฌ ์‚ฝ์ž… ์‹คํŒจ (์ธ๋ฑ์Šค: {i}): {e}") + bulk_data = [] # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ์—๋„ ๋ฐ์ดํ„ฐ ์ดˆ๊ธฐํ™” + + # ๋‚จ์€ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ + if bulk_data: + try: + response = self.client.bulk(body=bulk_data) + if response["errors"]: + logger.warning("๋งˆ์ง€๋ง‰ ๋ฒŒํฌ ์‚ฝ์ž… ์ค‘ ์ผ๋ถ€ ์˜ค๋ฅ˜ ๋ฐœ์ƒ") + except Exception as e: + logger.error(f"๋งˆ์ง€๋ง‰ ๋ฒŒํฌ ์‚ฝ์ž… ์‹คํŒจ: {e}") + + # ์ตœ์ข… ๋ฌธ์„œ ์ˆ˜ ํ™•์ธ + n_records = self.client.count(index=self.index_name)["count"] + logger.info(f"์ด {n_records}๊ฐœ ๋ฌธ์„œ ์‚ฝ์ž… ์™„๋ฃŒ") + + def retrieve(self, query: str, top_k: int = 3) -> List[Dict]: + """๋‹จ์ผ ์ฟผ๋ฆฌ์— ๋Œ€ํ•œ ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰""" + query_body = {"query": {"bool": {"must": [{"match": {"text": query}}]}}} + + response = self.client.search(index=self.index_name, body=query_body, size=top_k) + + results = [] + for hit in response["hits"]["hits"]: + results.append({"text": f"{hit['_source']['title']}: {hit['_source']['text']}", "score": hit["_score"]}) + return results + + def bulk_retrieve(self, queries: List[str], top_k: int = 3) -> List[List[Dict]]: + """์—ฌ๋Ÿฌ ์ฟผ๋ฆฌ์— ๋Œ€ํ•œ ์ผ๊ด„ ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ (msearch API ์‚ฌ์šฉ)""" + logger.info(f"{len(queries)}๊ฐœ ์ฟผ๋ฆฌ ์ผ๊ด„ ๊ฒ€์ƒ‰") + + # msearch API๋ฅผ ์œ„ํ•œ bulk ์ฟผ๋ฆฌ ์ค€๋น„ + bulk_query = [] + for query in queries: + # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋ผ์ธ + bulk_query.append({"index": self.index_name}) + # ์ฟผ๋ฆฌ ๋ผ์ธ + bulk_query.append({"query": {"bool": {"must": [{"match": {"text": query}}]}}, "size": top_k}) + + try: + # msearch API ํ˜ธ์ถœ + response = self.client.msearch(body=bulk_query) + + # ๊ฒฐ๊ณผ ์ฒ˜๋ฆฌ + results = [] + for response_item in response["responses"]: + query_results = [] + if not response_item.get("error"): + for hit in response_item["hits"]["hits"]: + query_results.append( + {"text": f"{hit['_source']['title']}: {hit['_source']['text']}", "score": hit["_score"]} + ) + results.append(query_results) + + logger.info(f"{len(queries)}๊ฐœ ์ฟผ๋ฆฌ ์ผ๊ด„ ๊ฒ€์ƒ‰ ์™„๋ฃŒ") + return results + + except Exception as e: + logger.error(f"Bulk search ์‹คํŒจ: {e}") + return [[] for _ in queries] # ์—๋Ÿฌ ๋ฐœ์ƒ ์‹œ ๋นˆ ๊ฒฐ๊ณผ ๋ฐ˜ํ™˜ + + +if __name__ == "__main__": + config_folder = os.path.join(os.path.dirname(__file__), "..", "..", "config") + load_dotenv(os.path.join(config_folder, ".env")) + + retriever = ElasticsearchRetriever( + 
data_path="../data/", + index_name="wiki-index", + setting_path="../config/elastic_setting.json", + doc_filename="wiki.json", + ) + + # ์ƒˆ๋กœ์šด ๋ฌธ์„œ ์ถ”๊ฐ€ ์‚ฝ์ž…์‹œ์—๋งŒ ์‚ฌ์šฉ + if False: + current_count = retriever.client.count(index=retriever.index_name)["count"] + + logger.info("์ƒˆ๋กœ์šด ๋ฌธ์„œ ์ถ”๊ฐ€ ์‹œ์ž‘") + with open("new_wiki.json", "r", encoding="utf-8") as f: + new_docs = json.load(f) + retriever._insert_documents(new_docs) + + # ๋ฌธ์„œ ์ถ”๊ฐ€ ํ™•์ธ + new_count = retriever.client.count(index=retriever.index_name)["count"] + logger.info(f"๋ฌธ์„œ ์ถ”๊ฐ€ ์™„๋ฃŒ: {current_count} -> {new_count} ({new_count-current_count}๊ฐœ ์ถ”๊ฐ€)") + + # ๋ฌธ์„œ ๊ฒ€์ƒ‰ ํ…Œ์ŠคํŠธ + query = "์„ ๋น„๋“ค ์ˆ˜๋งŒ ๋ช…์ด ๋Œ€๊ถ ์•ž์— ๋ชจ์—ฌ ๋งŒ ๋™๋ฌ˜์™€ ์„œ์›์„ ๋‹ค์‹œ ์„ค๋ฆฝํ•  ๊ฒƒ์„ ์ฒญํ•˜๋‹ˆ, (๊ฐ€)์ด/๊ฐ€ ํฌ๊ฒŒ ๋…ธํ•˜์—ฌ ํ•œ์„ฑ๋ถ€์˜ ์กฐ๋ก€(็š‚้šท)์™€ ๋ณ‘์กธ๋กœ ํ•˜์—ฌ ๊ธˆ ํ•œ ๊ฐ• ๋ฐ–์œผ๋กœ ๋ชฐ์•„๋‚ด๊ฒŒ ํ•˜๊ณ  ๋“œ๋””์–ด ์ฒœ์—ฌ ๊ณณ์˜ ์„œ์›์„ ์ฒ ํํ•˜๊ณ  ๊ทธ ํ† ์ง€๋ฅผ ๋ชฐ์ˆ˜ํ•˜์—ฌ ๊ด€์— ์†ํ•˜๊ฒŒ ํ•˜์˜€๋‹ค.๏ผ๋Œ€ํ•œ๊ณ„๋…„์‚ฌ" # noqa: E501 + results = retriever.retrieve(query, top_k=5) + + for i, result in enumerate(results, 1): + logger.debug(f"\n๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ {i}") + logger.debug(f"์ ์ˆ˜: {result['score']:.4f}") + logger.debug(f"๋‚ด์šฉ: {result['text'][:200]}...") diff --git a/code/rag/train.py b/code/rag/train.py new file mode 100644 index 0000000..20a33fe --- /dev/null +++ b/code/rag/train.py @@ -0,0 +1,228 @@ +from copy import deepcopy +import logging +import os +from typing import Tuple + +from dpr_data import KorQuadSampler, korquad_collator +import numpy as np +import torch +from tqdm import tqdm +import transformers +import wandb + + +# Ensure output directory exists +os.makedirs("./output", exist_ok=True) + +# Set up logging +os.makedirs("logs", exist_ok=True) +logging.basicConfig( + filename="logs/log.log", + level=logging.DEBUG, + format="[%(asctime)s | %(funcName)s @ %(pathname)s] %(message)s", +) +logger = logging.getLogger() # get root logger + + +class Trainer: + """Basic trainer""" + + def __init__( + self, + model, + device, + train_dataset, + valid_dataset, + num_epoch: int, + batch_size: int, + lr: float, + betas: Tuple[float], + num_warmup_steps: int, + num_training_steps: int, + valid_every: int, + best_val_ckpt_path: str, + ): + self.model = model.to(device) + self.device = device + self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, betas=betas) + self.scheduler = transformers.get_linear_schedule_with_warmup( + self.optimizer, num_warmup_steps, num_training_steps + ) + self.train_loader = torch.utils.data.DataLoader( + dataset=train_dataset.dataset, + batch_sampler=KorQuadSampler(train_dataset.dataset, batch_size=batch_size, drop_last=False), + collate_fn=lambda x: korquad_collator(x, padding_value=train_dataset.pad_token_id), + num_workers=4, + ) + self.valid_loader = torch.utils.data.DataLoader( + dataset=valid_dataset.dataset, + batch_sampler=KorQuadSampler(valid_dataset.dataset, batch_size=batch_size, drop_last=False), + collate_fn=lambda x: korquad_collator(x, padding_value=valid_dataset.pad_token_id), + num_workers=4, + ) + + self.batch_size = batch_size + self.num_epoch = num_epoch + self.valid_every = valid_every + self.lr = lr + self.betas = betas + self.num_warmup_steps = num_warmup_steps + self.num_training_steps = num_training_steps + self.best_val_ckpt_path = best_val_ckpt_path + self.best_val_optim_path = best_val_ckpt_path.split(".pt")[0] + "_optim.pt" + + self.start_ep = 1 + self.start_step = 1 + + def ibn_loss(self, pred: 
torch.FloatTensor): + """In-batch negative loss calculation.""" + bsz = pred.size(0) + target = torch.arange(bsz).to(self.device) + return torch.nn.functional.cross_entropy(pred, target) + + def batch_acc(self, pred: torch.FloatTensor): + """Batch accuracy calculation.""" + bsz = pred.size(0) + target = torch.arange(bsz) + return (pred.detach().cpu().max(1).indices == target).sum().float() / bsz + + def fit(self): + """Train the model.""" + wandb.init( + project="kordpr", + entity="lucas01", + config={ + "batch_size": self.batch_size, + "lr": self.lr, + "betas": self.betas, + "num_warmup_steps": self.num_warmup_steps, + "num_training_steps": self.num_training_steps, + "valid_every": self.valid_every, + }, + ) + logger.debug("start training") + self.model.train() # Set model to training mode + global_step_cnt = 0 + prev_best = None + for ep in range(self.start_ep, self.num_epoch + 1): + for step, batch in enumerate(tqdm(self.train_loader, desc=f"epoch {ep} batch"), 1): + if ep == self.start_ep and step < self.start_step: + continue # Skip until the saved checkpoint + + self.model.train() # Set model to training mode + global_step_cnt += 1 + q, q_mask, _, p, p_mask = batch + q, q_mask, p, p_mask = ( + q.to(self.device), + q_mask.to(self.device), + p.to(self.device), + p_mask.to(self.device), + ) + q_emb = self.model(q, q_mask, "query") + p_emb = self.model(p, p_mask, "passage") + pred = torch.matmul(q_emb, p_emb.T) + loss = self.ibn_loss(pred) + acc = self.batch_acc(pred) + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.scheduler.step() + log = { + "epoch": ep, + "step": step, + "global_step": global_step_cnt, + "train_step_loss": loss.cpu().item(), + "current_lr": float(self.scheduler.get_last_lr()[0]), + "step_acc": acc, + } + if global_step_cnt % self.valid_every == 0: + eval_dict = self.evaluate() + log.update(eval_dict) + if prev_best is None or eval_dict["valid_loss"] < prev_best: # Save best validation model + self.save_training_state(log) + wandb.log(log) + + def evaluate(self): + """Evaluate the model.""" + self.model.eval() # Set model to evaluation mode + loss_list = [] + sample_cnt = 0 + valid_acc = 0 + with torch.no_grad(): + for batch in self.valid_loader: + q, q_mask, _, p, p_mask = batch + q, q_mask, p, p_mask = ( + q.to(self.device), + q_mask.to(self.device), + p.to(self.device), + p_mask.to(self.device), + ) + q_emb = self.model(q, q_mask, "query") + p_emb = self.model(p, p_mask, "passage") + pred = torch.matmul(q_emb, p_emb.T) + loss = self.ibn_loss(pred) + step_acc = self.batch_acc(pred) + + bsz = q.size(0) + sample_cnt += bsz + valid_acc += step_acc * bsz + loss_list.append(loss.cpu().item() * bsz) + return { + "valid_loss": np.array(loss_list).sum() / float(sample_cnt), + "valid_acc": valid_acc / float(sample_cnt), + } + + def save_training_state(self, log_dict: dict) -> None: + """Save model, optimizer, and other training states.""" + checkpoint_path = os.path.join("./output", self.best_val_ckpt_path) + self.model.checkpoint(checkpoint_path) + training_state = { + "optimizer_state": deepcopy(self.optimizer.state_dict()), + "scheduler_state": deepcopy(self.scheduler.state_dict()), + } + training_state.update(log_dict) + optim_path = os.path.join("./output", self.best_val_optim_path) + torch.save(training_state, optim_path) + logger.debug(f"Saved optimizer/scheduler state into {optim_path}") + + def load_training_state(self) -> None: + """Load model, optimizer, and other training states.""" + checkpoint_path = os.path.join("./output", 
self.best_val_ckpt_path) + if os.path.exists(checkpoint_path): + self.model.load(checkpoint_path) + optim_path = os.path.join("./output", self.best_val_optim_path) + training_state = torch.load(optim_path) + logger.debug(f"Loaded optimizer/scheduler state from {optim_path}") + self.optimizer.load_state_dict(training_state["optimizer_state"]) + self.scheduler.load_state_dict(training_state["scheduler_state"]) + self.start_ep = training_state["epoch"] + self.start_step = training_state["step"] + logger.debug(f"Resumed training from epoch {self.start_ep} / step {self.start_step}") + else: + logger.debug("No checkpoint found, starting training from scratch.") + + +# if __name__ == "__main__": +# device = torch.device("cuda:0") +# model = KobertBiEncoder() +# train_dataset = KorQuadDataset("./data/KorQuAD_v1.0_train.json") +# valid_dataset = KorQuadDataset("./data/KorQuAD_v1.0_dev.json") +# my_trainer = Trainer( +# model=model, +# device=device, +# train_dataset=train_dataset, +# valid_dataset=valid_dataset, +# num_epoch=1, +# batch_size=128 - 32, +# lr=1e-5, +# betas=(0.9, 0.99), +# num_warmup_steps=1000, +# num_training_steps=100000, +# valid_every=30, +# best_val_ckpt_path="my_model.pt", +# ) +# my_trainer.load_training_state() +# my_trainer.fit() # Start training +# # eval_dict = my_trainer.evaluate() # If you want to evaluate after training +# # print(eval_dict) diff --git a/code/rag/trainer.py b/code/rag/trainer.py new file mode 100644 index 0000000..79502eb --- /dev/null +++ b/code/rag/trainer.py @@ -0,0 +1,257 @@ +import os +import sys + + +# ํ˜„์žฌ ์ฝ”๋“œ๊ฐ€ ์žˆ๋Š” ๋””๋ ‰ํ† ๋ฆฌ ๊ธฐ์ค€์œผ๋กœ ์ƒ์œ„ ๋””๋ ‰ํ† ๋ฆฌ๋ฅผ `sys.path`์— ์ถ”๊ฐ€ +sys.path.append(os.path.abspath(os.path.dirname(__file__))) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) + +from copy import deepcopy +import logging +from typing import Tuple + +from dpr_data import KorQuadDataset, KorQuadSampler, korquad_collator +from encoder import KobertBiEncoder +import numpy as np +import torch +from tqdm import tqdm +import transformers + + +os.makedirs("logs", exist_ok=True) +logging.basicConfig( + filename="logs/log.log", + level=logging.DEBUG, + format="[%(asctime)s | %(funcName)s @ %(pathname)s] %(message)s", +) +logger = logging.getLogger() # get root logger + + +class Trainer: + """basic trainer""" + + def __init__( + self, + model, + device, + train_dataset, + valid_dataset, + num_epoch: int, + batch_size: int, + lr: float, + betas: Tuple[float], + num_warmup_steps: int, + num_training_steps: int, + valid_every: int, + best_val_ckpt_path: str, + ): + self.model = model.to(device) + self.device = device + self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, betas=betas) + self.scheduler = transformers.get_linear_schedule_with_warmup( + self.optimizer, num_warmup_steps, num_training_steps + ) + self.train_loader = torch.utils.data.DataLoader( + dataset=train_dataset.dataset, + batch_sampler=KorQuadSampler(train_dataset.dataset, batch_size=batch_size, drop_last=False), + collate_fn=lambda x: korquad_collator(x, padding_value=train_dataset.pad_token_id), + num_workers=4, + ) + self.valid_loader = torch.utils.data.DataLoader( + dataset=valid_dataset.dataset, + batch_sampler=KorQuadSampler(valid_dataset.dataset, batch_size=batch_size, drop_last=False), + collate_fn=lambda x: korquad_collator(x, padding_value=valid_dataset.pad_token_id), + num_workers=4, + ) + + self.batch_size = batch_size + self.num_epoch = num_epoch + self.valid_every = valid_every + self.lr = lr + self.betas = 
betas + self.num_warmup_steps = num_warmup_steps + self.num_training_steps = num_training_steps + self.best_val_ckpt_path = best_val_ckpt_path + self.best_val_optim_path = best_val_ckpt_path.split(".pt")[0] + "_optim.pt" + + self.start_ep = 1 + self.start_step = 1 + + def ibn_loss(self, pred: torch.FloatTensor): + """in-batch negative๋ฅผ ํ™œ์šฉํ•œ batch์˜ loss๋ฅผ ๊ณ„์‚ฐํ•ฉ๋‹ˆ๋‹ค. + pred : bsz x bsz ๋˜๋Š” bsz x bsz*2์˜ logit ๊ฐ’์„ ๊ฐ€์ง. ํ›„์ž๋Š” hard negative๋ฅผ ํฌํ•จํ•˜๋Š” ๊ฒฝ์šฐ. + """ + bsz = pred.size(0) + target = torch.arange(bsz).to(self.device) # ์ฃผ๋Œ€๊ฐ์„ ์ด answer + return torch.nn.functional.cross_entropy(pred, target) + + def batch_acc(self, pred: torch.FloatTensor): + """batch ๋‚ด์˜ accuracy๋ฅผ ๊ณ„์‚ฐํ•ฉ๋‹ˆ๋‹ค.""" + bsz = pred.size(0) + target = torch.arange(bsz) # ์ฃผ๋Œ€๊ฐ์„ ์ด answer + return (pred.detach().cpu().max(1).indices == target).sum().float() / bsz + + def fit(self): + """๋ชจ๋ธ์„ ํ•™์Šตํ•ฉ๋‹ˆ๋‹ค.""" + # wandb.init( + # project="personal", + # entity="gayean01", + # config={ + # "batch_size": self.batch_size, + # "lr": self.lr, + # "betas": self.betas, + # "num_warmup_steps": self.num_warmup_steps, + # "num_training_steps": self.num_training_steps, + # "valid_every": self.valid_every, + # }, + # ) + logger.debug("start training") + self.model.train() # ํ•™์Šต๋ชจ๋“œ + global_step_cnt = 0 + prev_best = None + for ep in range(self.start_ep, self.num_epoch + 1): + for step, batch in enumerate(tqdm(self.train_loader, desc=f"epoch {ep} batch"), 1): + if ep == self.start_ep and step < self.start_step: + continue # ์ค‘๊ฐ„๋ถ€ํ„ฐ ํ•™์Šต์‹œํ‚ค๋Š” ๊ฒฝ์šฐ ํ•ด๋‹น ์ง€์ ๊นŒ์ง€ ๋ณต์› + + self.model.train() # ํ•™์Šต ๋ชจ๋“œ + global_step_cnt += 1 + q, q_mask, _, p, p_mask = batch + q, q_mask, p, p_mask = ( + q.to(self.device), + q_mask.to(self.device), + p.to(self.device), + p_mask.to(self.device), + ) + q_emb = self.model(q, q_mask, "query") # bsz x bert_dim + p_emb = self.model(p, p_mask, "passage") # bsz x bert_dim + pred = torch.matmul(q_emb, p_emb.T) # bsz x bsz + loss = self.ibn_loss(pred) + acc = self.batch_acc(pred) + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.scheduler.step() + log = { + "epoch": ep, + "step": step, + "global_step": global_step_cnt, + "train_step_loss": loss.cpu().item(), + "current_lr": float(self.scheduler.get_last_lr()[0]), # parameter group 1๊ฐœ์ด๋ฏ€๋กœ + "step_acc": acc, + } + if global_step_cnt % self.valid_every == 0: + eval_dict = self.evaluate() + log.update(eval_dict) + if prev_best is None or eval_dict["valid_loss"] < prev_best: # best val loss์ธ ๊ฒฝ์šฐ ์ €์žฅ + # self.model.checkpoint(self.best_val_ckpt_path) + self.save_training_state(log) + # wandb.log(log) + + def evaluate(self): + """๋ชจ๋ธ์„ ํ‰๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.""" + self.model.eval() # ํ‰๊ฐ€ ๋ชจ๋“œ + loss_list = [] + sample_cnt = 0 + valid_acc = 0 + with torch.no_grad(): + for batch in self.valid_loader: + q, q_mask, _, p, p_mask = batch + q, q_mask, p, p_mask = ( + q.to(self.device), + q_mask.to(self.device), + p.to(self.device), + p_mask.to(self.device), + ) + q_emb = self.model(q, q_mask, "query") # bsz x bert_dim + p_emb = self.model(p, p_mask, "passage") # bsz x bert_dim + pred = torch.matmul(q_emb, p_emb.T) # bsz x bsz + loss = self.ibn_loss(pred) + step_acc = self.batch_acc(pred) + + bsz = q.size(0) + sample_cnt += bsz + valid_acc += step_acc * bsz + loss_list.append(loss.cpu().item() * bsz) + valid_loss = np.array(loss_list).sum() / float(sample_cnt) + valid_acc = valid_acc / float(sample_cnt) + + # ์ฝ˜์†”์— ์ถœ๋ ฅ + 
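# Per-sample weighted mean: each batch contributed loss * bsz and acc * bsz above,
+ # so dividing by sample_cnt stays correct even when the final batch is smaller (drop_last=False).
+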
logger.info(f"Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_acc:.4f}") + return { + "valid_loss": np.array(loss_list).sum() / float(sample_cnt), + "valid_acc": valid_acc / float(sample_cnt), + } + + def save_training_state(self, log_dict: dict) -> None: + """๋ชจ๋ธ, optimizer์™€ ๊ธฐํƒ€ ์ •๋ณด๋ฅผ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค""" + self.model.checkpoint(self.best_val_ckpt_path) + training_state = { + "optimizer_state": deepcopy(self.optimizer.state_dict()), + "scheduler_state": deepcopy(self.scheduler.state_dict()), + } + training_state.update(log_dict) + torch.save(training_state, self.best_val_optim_path) + logger.debug(f"saved optimizer/scheduler state into {self.best_val_optim_path}") + + def load_training_state(self) -> None: + """๋ชจ๋ธ, optimizer์™€ ๊ธฐํƒ€ ์ •๋ณด๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค""" + self.model.load(self.best_val_ckpt_path) + training_state = torch.load(self.best_val_optim_path) + logger.debug(f"loaded optimizer/scheduler state from {self.best_val_optim_path}") + self.optimizer.load_state_dict(training_state["optimizer_state"]) + self.scheduler.load_state_dict(training_state["scheduler_state"]) + self.start_ep = training_state["epoch"] + self.start_step = training_state["step"] + logger.debug(f"resume training from epoch {self.start_ep} / step {self.start_step}") + + +# ๋ชจ๋ธ ์กด์žฌ ์—ฌ๋ถ€ ํ™•์ธ ํ•จ์ˆ˜ +def check_if_model_exists(model_path: str): + """๋ชจ๋ธ ์ฒดํฌํฌ์ธํŠธ๊ฐ€ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธํ•˜๋Š” ํ•จ์ˆ˜""" + return os.path.exists(model_path) + + +# ๋ฉ”์ธ ์‹คํ–‰ +if __name__ == "__main__": + # ๋ชจ๋ธ ๊ฒฝ๋กœ ์„ค์ • + model_path = "./output/my_model.pt" + + # ๋ชจ๋ธ์ด ์—†์œผ๋ฉด ํ•™์Šต ์‹œ์ž‘ + if not check_if_model_exists(model_path): + logger.info(f"๋ชจ๋ธ '{model_path}'์ด ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ํ•™์Šต์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค.") + + # ํ•™์Šต์„ ์œ„ํ•œ ์ค€๋น„ + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + model = KobertBiEncoder() + train_dataset = KorQuadDataset("./data/KorQuAD_v1.0_train.json") + valid_dataset = KorQuadDataset("./data/KorQuAD_v1.0_dev.json") + + # Trainer ๊ฐ์ฒด ์ƒ์„ฑ + my_trainer = Trainer( + model=model, + device=device, + train_dataset=train_dataset, + valid_dataset=valid_dataset, + num_epoch=1, # ํ•™์Šต epoch ์ˆ˜ + batch_size=32, # ๋ฐฐ์น˜ ํฌ๊ธฐ + lr=1e-5, + betas=(0.9, 0.99), + num_warmup_steps=100, + num_training_steps=1000, + valid_every=100, + best_val_ckpt_path=model_path, + ) + + # ํ•™์Šต ์ˆ˜ํ–‰ + my_trainer.fit() + eval_dict = my_trainer.evaluate() + logger.info(eval_dict) + + # ๋ชจ๋ธ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ ๋ฐ ์ €์žฅ + os.makedirs("output", exist_ok=True) + torch.save(model.state_dict(), model_path) + logger.info(f"ํ•™์Šต ์™„๋ฃŒ. ๋ชจ๋ธ์ด '{model_path}'์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + else: + logger.info(f"๋ชจ๋ธ '{model_path}'์ด ์ด๋ฏธ ์กด์žฌํ•ฉ๋‹ˆ๋‹ค. 
ํ•™์Šต์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.") diff --git a/code/rag/utils.py b/code/rag/utils.py new file mode 100644 index 0000000..084b1cd --- /dev/null +++ b/code/rag/utils.py @@ -0,0 +1,36 @@ +from glob import glob +import math +import typing + +import torch + + +def get_wiki_filepath(data_dir): + return glob(f"{data_dir}/*/wiki_*") + + +def wiki_worker_init(worker_id): + worker_info = torch.utils.data.get_worker_info() + dataset = worker_info.dataset + # print(dataset) + # dataset = + overall_start = dataset.start + overall_end = dataset.end + split_size = int(math.ceil((overall_end - overall_start) / float(worker_info.num_workers))) + worker_id = worker_info.id + # end_idx = min((worker_id+1) * split_size, len(dataset.data)) + dataset.start = overall_start + worker_id * split_size + dataset.end = min(dataset.start + split_size, overall_end) # index error ๋ฐฉ์ง€ + + +def get_passage_file(p_id_list: typing.List[int]) -> str: + """passage id๋ฅผ ๋ฐ›์•„์„œ ํ•ด๋‹น๋˜๋Š” ํŒŒ์ผ ์ด๋ฆ„์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + target_file = None + p_id_max = max(p_id_list) + p_id_min = min(p_id_list) + for f in glob("processed_passages/*.p"): + s, e = f.split("/")[1].split(".")[0].split("-") + s, e = int(s), int(e) + if p_id_min >= s and p_id_max <= e: + target_file = f + return target_file diff --git a/code/split.py b/code/split.py new file mode 100644 index 0000000..361573a --- /dev/null +++ b/code/split.py @@ -0,0 +1,85 @@ +from ast import literal_eval + +from loguru import logger +import pandas as pd + + +def load_data(file_path): + data = pd.read_csv(file_path) + records = [] + for _, row in data.iterrows(): + problems = literal_eval(row["problems"]) + record = { + "id": row["id"], + "paragraph": row["paragraph"], + "question": problems["question"], + "choices": problems["choices"], + "answer": problems.get("answer", None), + } + records.append(record) + logger.debug(records[0]) # ์ฒซ ๋ฒˆ์งธ ๋ ˆ์ฝ”๋“œ ์ถœ๋ ฅ (๋””๋ฒ„๊น…์šฉ) + return data, records + + +def classify_questions(records): + social_keywords = ["ใ„ฑ.", "ใ‰ ", "์œ„์˜", "์œ„ ๊ธ€" "๋‹จ๋ฝ", "๋ณธ๋ฌธ", "๋ฐ‘์ค„ ์นœ", "(๊ฐ€)", "๋‹ค์Œ", "์‹œ๊ธฐ"] + classifications = [] + + for record in records: + question = record["question"] + paragraph = record["paragraph"] + + # ์‚ฌํšŒ ์˜์—ญ ํŒ๋‹จ + contains_social_keywords = any(keyword in question for keyword in social_keywords) + + # ๊ฐ ์„ ํƒ์ง€๊ฐ€ ๋ณธ๋ฌธ์— ํฌํ•จ๋˜์–ด ์žˆ๋Š”์ง€ ํ™•์ธ + choices_found_in_paragraph = {choice: choice in paragraph for choice in record["choices"]} + + # ์ •๋‹ต์ด ํฌํ•จ๋œ ์„ ํƒ์ง€ ์ฐพ๊ธฐ + answer_index = record["answer"] + answer_found_in_paragraph = False + + if answer_index is not None and 0 <= answer_index < len(record["choices"]): + answer = record["choices"][answer_index] # ์ •๋‹ต ์„ ํƒ์ง€ + answer_found_in_paragraph = choices_found_in_paragraph.get(answer, False) + + # if contains_social_keywords and not answer_found_in_paragraph: + # classification = '์‚ฌํšŒ' + if contains_social_keywords: + classification = "์‚ฌํšŒ" + elif not contains_social_keywords and answer_found_in_paragraph: + classification = "๊ตญ์–ด" + else: + classification = "๋ถˆํ™•์‹ค" # ๋‘ ์กฐ๊ฑด ๋ชจ๋‘ ํ•ด๋‹นํ•˜์ง€ ์•Š๊ฑฐ๋‚˜ ๋ชจ๋‘ ํ•ด๋‹นํ•˜๋Š” ๊ฒฝ์šฐ + + classifications.append( + { + "id": record["id"], + "classification": classification, + "contains_social_keywords": contains_social_keywords, + "answer_found_in_paragraph": answer_found_in_paragraph, + "choices_found_in_paragraph": choices_found_in_paragraph, # ์„ ํƒ์ง€ ํฌํ•จ ์—ฌ๋ถ€ ์ถ”๊ฐ€ + } + ) + + return classifications + + +def main(): + file_path = 
"../data/train.csv" # ํŒŒ์ผ ๊ฒฝ๋กœ ์„ค์ • + data, records = load_data(file_path) + + classifications = classify_questions(records) + + # ๊ฒฐ๊ณผ๋ฅผ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์œผ๋กœ ๋ณ€ํ™˜ + result_df = pd.DataFrame(classifications) + + # ๊ฒฐ๊ณผ๋ฅผ CSV ํŒŒ์ผ๋กœ ์ €์žฅ + output_file_path = "../data/classification_results.csv" # ์ถœ๋ ฅ ํŒŒ์ผ ๊ฒฝ๋กœ ์„ค์ • + result_df.to_csv(output_file_path, index=False, encoding="utf-8-sig") # CSV๋กœ ์ €์žฅ + + logger.debug(f"๊ฒฐ๊ณผ๊ฐ€ {output_file_path}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + +if __name__ == "__main__": + main() diff --git a/code/trainer.py b/code/trainer.py new file mode 100644 index 0000000..5304a5c --- /dev/null +++ b/code/trainer.py @@ -0,0 +1,115 @@ +import evaluate +import numpy as np +from peft import LoraConfig +import torch +from transformers import EarlyStoppingCallback +from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer + + +class CustomTrainer: + def __init__(self, training_config, model, tokenizer, train_dataset, eval_dataset): + self.model = model + self.tokenizer = tokenizer + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.training_config = training_config + self.acc_metric = evaluate.load("accuracy") + self.int_output_map = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4} + + def train(self): + trainer = self._setup_trainer() + trainer.train() + return trainer.model + + def _setup_trainer(self): + # ๋ฐ์ดํ„ฐ ์ฝœ๋ ˆ์ดํ„ฐ ์„ค์ • + data_collator = DataCollatorForCompletionOnlyLM( + response_template=self.training_config["response_template"], + tokenizer=self.tokenizer, + ) + + # LoRA ์„ค์ • + peft_config = LoraConfig( + r=self.training_config["lora"]["r"], + lora_alpha=self.training_config["lora"]["lora_alpha"], + lora_dropout=self.training_config["lora"]["lora_dropout"], + target_modules=self.training_config["lora"]["target_modules"], + bias=self.training_config["lora"]["bias"], + task_type=self.training_config["lora"]["task_type"], + ) + + # SFT ์„ค์ • + sft_config = SFTConfig( + do_train=self.training_config["params"]["do_train"], + do_eval=self.training_config["params"]["do_eval"], + lr_scheduler_type=self.training_config["params"]["lr_scheduler_type"], + max_seq_length=self.training_config["params"]["max_seq_length"], + per_device_train_batch_size=self.training_config["params"]["per_device_train_batch_size"], + per_device_eval_batch_size=self.training_config["params"]["per_device_eval_batch_size"], + gradient_accumulation_steps=self.training_config["params"]["gradient_accumulation_steps"], + gradient_checkpointing=self.training_config["params"]["gradient_checkpointing"], + max_grad_norm=self.training_config["params"]["max_grad_norm"], + num_train_epochs=self.training_config["params"]["num_train_epochs"], + learning_rate=self.training_config["params"]["learning_rate"], + weight_decay=self.training_config["params"]["weight_decay"], + optim=self.training_config["params"]["optim"], + logging_strategy=self.training_config["params"]["logging_strategy"], + save_strategy=self.training_config["params"]["save_strategy"], + eval_strategy=self.training_config["params"]["eval_strategy"], + logging_steps=self.training_config["params"]["logging_steps"], + save_steps=self.training_config["params"]["save_steps"], + eval_steps=self.training_config["params"]["eval_steps"], + save_total_limit=self.training_config["params"]["save_total_limit"], + save_only_model=self.training_config["params"]["save_only_model"], + load_best_model_at_end=self.training_config["params"]["load_best_model_at_end"], + 
report_to=self.training_config["params"]["report_to"], + run_name=self.training_config["params"]["run_name"], + output_dir=self.training_config["params"]["output_dir"], + overwrite_output_dir=self.training_config["params"]["overwrite_output_dir"], + metric_for_best_model=self.training_config["params"]["metric_for_best_model"], + ) + + return SFTTrainer( + model=self.model, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + data_collator=data_collator, + tokenizer=self.tokenizer, + peft_config=peft_config, + args=sft_config, + compute_metrics=self._compute_metrics, + preprocess_logits_for_metrics=self._preprocess_logits_for_metrics, + callbacks=[ + EarlyStoppingCallback( + early_stopping_patience=self.training_config["params"]["early_stop_patience"], + early_stopping_threshold=self.training_config["params"]["early_stop_threshold"], + ) + ], + ) + + # ๋ชจ๋ธ์˜ logits๋ฅผ ์กฐ์ •ํ•˜์—ฌ ์ •๋‹ต ํ† ํฐ ๋ถ€๋ถ„๋งŒ ์ถœ๋ ฅํ•˜๋„๋ก ์„ค์ • + def _preprocess_logits_for_metrics(self, logits, labels): + logits = logits if not isinstance(logits, tuple) else logits[0] + logit_idx = [ + self.tokenizer.vocab["1"], + self.tokenizer.vocab["2"], + self.tokenizer.vocab["3"], + self.tokenizer.vocab["4"], + self.tokenizer.vocab["5"], + ] + return logits[:, -2, logit_idx] # -2: answer token, -1: eos token + + # metric ๊ณ„์‚ฐ ํ•จ์ˆ˜ + def _compute_metrics(self, evaluation_result): + logits, labels = evaluation_result + # ํ† ํฐํ™”๋œ ๋ ˆ์ด๋ธ” ๋””์ฝ”๋”ฉ + labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id) + labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + labels = [x.split("")[0].strip() for x in labels] + labels = [self.int_output_map[x] for x in labels] + + # softmax ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ logits ๋ณ€ํ™˜ + probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1) + predictions = np.argmax(probs, axis=-1) + + return self.acc_metric.compute(predictions=predictions, references=labels) diff --git a/code/utils/__init__.py b/code/utils/__init__.py new file mode 100644 index 0000000..04618c3 --- /dev/null +++ b/code/utils/__init__.py @@ -0,0 +1,13 @@ +""" +ํ”„๋กœ์ ํŠธ ์ „๋ฐ˜์— ์‚ฌ์šฉํ•˜๋Š” ์œ ํ‹ธ๋ฆฌํ‹ฐ ๋ชจ๋“ˆ์ž…๋‹ˆ๋‹ค. 
+ +## ์ฃผ์š” ๊ธฐ๋Šฅ +- hf_manager.py: ํ—ˆ๊น…ํŽ˜์ด์Šค์— ๋ชจ๋ธ/๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ +- gdrive_manager.py: config & output์„ ๊ตฌ๊ธ€ ๋“œ๋ผ์ด๋ธŒ๋กœ ์ž๋™ ์—…๋กœ๋“œ +- common.py: ์ธ์ž ๋ฐ ๋กœ๊น… ์„ค์ •์„ ์œ„ํ•œ ํ•จ์ˆ˜ ๋ชจ์Œ + +""" + +from .common import create_experiment_filename, load_config, load_env_file, log_config, set_logger, set_seed, timer +from .gdrive_manager import GoogleDriveManager +from .hf_manager import HuggingFaceHubManager diff --git a/code/utils/common.py b/code/utils/common.py new file mode 100644 index 0000000..ac5aa5b --- /dev/null +++ b/code/utils/common.py @@ -0,0 +1,106 @@ +import argparse +from contextlib import contextmanager +from datetime import datetime +import os +import random +import time + +from dotenv import load_dotenv +from loguru import logger +import numpy as np +import torch +import yaml +from zoneinfo import ZoneInfo + + +# ์ฝ”๋“œ ์ „์—ญ์—์„œ ์ฒซ ์‹คํ–‰ ์‹œ์ ์˜ ํƒ€์ž„์Šคํƒฌํ”„๋ฅผ ๋™์ผํ•˜๊ฒŒ ์‚ฌ์šฉ +CURRENT_TIME = None + + +def set_seed(seed=42): + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if use multi-GPU + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + np.random.seed(seed) + random.seed(seed) + + +def load_config(): + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, default="config.yaml") + args = parser.parse_args() + + with open(os.path.join("../config", args.config), encoding="utf-8") as f: + config = yaml.safe_load(f) + return config + + +def load_env_file(filepath="../config/.env"): + try: + # .env ํŒŒ์ผ ๋กœ๋“œ ์‹œ๋„ + if load_dotenv(filepath): + logger.debug(f".env ํŒŒ์ผ์„ ์„ฑ๊ณต์ ์œผ๋กœ ๋กœ๋“œํ–ˆ์Šต๋‹ˆ๋‹ค: {filepath}") + else: + raise FileNotFoundError # ํŒŒ์ผ์ด ์—†์œผ๋ฉด ์˜ˆ์™ธ ๋ฐœ์ƒ + except FileNotFoundError: + logger.debug(f"๊ฒฝ๊ณ : ์ง€์ •๋œ .env ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {filepath}") + except Exception as e: + logger.debug(f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: .env ํŒŒ์ผ ๋กœ๋“œ ์ค‘ ์˜ˆ์™ธ๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {e}") + + +def set_logger(log_file="../log/file.log", log_level="DEBUG"): + # ๋กœ๊ฑฐ ์„ค์ • + logger.add( + log_file, + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", + level=log_level, + rotation="12:00", # ๋งค์ผ 12์‹œ์— ์ƒˆ๋กœ์šด ๋กœ๊ทธ ํŒŒ์ผ ์ƒ์„ฑ + retention="7 days", # 7์ผ ํ›„ ๋กœ๊ทธ ์ œ๊ฑฐ + ) + + +# config ํ™•์ธ +def log_config(config, depth=0): + if depth == 0: + print("*" * 40) + for k, v in config.items(): + prefix = ["\t" * depth, k, ":"] + + if isinstance(v, dict): + print(*prefix) + log_config(v, depth + 1) + else: + prefix.append(v) + print(*prefix) + if depth == 0: + print("*" * 40) + + +def get_current_time(): + global CURRENT_TIME + if CURRENT_TIME is None: + CURRENT_TIME = datetime.now(ZoneInfo("Asia/Seoul")).strftime("%m%d%H%M") + return CURRENT_TIME + + +def create_experiment_filename(config): + if config is None: + config = load_config() + username = config["exp"]["username"] + base_model = config["model"]["base_model"].replace("/", "_") + train_path = config["data"]["train_path"] + train_name = os.path.splitext(os.path.basename(train_path))[0] + num_train_epochs = config["training"]["params"]["num_train_epochs"] + learning_rate = config["training"]["params"]["learning_rate"] + current_time = get_current_time() + + return f"{username}_{base_model}_{train_name}_{num_train_epochs}_{learning_rate}_{current_time}" + + +@contextmanager +def timer(name): + t0 = time.time() + yield + logger.debug(f"[{name}] done in {time.time() - t0:.3f} s") diff --git 
a/code/utils/gdrive_manager.py b/code/utils/gdrive_manager.py new file mode 100755 index 0000000..a882613 --- /dev/null +++ b/code/utils/gdrive_manager.py @@ -0,0 +1,196 @@ +import io +import json +import os.path + +from dotenv import load_dotenv +from google.auth.transport.requests import Request +from google.oauth2.credentials import Credentials +from google_auth_oauthlib.flow import InstalledAppFlow +from googleapiclient.discovery import build +from googleapiclient.http import MediaFileUpload, MediaIoBaseUpload +from loguru import logger +import pandas as pd + + +SCOPES = [ + "https://www.googleapis.com/auth/drive.file", + "https://www.googleapis.com/auth/drive", +] + + +class GoogleDriveManager: + def __init__(self): + config_folder = os.path.join(os.path.dirname(__file__), "..", "..", "config") + load_dotenv(os.path.join(config_folder, ".env")) + self.config_folder = config_folder + self.root_folder_id = os.getenv("GDRIVE_FOLDER_ID") + self.credentials = os.getenv("GDRIVE_CREDENTIALS") + self.token = os.getenv("GDRIVE_TOKEN") + self.is_create_token = os.getenv("GDRIVE_CREATE_TOKEN") + + # ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ๊ฒ€์ฆ ์ถ”๊ฐ€ + if not all([self.root_folder_id, self.credentials, self.token]): + logger.error(f"ํ•„์ˆ˜ ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. {[self.root_folder_id, self.credentials, self.token]}") + raise ValueError("ํ•„์ˆ˜ ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.") + + self.service = self.get_drive_service() + + def get_drive_service(self): + creds = None + if os.path.exists(self.token): + creds = Credentials.from_authorized_user_file(self.token, SCOPES) + + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + elif self.is_create_token == "true": + flow = InstalledAppFlow.from_client_secrets_file(self.credentials, scopes=SCOPES) + creds = flow.run_local_server(port=0, open_browser=False) + # ๋ฆฌํ”„๋ ˆ์‹œ ํ† ํฐ ์ €์žฅ + with open(self.token, "w") as token_file: + token_data = { + "refresh_token": creds.refresh_token, + "token": creds.token, + "token_uri": creds.token_uri, + "client_id": creds.client_id, + "client_secret": creds.client_secret, + "scopes": creds.scopes, + } + json.dump(token_data, token_file) + logger.info("๊ตฌ๊ธ€ ๋“œ๋ผ์ด๋ธŒ ํ† ํฐ์ด ๊ฐฑ์‹ ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + else: + logger.error("๊ตฌ๊ธ€ ๋“œ๋ผ์ด๋ธŒ ์—…๋กœ๋“œ ์‹คํŒจ. 
ํ† ํฐ์„ ๊ฐฑ์‹ ์„ ์š”์ฒญํ•˜์„ธ์š”.") + with open(self.token, "w") as token: + token.write(creds.to_json()) + return build("drive", "v3", credentials=creds) + + def find_folder_id_by_name(self, folder_name, parent_folder_id=None): + """ํด๋”๋ช…์œผ๋กœ ํด๋” ID ์ฐพ๊ธฐ""" + if not parent_folder_id: + parent_folder_id = self.root_folder_id + + # ํŠน์ • ํด๋”๋ช…๊ณผ ์ •ํ™•ํžˆ ์ผ์น˜ํ•˜๋Š” ํด๋” ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ + query = f"name='{folder_name}' and " + query += f"'{parent_folder_id}' in parents and " + query += "mimeType='application/vnd.google-apps.folder' and " + query += "trashed=false" + + try: + results = ( + self.service.files() + .list( + q=query, + spaces="drive", + fields="files(id, name)", + pageSize=1, # ์ฒซ ๋ฒˆ์งธ ์ผ์น˜ํ•˜๋Š” ํด๋”๋งŒ ํ•„์š” + ) + .execute() + ) + + files = results.get("files", []) + + if not files: + logger.info(f"ํด๋”๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {folder_name}") + return None + + return files[0]["id"] + + except Exception as e: + logger.info(f"ํด๋” ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}") + return None + + def list_folder_files(self, folder_id=None): + """ํด๋” ๋‚ด ํŒŒ์ผ ๋ชฉ๋ก ์กฐํšŒ""" + if not folder_id: + folder_id = self.root_folder_id + query = f"'{folder_id}' in parents and trashed=false" + + try: + results = ( + self.service.files() + .list( + q=query, + pageSize=100, + fields="nextPageToken, files(id, name, mimeType, modifiedTime, size)", + ) + .execute() + ) + + return results.get("files", []) + except Exception as e: + logger.info(f"Error listing files: {str(e)}") + return [] + + def upload_yaml_file(self, file_path, filename, folder_id=None): + """YAML ํŒŒ์ผ ๊ฒฝ๋กœ๋ฅผ ๋ฐ›์•„์„œ ์—…๋กœ๋“œ""" + try: + # ํŒŒ์ผ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์„ค์ • + file_metadata = {"name": filename, "mimeType": "application/x-yaml"} + if folder_id: + file_metadata["parents"] = [folder_id] + + # ๋ฏธ๋””์–ด ๊ฐ์ฒด ์ƒ์„ฑ + media = MediaFileUpload(file_path, mimetype="application/x-yaml", resumable=True) + + # ํŒŒ์ผ ์—…๋กœ๋“œ + file = self.service.files().create(body=file_metadata, media_body=media, fields="id, name").execute() + + logger.debug(f"Successfully uploaded {filename} to Google Drive") + return file + + except FileNotFoundError: + logger.error(f"File not found: {file_path}") + return None + except Exception as e: + logger.error(f"Error uploading YAML file: {str(e)}") + return None + + def upload_dataframe(self, dataframe, filename, folder_id=None): + """Pandas DataFrame ์ง์ ‘ ์—…๋กœ๋“œ""" + try: + # DataFrame์„ CSV ์ŠคํŠธ๋ฆผ์œผ๋กœ ๋ณ€ํ™˜ + buffer = io.StringIO() + dataframe.to_csv(buffer, index=False) + file_stream = io.BytesIO(buffer.getvalue().encode("utf-8")) + + # ํŒŒ์ผ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์„ค์ • + file_metadata = {"name": filename, "mimeType": "text/csv"} + if folder_id: + file_metadata["parents"] = [folder_id] + + # ๋ฏธ๋””์–ด ๊ฐ์ฒด ์ƒ์„ฑ + media = MediaIoBaseUpload(file_stream, mimetype="text/csv", resumable=True) + + # ํŒŒ์ผ ์—…๋กœ๋“œ + file = self.service.files().create(body=file_metadata, media_body=media, fields="id, name").execute() + return file + + except Exception as e: + logger.error(f"Error uploading DataFrame: {str(e)}") + return None + + def upload_exp(self, user_name, output_path, config_path=None): + df = pd.read_csv(output_path) + df_basename = os.path.basename(output_path) + + if config_path is None: + config_path = os.path.join(self.config_folder, "config.yaml") + config_basename = df_basename.replace("output.csv", "config.yaml") + + # ์‹คํ—˜์ž๋ช…์œผ๋กœ ํด๋”๋ช… ์ฐพ๊ธฐ + folder_id = self.find_folder_id_by_name(user_name) + _ = 
self.upload_dataframe(df, df_basename, folder_id) + _ = self.upload_yaml_file(config_path, config_basename, folder_id) + + gdrive_url = os.path.join("https://drive.google.com/drive/folders", folder_id) + logger.info(f"๊ตฌ๊ธ€ ๋“œ๋ผ์ด๋ธŒ์— ์—…๋กœ๋“œ ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {gdrive_url}") + + +if __name__ == "__main__": + os.chdir("..") + load_dotenv("../config/.env") + drive_manager = GoogleDriveManager() + # ํŒŒ์ผ ๋ชฉ๋ก ์กฐํšŒ + files = drive_manager.list_folder_files() + for file in files: + logger.info(f"Name: {file['name']}, ID: {file['id']}") diff --git a/code/utils/hf_manager.py b/code/utils/hf_manager.py new file mode 100755 index 0000000..976798f --- /dev/null +++ b/code/utils/hf_manager.py @@ -0,0 +1,81 @@ +import os + +from datasets import load_dataset +from dotenv import load_dotenv +from huggingface_hub import HfApi +from loguru import logger +from peft import AutoPeftModelForCausalLM +from transformers import AutoTokenizer + + +class HuggingFaceHubManager: + def __init__(self): + load_dotenv(os.path.join(os.path.dirname(__file__), "..", "..", "config", ".env")) + self.token = os.getenv("HF_TOKEN") + self.organization = os.getenv("HF_TEAM_NAME") + self.project_name = os.getenv("HF_PROJECT_NAME") + + # ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ๊ฒ€์ฆ ์ถ”๊ฐ€ + if not all([self.token, self.organization, self.project_name]): + raise ValueError("ํ•„์ˆ˜ ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.") + + def upload_model(self, model_name, username, checkpoint_path): + repo_id = f"{model_name}-{username}" + try: + model = AutoPeftModelForCausalLM.from_pretrained( + checkpoint_path, + trust_remote_code=True, + device_map="auto", + ) + tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True) + + model.push_to_hub(repo_id=repo_id, organization=self.organization, use_auth_token=self.token) + tokenizer.push_to_hub(repo_id=repo_id, organization=self.organization, use_auth_token=self.token) + logger.debug(f"your model pushed successfully in {repo_id}, hugging face") + except Exception as e: + logger.debug(f"An error occurred while uploading to Hugging Face: {e}") + + def upload_dataset(self, file_name, private=True): + """ + ํด๋” ๋‚ด ๋ฐ์ดํ„ฐ ํŒŒ์ผ๋“ค์„ Hugging Face Hub์— ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ์—…๋กœ๋“œํ•˜๋Š” ํ•จ์ˆ˜. + + Parameters: + - file_name (str): ์—…๋กœ๋“œํ•  ๋กœ์ปฌ ๋ฐ์ดํ„ฐ ํŒŒ์ผ ์ด๋ฆ„ + - token (str): Hugging Face ์•ก์„ธ์Šค ํ† ํฐ. ์“ฐ๊ธฐ ๊ถŒํ•œ ํ•„์š” + - private (bool): True๋ฉด ๋น„๊ณต๊ฐœ, False๋ฉด ๊ณต๊ฐœ ์„ค์ • + + Returns: + - None + """ + api = HfApi() + repo_id = f"{self.organization}/{self.project_name}-{file_name}" + + # ๋ฆฌํฌ์ง€ํ† ๋ฆฌ ์กด์žฌ ์—ฌ๋ถ€ ํ™•์ธ + try: + api.repo_info(repo_id, repo_type="dataset", token=self.token) + logger.debug(f"'{repo_id}' ๋ฆฌํฌ์ง€ํ† ๋ฆฌ๊ฐ€ ์ด๋ฏธ ์กด์žฌํ•ฉ๋‹ˆ๋‹ค. ๊ธฐ์กด ๋ฆฌํฌ์ง€ํ† ๋ฆฌ์— ๋ฐ์ดํ„ฐ์…‹์„ ์—…๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.") + except Exception: + # ๋ฆฌํฌ์ง€ํ† ๋ฆฌ๊ฐ€ ์—†์œผ๋ฉด ์ƒ์„ฑ + logger.debug(f"'{repo_id}' ๋ฆฌํฌ์ง€ํ† ๋ฆฌ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. 
์ƒˆ๋กœ ์ƒ์„ฑํ•œ ํ›„ ์—…๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.") + api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, token=self.token) + + # ํŒŒ์ผ ๊ฒฝ๋กœ ์„ค์ • + file_path = os.path.join("..", "data", f"{file_name}.csv") + if not os.path.exists(file_path): + logger.debug(f"ํŒŒ์ผ '{file_path}'์ด ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.") + return + + # ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ๋ฐ ์—…๋กœ๋“œ + dataset = load_dataset("csv", data_files={"train": file_path}) + dataset.push_to_hub(repo_id, token=self.token) + logger.debug(f"๋ฐ์ดํ„ฐ์…‹์ด '{repo_id}'์— ์—…๋กœ๋“œ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + +if __name__ == "__main__": + os.chdir("..") + load_dotenv("../config/.env") + logger.debug(f'{os.getenv("UPLOAD_MODEL_NAME")}, {os.getenv("USERNAME")}, {os.getenv("CHECKPOINT_PATH")}') + + hf_manager = HuggingFaceHubManager() + hf_manager.upload_model(os.getenv("UPLOAD_MODEL_NAME"), os.getenv("USERNAME"), os.getenv("CHECKPOINT_PATH")) + # hf_manager.upload_dataset(args.dataname, private=True) diff --git a/config/elastic_setting.json b/config/elastic_setting.json new file mode 100644 index 0000000..4e49153 --- /dev/null +++ b/config/elastic_setting.json @@ -0,0 +1,33 @@ +{ + "settings": { + "analysis": { + "filter": { + "my_shingle": { + "type": "shingle" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "nori_tokenizer", + "decompound_mode": "mixed", + "filter": ["my_shingle"] + } + }, + "similarity": { + "my_similarity": { + "type": "BM25" + } + } + } + }, + + "mappings": { + "properties": { + "document_text": { + "type": "text", + "analyzer": "my_analyzer" + } + } + } +} diff --git a/config/sample/config.yaml b/config/sample/config.yaml new file mode 100755 index 0000000..df99b8a --- /dev/null +++ b/config/sample/config.yaml @@ -0,0 +1,95 @@ +data: + train_path: "../data/train.csv" + test_path: "../data/test.csv" + processed_train_path: "../data/train_500_60to1_es.csv" # ๋ฏธ๋ฆฌ ์ „์ฒ˜๋ฆฌํ•œ ๋ฐ์ดํ„ฐ ์‚ฌ์šฉ: ๋น„์›Œ๋‘๋ฉด ๋™์ž‘ํ•˜์ง€ ์•Š์Œ + processed_test_path: "../data/test_500_60to1_es.csv" # ๋ฏธ๋ฆฌ ์ „์ฒ˜๋ฆฌํ•œ ๋ฐ์ดํ„ฐ ์‚ฌ์šฉ: ๋น„์›Œ๋‘๋ฉด ๋™์ž‘ํ•˜์ง€ ์•Š์Œ + max_seq_length: 2048 + test_size: 0.1 + retriever: + retriever_type: "Elasticsearch" # Elasticsearch + query_type: "p" # retrieve ์ฟผ๋ฆฌ ํƒ€์ž…: pqc, pq, pc, p + query_max_length: 500 # retrieve ๋Œ€์ƒ์ด ๋  ์ฟผ๋ฆฌ์˜ ์ตœ๋Œ€ ๊ธธ์ด: 250-500 ๊ถŒ์žฅ + result_max_length: 1500 # retrieve ๊ฒฐ๊ณผ ๋ฌธ์„œ์˜ ์ตœ๋Œ€ ๊ธธ์ด: 1500-2000 ๊ถŒ์žฅ + top_k: 60 # 60~80 + rerank_k: 1 # 0 ์ดํ•˜๋Š” reranker ๋™์ž‘ํ•˜์ง€ ์•Š์Œ + threshold: 0.2 # 0.2 ~ 0.5 + index_name: "two-wiki-index" # wiki-index, two-wiki-index, aihub-news-index + prompt: + start: "์ง€๋ฌธ:\n {paragraph}\n\n์งˆ๋ฌธ:\n {question}\n\n์„ ํƒ์ง€:\n {choices}\n\n" + start_with_plus: "์ง€๋ฌธ:\n {paragraph}\n\n์งˆ๋ฌธ:\n {question}\n\n<๋ณด๊ธฐ>:\n {question_plus}\n\n์„ ํƒ์ง€:\n {choices}\n\n" + mid: "" + mid_with_document: "ํžŒํŠธ:\n {document}\n\n" + end: "1, 2, 3, 4, 5 ์ค‘์— ํ•˜๋‚˜๋ฅผ ์ •๋‹ต์œผ๋กœ ๊ณ ๋ฅด์„ธ์š”.\n์ •๋‹ต:" + end_gen_cot: "1, 2, 3, 4, 5 ์ค‘์— ํ•˜๋‚˜๋ฅผ ์ •๋‹ต์œผ๋กœ ๊ณ ๋ฅด๊ธฐ ์œ„ํ•œ ๊ทผ๊ฑฐ๋ฅผ ์ฐจ๊ทผ์ฐจ๊ทผ ์ƒ๊ฐํ•ด๋ณด์„ธ์š”.\n๊ทผ๊ฑฐ:" + end_with_cot: "1, 2, 3, 4, 5 ์ค‘์— ํ•˜๋‚˜๋ฅผ ์ •๋‹ต์œผ๋กœ ๊ณ ๋ฅด์„ธ์š”.\n{cot}\n์ •๋‹ต:" + +model: + base_model: "beomi/gemma-ko-2b" + model: + torch_dtype: "float16" + low_cpu_mem_usage: true + use_cache: false # gradient_checkpointing์ด true๋ฉด false์—ฌ์•ผํ•จ + quantization: "" # BitsAndBytes, auto + bits: 8 # 8 or 4 + use_double_quant: false + tokenizer: + padding_side: "right" + chat_template: "{% if messages[0]['role'] == 'system' %}{% 
set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'user\n' + content + '\nmodel\n' }}{% elif message['role'] == 'assistant' %}{{ content + '\n' }}{% endif %}{% endfor %}" + +training: + response_template: "model" + lora: + r: 6 + lora_alpha: 8 + lora_dropout: 0.05 + target_modules: ["q_proj", "k_proj"] + bias: "none" + task_type: "CAUSAL_LM" + + params: + do_train: true + do_eval: true + lr_scheduler_type: "cosine" + max_seq_length: 2048 + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 1 + gradient_checkpointing: true + max_grad_norm: 0.3 + num_train_epochs: 3 + learning_rate: 2.0e-05 + weight_decay: 0.01 + optim: "adamw_torch" # ์–‘์žํ™”: adamw_bnb_8bit + logging_strategy: "steps" + save_strategy: "steps" + eval_strategy: "steps" + logging_steps: 300 + save_steps: 600 + eval_steps: 300 + save_total_limit: 4 + save_only_model: true + load_best_model_at_end: true # early_stop์„ ์œ„ํ•ด ํ•„์š” + report_to: "wandb" + run_name: "../outputs" # wandb ์„ธํŒ…์ด ์กด์žฌํ•œ๋‹ค๋ฉด ๋™์ ์œผ๋กœ ์ƒ์„ฑ๋ฉ๋‹ˆ๋‹ค. + output_dir: "../outputs" + overwrite_output_dir: true + metric_for_best_model: "accuracy" # early_stop ๊ธฐ์ค€ + early_stop_patience: 2 + early_stop_threshold: 0 + + +inference: + do_test: true + output_path: "../outputs/" + +log: + file: "../log/file.log" + level: "INFO" + +wandb: + project: generation_for_nlp + entity: hidong1015-nlp04 + +exp: + # ์‹คํ—˜์ž [sujin, seongmin, sungjae, gayeon, yeseo, minseo] + username: fubao diff --git a/config/sample/env-sample.txt b/config/sample/env-sample.txt new file mode 100644 index 0000000..dc8689d --- /dev/null +++ b/config/sample/env-sample.txt @@ -0,0 +1,17 @@ +# .env๋กœ ๋ณ€ํ™˜ํ•˜์—ฌ ์‚ฌ์šฉ +HF_TOKEN="" +HF_TEAM_NAME = "paper-company" +HF_PROJECT_NAME = "KSAT" + +GDRIVE_TOKEN="../config/token.json" +GDRIVE_CREDENTIALS="../config/credentials.json" +GDRIVE_FOLDER_ID ="" +GDRIVE_CREATE_TOKEN= "false" + +UPLOAD_MODEL_NAME = "1115-fubao-exaone3.0-base-v1" # ๋‚ ์งœ-์ด๋ฆ„-๋ฒ ์ด์Šค๋ชจ๋ธ-์‚ฌ์šฉ๋ฐ์ดํ„ฐ์…‹-๋ฒ„์ „ +USERNAME = "fubao" +CHECKPOINT_PATH="../outputs/checkpoint-9999" + +ELASTICSEARCH_URL="http://localhost:9200" +ELASTICSEARCH_ID="" +ELASTICSEARCH_PW="" diff --git a/data_aug/add_CoT.py b/data_aug/add_CoT.py new file mode 100644 index 0000000..417dd85 --- /dev/null +++ b/data_aug/add_CoT.py @@ -0,0 +1,57 @@ +from ast import literal_eval + +import dspy +import pandas as pd +from tqdm import tqdm + + +def process_csv(file_path, output_path, lm_api_key): + lm = dspy.LM("openai/gpt-4o", api_key=lm_api_key) + dspy.configure(lm=lm) + + df = pd.read_csv(file_path) + + records = [] + for _, row in df.iterrows(): + problems = literal_eval(row["problems"]) + record = { + "id": row["id"], + "paragraph": row["paragraph"], + "question": problems["question"], + "choices": problems["choices"], + "answer": problems.get("answer", None), + "question_plus": problems.get("question_plus", None), + } + records.append(record) + + df = pd.DataFrame(records) + + df["steps"] = None + data_list = [] + + for index, row in tqdm(df.iterrows(), total=len(df)): + input_data = { + "paragraph": row["paragraph"], + "question": row["question"], + "choices": row["choices"], + "answer": row["answer"], + } + classify = dspy.ChainOfThought("paragraph: str, question: str, choices: list, answer: str -> steps: list", n=1) + + input_data["question"] = 
f"{row['question']} ๋‹จ๊ณ„๋ณ„ ์„ค๋ช…(CoT)์„ ์‚ฌ์šฉํ•˜์—ฌ ์˜ฌ๋ฐ”๋ฅธ ๋‹ต์„ ๋„์ถœํ•˜์„ธ์š”." + response = classify(**input_data) + print("response.completions", response.completions) + data_list.append(response.completions) + + df["steps"] = data_list + + df.to_csv(output_path, index=False, encoding="utf-8-sig") + print(f"Updated CSV file saved to: {output_path}") + + +if __name__ == "__main__": + input_file_path = "" + output_file_path = "" + api_key = "" + + process_csv(input_file_path, output_file_path, api_key) diff --git a/data_aug/aug_philo.py b/data_aug/aug_philo.py new file mode 100644 index 0000000..67a210a --- /dev/null +++ b/data_aug/aug_philo.py @@ -0,0 +1,64 @@ +import os +import warnings + +from langchain.prompts import PromptTemplate +from langchain_openai import ChatOpenAI +import pandas as pd + + +def parse_output(output): + lines = output.split("\n") + data_list = [] + data = {"์ง€๋ฌธ": "", "๋ฌธ์ œ": "", "๋ณด๊ธฐ": "", "์ •๋‹ต": "", "ํ•ด์„ค": ""} + current_key = None + + for line in lines: + line = line.strip() + if line.startswith("์ง€๋ฌธ:"): + if data["์ง€๋ฌธ"]: + data_list.append(data.copy()) + data = {"์ง€๋ฌธ": "", "๋ฌธ์ œ": "", "๋ณด๊ธฐ": "", "์ •๋‹ต": "", "ํ•ด์„ค": ""} + current_key = "์ง€๋ฌธ" + data["์ง€๋ฌธ"] = line.replace("์ง€๋ฌธ:", "").strip() + elif line.startswith("๋ฌธ์ œ:"): + current_key = "๋ฌธ์ œ" + data["๋ฌธ์ œ"] = line.replace("๋ฌธ์ œ:", "").strip() + elif line.startswith("๋ณด๊ธฐ:"): + current_key = "๋ณด๊ธฐ" + data["๋ณด๊ธฐ"] = line.replace("๋ณด๊ธฐ:", "").strip() + elif line.startswith("์ •๋‹ต:"): + current_key = "์ •๋‹ต" + data["์ •๋‹ต"] = line.replace("์ •๋‹ต:", "").strip() + elif line.startswith("ํ•ด์„ค:"): + current_key = "ํ•ด์„ค" + data["ํ•ด์„ค"] = line.replace("ํ•ด์„ค:", "").strip() + elif current_key: + data[current_key] += " " + line + + if data["์ง€๋ฌธ"]: + data_list.append(data) + + return data_list + + +if __name__ == "__main__": + warnings.filterwarnings("ignore") + os.environ["OPENAI_API_KEY"] = "" + + # ์‚ฌ์šฉํ•  LLM ๋ชจ๋ธ ์„ค์ • + llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.9) + + # ํ”„๋กฌํ”„ํŠธ ํ…œํ”Œ๋ฆฟ ์„ค์ • + prompt_template = """""" + + prompt = PromptTemplate(input_variables=[], template=prompt_template) + + response = llm.invoke(prompt.format()) + output = response.content + + parsed_data_list = parse_output(output) + + df = pd.DataFrame(parsed_data_list) + + csv_filename = "philosophy_questions.csv" + df.to_csv(csv_filename, index=False, encoding="utf-8-sig") diff --git a/data_process/crawling_gichulpass.py b/data_process/crawling_gichulpass.py new file mode 100644 index 0000000..3b846b1 --- /dev/null +++ b/data_process/crawling_gichulpass.py @@ -0,0 +1,159 @@ +import json +import re + +from bs4 import BeautifulSoup +from datasets import load_dataset +import pandas as pd +import requests + + +def answer_symbol_to_int(symbol: str) -> int: + # ์ •๊ทœํ‘œํ˜„์‹์œผ๋กœ ํŠน์ˆ˜๋ฌธ์ž๋งŒ ์ถ”์ถœ + special_chars = re.findall(r"[โ‘ โ‘กโ‘ขโ‘ฃโ‘ค]", symbol) + cleaned_symbol = special_chars[0] if special_chars else symbol + + answer_map = {"โ‘ ": 1, "โ‘ก": 2, "โ‘ข": 3, "โ‘ฃ": 4, "โ‘ค": 5} + return answer_map.get(cleaned_symbol, -1) + + +def extract_question_data(soup, with_table=False): + questions_with_table = [] + questions_without_table = [] + + for item in soup.select("#examList > li"): + # ๋ฌธ์ œ ๋ฒˆํ˜ธ์™€ ์งˆ๋ฌธ + question_element = item.select_one(".pr_problem") + question_text = question_element.get_text(strip=True) + + # ํ…Œ์ด๋ธ” ์กด์žฌ ์—ฌ๋ถ€ ํ™•์ธ + has_table = False + + # ๋ฌธ์ œ์— ํ…Œ์ด๋ธ”์ด ์žˆ๋Š” ๊ฒฝ์šฐ + 
question_table = question_element.find("table") + if question_table: + has_table = True + question_text = {"text": question_text, "table_html": str(question_table)} + + # ๋ฌธ์ œ ์„ค๋ช… (์žˆ๋Š” ๊ฒฝ์šฐ) + example = item.select_one(".exampleCon") + example_text = example.get_text(strip=True) if example else "" + + # ์˜ˆ์‹œ์— ํ…Œ์ด๋ธ”์ด ์žˆ๋Š” ๊ฒฝ์šฐ + if example: + example_table = example.find("table") + if example_table: + has_table = True + example_text = {"text": example_text, "table_html": str(example_table)} + + # '๊ทธ๋ฆผ' ํฌํ•จ๋œ ๋ฌธ์ œ๋Š” ๊ฑด๋„ˆ๋›ฐ๊ธฐ + if isinstance(question_text, str) and "๊ทธ๋ฆผ" in question_text: + continue + if isinstance(example_text, str) and "๊ทธ๋ฆผ" in example_text: + continue + + # ์„ ํƒ์ง€๋ฅผ ๋ฆฌ์ŠคํŠธ๋กœ ์ €์žฅ + choices = [] + for choice in item.select(".questionCon li label"): + choices.append(choice.get_text(strip=True)) + + # ์ •๋‹ต ๋ฒˆํ˜ธ ์ถ”์ถœ + answer_element = item.select_one(".answer_num") + answer_text = answer_element.get_text(strip=True).strip() if answer_element else "" + answer = answer_symbol_to_int(answer_text) + + # ์ •๋‹ต ์„ค๋ช… + explanation = item.select_one(".answer_explan") + explanation_text = explanation.get_text(strip=True) if explanation else "" + + # ๋ฐ์ดํ„ฐ ์ €์žฅ + question_data = { + "question": question_text, + "paragraph": example_text, + "choices": json.dumps(choices, ensure_ascii=False), + "answer": answer, + "answer_explanation": explanation_text, + } + + # ํ…Œ์ด๋ธ” ์œ ๋ฌด์— ๋”ฐ๋ผ ๋‹ค๋ฅธ ๋ฆฌ์ŠคํŠธ์— ์ €์žฅ + if has_table: + questions_with_table.append(question_data) + else: + questions_without_table.append(question_data) + + return pd.DataFrame(questions_without_table) if not with_table else pd.DataFrame(questions_with_table) + + +def crawl_and_save(subject_code): + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} + + url = "https://gichulpass.com/bbs/board.php" + + # ํšŒ๊ณ„ํ•™ ๋ฌธ์ œ + if subject_code == 20: + wr_ids = [903, 27, 889, 887, 885, 884, 880] + + # ํ—Œ๋ฒ• ๋ฌธ์ œ + if subject_code == 26: + wr_ids = ( + [1136, 1063, 963, 953, 875, 849, 543, 537, 526, 515, 510, 500, 542, 536, 525, 514, 509] + + [499, 541, 535, 524, 513, 508, 498, 540, 534, 523, 512, 507, 497, 539, 533, 522, 511, 506] + + [496, 538, 532, 521, 505, 495, 531, 520, 504, 494, 530, 519, 503, 493, 529, 518, 502, 492] + + [528, 517, 501, 491, 527, 516, 490] + ) + + # ํ•œ๊ตญ์‚ฌ ๋ฌธ์ œ + if subject_code == 34: + # 9๊ธ‰๋งŒ ํ•„ํ„ฐ๋ง + wr_ids = ( + list(range(808, 814)) + + list(range(224, 243)) + + list(range(264, 269)) + + list(range(288, 298)) + + list(range(305, 326)) + + [16, 841, 870, 897, 897, 912, 962, 1012, 1026, 1053, 1167, 1172, 1173, 1176, 1177, 1184] + + [1205, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1257, 1262, 1357, 1367, 1368, 1404, 1405] + ) + + # ์‚ฌํšŒ ๋ฌธ์ œ + if subject_code == 35: + wr_ids = list(range(808, 840)) + [865, 894, 908, 1010, 1027, 1050] + + dfs = [] + for wr_id in wr_ids: + params = {"bo_table": "exam", "wr_id": wr_id, "subject": subject_code} + response = requests.get(url, params=params, headers=headers) + soup = BeautifulSoup(response.text, "html.parser") + df = extract_question_data(soup) + dfs.append(df) + + concated_df = pd.concat(dfs, axis=0) + len_df = len(concated_df) + concated_df.to_csv(f"gichulpass_{subject_code}_{len_df}_raw.csv", index=False, encoding="utf-8") + + +def check_KMMLU(input_file, output_file): + df = pd.read_csv(input_file, encoding="utf-8") + ds = load_dataset("HAERAE-HUB/KMMLU", "Korean-History") + # df์˜ None๊ฐ’์„ ๋นˆ ๋ฌธ์ž์—ด๋กœ 
๋ณ€ํ™˜ํ•˜๊ณ  ๋ชจ๋“  ๊ฐ’์„ ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜ + df = df.fillna("") + df = df.astype(str) + + # ๋ชจ๋“  split์˜ question์„ ํ•˜๋‚˜๋กœ ํ•ฉ์น˜๊ธฐ + questions = pd.concat([pd.DataFrame(ds["train"]), pd.DataFrame(ds["dev"]), pd.DataFrame(ds["test"])]) + + # ํฌํ•จ ์—ฌ๋ถ€๋ฅผ ํ™•์ธํ•˜๋Š” ํ•จ์ˆ˜ + def check_inclusion(column): + return column.apply(lambda x: any(x in question for question in questions["question"])) + + # paragraph์™€ question ๊ฐ๊ฐ ํ™•์ธ ํ›„ ๋‘˜ ์ค‘ ํ•˜๋‚˜๋ผ๋„ True๋ฉด True + df["include"] = check_inclusion(df["paragraph"]) | check_inclusion(df["question"]) + + new_df = df[not df["include"]] + new_df.to_csv(output_file, index=False) + + +if __name__ == "__main__": + crawl_and_save(subject_code=20) + crawl_and_save(subject_code=26) + crawl_and_save(subject_code=34) + crawl_and_save(subject_code=35) diff --git a/data_process/external_musr.py b/data_process/external_musr.py new file mode 100644 index 0000000..8167799 --- /dev/null +++ b/data_process/external_musr.py @@ -0,0 +1,63 @@ +from data_process.process_google_translate import TranslationCache, translate_column, translate_list_column +from datasets import load_dataset +from loguru import logger +import pandas as pd +from tqdm import tqdm + + +def dataset_to_pd(data_name): + data = load_dataset(data_name) + dfs = [ + pd.DataFrame(data["murder_mysteries"]), + pd.DataFrame(data["object_placements"]), + pd.DataFrame(data["team_allocation"]), + ] + return pd.concat(dfs, axis=0) + + +def process_data(df): + return pd.DataFrame( + { + "paragraph": df["narrative"], + "question": df["question"], + "choices": df["choices"], + "answer": df["answer_index"].apply(lambda x: x + 1), + } + ) + + +def process_external_datasets(dataset_name, output_filename): + df = dataset_to_pd(dataset_name) + df = process_data(df) + + # ๊ฐœํ–‰๋ฌธ์ž๋ฅผ \n ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜ + for col in df.columns: + if df[col].dtype == "object": # ๋ฌธ์ž์—ด ์ปฌ๋Ÿผ์— ๋Œ€ํ•ด์„œ๋งŒ ์ฒ˜๋ฆฌ + df[col] = df[col].str.replace("\n", "\\n") + + df.to_csv(output_filename, index=False) + + +def translate_df(input_filename, output_filename): + df = pd.read_csv(input_filename) + + logger.info("๋‹จ๋ฝ ๋ฒˆ์—ญ ์ค‘...") + with TranslationCache("paragraph_cache.json") as paragraph_cache: + df["paragraph"] = translate_column(df["paragraph"], paragraph_cache) + + logger.info("์งˆ๋ฌธ ๋ฒˆ์—ญ ์ค‘...") + with TranslationCache("question_cache.json") as question_cache: + df["question"] = translate_column(df["question"], question_cache) + + logger.info("์„ ํƒ์ง€ ๋ฒˆ์—ญ ์ค‘...") + with TranslationCache("choices_cache.json") as choices_cache: + tqdm.pandas(desc="์„ ํƒ์ง€ ๋ฒˆ์—ญ ์ค‘") + df["choices"] = df["choices"].apply(lambda x: translate_list_column(x, choices_cache)) + + df.to_csv(output_filename, index=False) + + +if __name__ == "__main__": + dataset_name = "TAUR-Lab/MuSR" + process_external_datasets(dataset_name, "MuSR_en_raw.csv") + translate_df("MuSR_en_raw.csv", "MuSR_ko_raw.csv") diff --git a/data_process/external_race.py b/data_process/external_race.py new file mode 100644 index 0000000..cd7f01c --- /dev/null +++ b/data_process/external_race.py @@ -0,0 +1,73 @@ +from datasets import load_dataset +import pandas as pd + + +def dataset_to_pd(data_name): + """์ฃผ์–ด์ง„ ๋ฐ์ดํ„ฐ์…‹ ์ด๋ฆ„์œผ๋กœ๋ถ€ํ„ฐ DataFrame์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.""" + dataset = load_dataset(data_name, "high", split="validation") # 'train', 'validation', 'test' ์ค‘ ์„ ํƒ + return pd.DataFrame(dataset) + + +def process_query_data(df): + """DataFrame์„ ์ฒ˜๋ฆฌํ•˜์—ฌ ํ•„์š”ํ•œ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค.""" + + # answer๋ฅผ 
A, B, C, D ํ˜•์‹์—์„œ 1, 2, 3, 4 ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ + def convert_answer(answer): + if answer == "A": + return 1 + elif answer == "B": + return 2 + elif answer == "C": + return 3 + elif answer == "D": + return 4 + else: + return None # ์˜ˆ์ƒ์น˜ ๋ชปํ•œ ๊ฐ’์— ๋Œ€ํ•œ ์ฒ˜๋ฆฌ + + df["article"] = ( + df["article"] + .str.replace(",", "") + .str.replace('""', "") + .str.replace(r"\. ", ".") + .str.replace(r'\." ', '."') + .str.replace(r"([.!?]) ", r"\1") + ) + # ๋ฌธ์ œ๋ฅผ ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜ํ•˜์—ฌ DataFrame ์ƒ์„ฑ + problems = df.apply( + lambda row: {"question": row["question"], "choices": row["options"], "answer": convert_answer(row["answer"])}, + axis=1, + ) + + return pd.DataFrame( + { + "id": df["example_id"], # ์˜ˆ์ œ์˜ ID + "paragraph": df["article"], # ์ •๋ฆฌ๋œ article ์‚ฌ์šฉ + "problems": problems.apply(str), # ๋ฌธ์ œ๋ฅผ ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜ + "question_plus": None, # question_plus๊ฐ€ ์›๋ณธ ๋ฐ์ดํ„ฐ์— ์—†๋‹ค๊ณ  ๊ฐ€์ • + } + ) + + +if __name__ == "__main__": + dataset_name = "ehovy/race" # ์‚ฌ์šฉํ•  ๋ฐ์ดํ„ฐ์…‹ ์ด๋ฆ„ + df = dataset_to_pd(dataset_name) # ๋ฐ์ดํ„ฐ์…‹์„ DataFrame์œผ๋กœ ๋ณ€ํ™˜ + processed_df = process_query_data(df) # ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ + + # ๊ฒฐ๊ณผ๋ฅผ CSV ํŒŒ์ผ๋กœ ์ €์žฅ (๋”ฐ์˜ดํ‘œ ์ฒ˜๋ฆฌ) + processed_df.to_csv("processed_race_dataset.csv", index=False) + + import re + + import pandas as pd + + # CSV ํŒŒ์ผ ์ฝ๊ธฐ + df = pd.read_csv("processed_race_dataset.csv") + + # paragraph ์—ด๋งŒ ์ˆ˜์ • + df["paragraph"] = df["paragraph"].apply(lambda x: re.sub(r"\n", "", x)) + df["paragraph"] = df["paragraph"].apply(lambda x: re.sub(r"\. ", ".", x)) + df["paragraph"] = df["paragraph"].apply(lambda x: re.sub(r'\." ', '."', x)) + df["paragraph"] = df["paragraph"].apply(lambda x: re.sub(r"([.!?]) ", r"\1", x)) + + # ์ˆ˜์ •๋œ DataFrame์„ ๋‹ค์‹œ CSV๋กœ ์ €์žฅ + df.to_csv("modified_file.csv", index=False) diff --git a/data_process/external_sat_gaokao.py b/data_process/external_sat_gaokao.py new file mode 100644 index 0000000..904b285 --- /dev/null +++ b/data_process/external_sat_gaokao.py @@ -0,0 +1,86 @@ +from datasets import load_dataset +import pandas as pd + + +def dataset_to_pd(data_name): + data = load_dataset(data_name) + return pd.DataFrame(data["test"]) + + +def process_query_data(input_df): + def _split_query_data(df): + paragraphs = [] + questions = [] + + for index, row in df.iterrows(): + text = row["query"] + # Paragraph์™€ ๋‚˜๋จธ์ง€ ํ…์ŠคํŠธ ๋ถ„๋ฆฌ + paragraph, rest = text.split("Q:", 1) + # Question๊ณผ Answer Choices ๋ถ„๋ฆฌ + question, choices = rest.split("Answer Choices: ", 1) + + paragraphs.append(paragraph.strip()) + questions.append(question.strip()) + + return pd.DataFrame( + { + "paragraph": paragraphs, + "question": questions, + "choices": df["choices"], + "answer": df["gold"].apply(lambda x: x[0] + 1), # gold ๋ฐฐ์—ด์„ ํ’€๊ณ  +1 + } + ) + + def _split_answer_choices(df): + def process_choices(choices_list): # ๋ฆฌ์ŠคํŠธ ํ˜•ํƒœ๋กœ ์ž…๋ ฅ ๋ฐ›์Œ + new_choices = [] + for choice in choices_list: + new_choice = ( + choice.replace("(A)", "") + .replace("(B)", "") + .replace("(C)", "") + .replace("(D)", "") + .replace("(E)", "") + ) + new_choices.append(new_choice.strip()) + return new_choices + + df["choices"] = df["choices"].apply(process_choices) + return df + + split_df = _split_query_data(input_df) + final_df = _split_answer_choices(split_df) + return final_df + + +def process_and_concat_external_datasets(dataset_names, output_filename): + dfs = [] + for dataset_name in dataset_names: + df = dataset_to_pd(dataset_name) + df = process_query_data(df) + 
dfs.append(df) + + concated_df = pd.concat(dfs, axis=0) + concated_df.to_csv(output_filename, index=False) + + +def clean_string(text): + # ๋ฌธ์ž์—ด ๋‚ด๋ถ€์˜ ๋ชจ๋“  ํฐ๋”ฐ์˜ดํ‘œ๋ฅผ ์ž‘์€๋”ฐ์˜ดํ‘œ๋กœ ๋ณ€ํ™˜ + text = text.replace('"', "'") + # ์—ฐ์†๋œ ํฐ๋”ฐ์˜ดํ‘œ๋ฅผ ํ•˜๋‚˜๋กœ ๋ณ€ํ™˜ + while "''" in text: + text = text.replace("''", "'") + text = text.strip() # ์•ž๋’ค ๊ณต๋ฐฑ ์ œ๊ฑฐ + return text + + +if __name__ == "__main__": + dataset_names = [ + "dmayhem93/agieval-sat-en", + "dmayhem93/agieval-logiqa-en", + "dmayhem93/agieval-lsat-rc", + "dmayhem93/agieval-lsat-lr", + "dmayhem93/agieval-lsat-ar", + "dmayhem93/agieval-gaokao-english", + ] + process_and_concat_external_datasets(dataset_names, "sat_gaokao_en_raw.csv") diff --git a/data_process/pdf_to_txt.py b/data_process/pdf_to_txt.py new file mode 100644 index 0000000..61889a7 --- /dev/null +++ b/data_process/pdf_to_txt.py @@ -0,0 +1,29 @@ +import os +import re + +from pdfminer.high_level import extract_text + + +def split_text_by_keyword(text, keyword): + sections = re.split(rf"{keyword}", text) + sections = [section.strip() + keyword for section in sections[:-1]] + [sections[-1].strip()] + return sections + + +def save_sections_to_files(sections, output_dir="sections"): + os.makedirs(output_dir, exist_ok=True) + for i, section in enumerate(sections): + file_name = os.path.join(output_dir, f"section_{i+1}.txt") + with open(file_name, "w", encoding="utf-8") as f: + f.write(section) + print(f"Sections saved to '{output_dir}' directory.") + + +if __name__ == "__main__": + pdf_file_path = "./data/test/2025.pdf" + output_dir = "./data/test/sections" + keyword = "๋‹ตํ•˜์‹œ์˜ค" + + text = extract_text(pdf_file_path) + split_text = split_text_by_keyword(text, keyword) + save_sections_to_files(split_text, output_dir=output_dir) diff --git a/data_process/process_balance_choices.py b/data_process/process_balance_choices.py new file mode 100644 index 0000000..4f1fa39 --- /dev/null +++ b/data_process/process_balance_choices.py @@ -0,0 +1,60 @@ +from ast import literal_eval +import random + +import pandas as pd + + +def balance_choices_dataset(file_path): + # CSV ํŒŒ์ผ ์ฝ๊ธฐ + df = pd.read_csv(file_path) + df["problems"] = df["problems"].apply(literal_eval) + + # ์„ ํƒ์ง€์™€ ๋‹ต ๊ต์ฒด ํ•จ์ˆ˜ + def swap_choices_and_answer(df): + for index, row in df.iterrows(): + problems = row["problems"] + choices = problems["choices"] + answer = problems["answer"] + # ์„ ํƒ์ง€ ๋žœ๋ค ์„ž๊ธฐ + shuffled_choices = choices[:] + random.shuffle(shuffled_choices) + # ์ƒˆ๋กœ์šด ๋‹ต ์ธ๋ฑ์Šค ๊ณ„์‚ฐ + new_answer = shuffled_choices.index(choices[answer - 1]) + # ๊ต์ฒด๋œ ์„ ํƒ์ง€์™€ ๋‹ต์œผ๋กœ ์—…๋ฐ์ดํŠธ + problems["choices"] = shuffled_choices + problems["answer"] = new_answer + 1 + df.at[index, "problems"] = problems + return df + + # ์„ ํƒ์ง€์™€ ๋‹ต ๊ต์ฒด ์ ์šฉ + return swap_choices_and_answer(df) + + +def answer_counts(file_path): + df = pd.read_csv(file_path) + df["problems"] = df["problems"].apply(literal_eval) + + records = [] + for idx, row in df.iterrows(): + problems = row["problems"] + record = { + "id": row["id"], + "paragraph": row["paragraph"], + "question": problems["question"], + "choices": problems["choices"], + "answer": problems.get("answer", None), + "question_plus": problems.get("question_plus", None), + } + records.append(record) + + processed_df = pd.DataFrame(records) + print(len(processed_df)) + + print(processed_df["choices"].apply(len).value_counts()) + print(processed_df["answer"].value_counts()) + + +if __name__ == "__main__": + 
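# Shuffle every question's choices, re-map the 1-based answer to its new position, then print the answer distribution to confirm the labels are balanced.
+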
train_balanced = balance_choices_dataset("../data/train.csv") + train_balanced.to_csv("../data/train_balanced.csv", index=False) + answer_counts("../data/train_balanced.csv") diff --git a/data_process/process_formatting.py b/data_process/process_formatting.py new file mode 100644 index 0000000..4dd9f87 --- /dev/null +++ b/data_process/process_formatting.py @@ -0,0 +1,29 @@ +import pandas as pd + + +def formatting(suffix, input_filename, output_filename): + # CSV ํŒŒ์ผ ์ฝ๊ธฐ + df = pd.read_csv(input_filename, encoding="utf-8") + + # ์ƒˆ๋กœ์šด ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ + new_records = [] + for idx, row in df.iterrows(): + new_record = { + "id": f"external-data-{suffix}{idx + 1}", + "paragraph": "" if pd.isna(row["paragraph"]) else str(row["paragraph"]), + "problems": {"question": row["question"].strip(), "choices": eval(row["choices"]), "answer": row["answer"]}, + } + new_records.append(new_record) + + # ์ƒˆ๋กœ์šด DataFrame ์ƒ์„ฑ + new_df = pd.DataFrame(new_records) + new_df.to_csv(output_filename, index=False) + + +if __name__ == "__main__": + formatting("gichulpass20-", "gichulpass_20_107_raw.csv", "gichulpass_20_107.csv") + formatting("gichulpass26-", "gichulpass_26_1319_raw.csv", "gichulpass_24_1319.csv") + formatting("gichulpass34-", "gichulpass_34_1352_raw.csv", "gichulpass_34_1352.csv") + formatting("gichulpass35-", "gichulpass_35_568_raw.csv", "gichulpass_35_568.csv") + formatting("SAT", "sat_gaokao_ko_raw.csv", "sat_gaokao_ko.csv") + formatting("MuSR", "MuSR_ko_raw.csv", "MuSR_ko.csv") diff --git a/data_process/process_google_translate.py b/data_process/process_google_translate.py new file mode 100644 index 0000000..cf62868 --- /dev/null +++ b/data_process/process_google_translate.py @@ -0,0 +1,80 @@ +from ast import literal_eval +import json +import os +import time + +from googletrans import Translator +from loguru import logger +from tqdm import tqdm + + +class TranslationCache: + def __init__(self, cache_file="translation_cache.json"): + self.cache_file = cache_file + self.cache = {} + + def __enter__(self): + if os.path.exists(self.cache_file): + with open(self.cache_file, "r", encoding="utf-8") as f: + self.cache = json.load(f) + return self + + def __exit__(self, exc_type, exc_value, traceback): + with open(self.cache_file, "w", encoding="utf-8") as f: + json.dump(self.cache, f, ensure_ascii=False, indent=2) + self.cache.clear() # ๋ฉ”๋ชจ๋ฆฌ ํ•ด์ œ + + def get_translation(self, text): + return self.cache.get(text) + + def add_translation(self, text, translation): + self.cache[text] = translation + + +def translate_with_retry(translator, text, cache, max_retries=3): + # ์บ์‹œ์—์„œ ๋ฒˆ์—ญ ํ™•์ธ + cached_translation = cache.get_translation(text) + if cached_translation: + return cached_translation + + # ์บ์‹œ์— ์—†๋Š” ๊ฒฝ์šฐ ๋ฒˆ์—ญ ์ˆ˜ํ–‰ + for attempt in range(max_retries): + try: + translated = translator.translate(text, src="en", dest="ko").text + time.sleep(0.01) + cache.add_translation(text, translated) + return translated + except Exception as e: + if attempt == max_retries - 1: + logger.info(f"๋ฒˆ์—ญ ์‹คํŒจ: {str(e)}") + return text + time.sleep(1) + + +def translate_list_column(text, cache): + items = literal_eval(text) + translator = Translator() + translated_items = [] + + for item in items: + translated = translate_with_retry(translator, item, cache) + translated_items.append(translated) + return translated_items + + +def translate_column(texts, cache): + translator = Translator() + translated_texts = [] + + # ์ค‘๋ณต ์ œ๊ฑฐ๋ฅผ ์œ„ํ•ด ์œ ๋‹ˆํฌํ•œ ํ…์ŠคํŠธ๋งŒ ์ถ”์ถœ + 
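# First pass below translates each unique string once (translate_with_retry stores the result in the cache);
+ # the second pass re-reads the cache in the original row order, so duplicate texts cost only one API call.
+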
unique_texts = list(set(texts)) + + for text in tqdm(unique_texts, desc="๊ณ ์œ  ํ…์ŠคํŠธ ๋ฒˆ์—ญ ์ค‘"): + translated = translate_with_retry(translator, text, cache) + + # ์›๋ณธ ์ˆœ์„œ๋Œ€๋กœ ์บ์‹œ์—์„œ ๋ฒˆ์—ญ ๊ฐ€์ ธ์˜ค๊ธฐ + for text in texts: + translated = cache.get_translation(text) + translated_texts.append(translated) + + return translated_texts diff --git a/data_viz/csv2pdf.py b/data_viz/csv2pdf.py new file mode 100644 index 0000000..c08e0fb --- /dev/null +++ b/data_viz/csv2pdf.py @@ -0,0 +1,132 @@ +import argparse +import ast +import os +import sys +import textwrap + +import pandas as pd +from reportlab.lib.pagesizes import A4 +from reportlab.lib.units import cm +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfgen import canvas + + +def draw_wrapped_text(c, text, x, y, max_width, max_height, line_height=14): + """ํ…์ŠคํŠธ๋ฅผ ํŽ˜์ด์ง€ ๋„ˆ๋น„์— ๋งž๊ฒŒ ์ค„๋ฐ”๊ฟˆํ•˜์—ฌ ์ถœ๋ ฅํ•˜๋ฉฐ, ํŽ˜์ด์ง€ ๋†’์ด๋ฅผ ์ดˆ๊ณผํ•˜๋ฉด ํŽ˜์ด์ง€๋ฅผ ๋„˜๊น€.""" + wrapped_text = textwrap.fill(text, width=70) + text_obj = c.beginText(x, y) + text_obj.setFont("NanumGothic", 10) + text_obj.setLeading(line_height) + + for line in wrapped_text.splitlines(): + if text_obj.getY() < max_height: + c.drawText(text_obj) + c.showPage() + text_obj = c.beginText(x, A4[1] - 3 * cm) + text_obj.setFont("NanumGothic", 10) + text_obj.setLeading(line_height) + text_obj.textLine(line) + + c.drawText(text_obj) + + +def create_csat_style_pdf(data, filename): + c = canvas.Canvas(filename, pagesize=A4) + width, height = A4 + + for index, row in data.iterrows(): + try: + # ๋ฌธ์ œ ID ์ถœ๋ ฅ + c.setFont("NanumGothic", 12) + c.drawString(1 * cm, height - 1 * cm, f"๋ฌธ์ œ ID: {row['id']}") + + # ๋ณธ๋ฌธ ์ถœ๋ ฅ + paragraph = row["paragraph"] + draw_wrapped_text(c, paragraph, 1 * cm, height - 2 * cm, max_width=85, max_height=14 * cm) + + # ๋ฌธ์ œ ์ถœ๋ ฅ + problem_data = ast.literal_eval(row["problems"]) + question = problem_data["question"] + draw_wrapped_text( + c, + f"๋ฌธ์ œ: {question}", + 1 * cm, + height - 16 * cm, + max_width=85, + max_height=8 * cm, + ) + + # ์„ ํƒ์ง€ ์ถœ๋ ฅ + choices = problem_data["choices"] + choice_y = height - 19 * cm + for i, choice in enumerate(choices, 1): + draw_wrapped_text( + c, + f"{i}. {choice}", + 1 * cm, + choice_y, + max_width=85, + max_height=3 * cm, + ) + choice_y -= 1.2 * cm + + # ์ •๋‹ต ํ‘œ์‹œ + answer = problem_data["answer"] + draw_wrapped_text( + c, + f"์ •๋‹ต: {answer}", + 1 * cm, + choice_y - 1 * cm, + max_width=85, + max_height=3 * cm, + ) + + c.showPage() + except KeyError as ke: + print(f"Data format error: ํ•„์ˆ˜ ํ‚ค๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. 
{ke}") + + # PDF ์ €์žฅ + c.save() + + +if __name__ == "__main__": + # ์ธ์ž ํŒŒ์„œ ์„ค์ • + parser = argparse.ArgumentParser(description="Generate a CSAT style PDF from CSV data.") + parser.add_argument( + "--csv_path", + default="../data/train.csv", + help="Path to the CSV file containing the data.", + ) + args = parser.parse_args() + + # CSV ํŒŒ์ผ ์ฝ๊ธฐ ๋ฐ ์ปฌ๋Ÿผ ํ™•์ธ + try: + df = pd.read_csv(args.csv_path) + required_columns = {"id", "paragraph", "problems"} + if not required_columns.issubset(df.columns): + missing_columns = required_columns - set(df.columns) + raise ValueError(f"CSV ํŒŒ์ผ์— ํ•„์š”ํ•œ ์ปฌ๋Ÿผ์ด ์—†์Šต๋‹ˆ๋‹ค: {', '.join(missing_columns)}") + except FileNotFoundError: + print(f"CSV ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {args.csv_path}") + sys.exit(1) + except ValueError as ve: + print(ve) + sys.exit(1) + + # ํ•œ๊ธ€ ํฐํŠธ ๋“ฑ๋ก + font_path = os.path.abspath("../data/NanumGothic.ttf") + if not os.path.isfile(font_path): + print( + f"ํฐํŠธ ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {font_path}" + f"https://hangeul.naver.com/fonts/search?f=nanum ์—์„œ ํฐํŠธ๋ฅผ ๋‹ค์šด๋ฐ›์•„ dataํด๋”์— ๋„ฃ์–ด์ฃผ์„ธ์š”." + ) + sys.exit(1) + + pdfmetrics.registerFont(TTFont("NanumGothic", font_path)) + + # PDF ํŒŒ์ผ๋ช… ์„ค์ • (์ž…๋ ฅ ํŒŒ์ผ๊ณผ ๋™์ผ ๊ฒฝ๋กœ ๋ฐ ์ด๋ฆ„์œผ๋กœ ์„ค์ •, ํ™•์žฅ์ž๋งŒ .pdf๋กœ ๋ณ€๊ฒฝ) + pdf_filename = os.path.splitext(args.csv_path)[0] + ".pdf" + + # PDF ์ƒ์„ฑ ํ•จ์ˆ˜ ํ˜ธ์ถœ + create_csat_style_pdf(df, pdf_filename) diff --git a/data_viz/labeling.py b/data_viz/labeling.py new file mode 100644 index 0000000..9c60556 --- /dev/null +++ b/data_viz/labeling.py @@ -0,0 +1,98 @@ +from ast import literal_eval + +import pandas as pd +import streamlit as st + + +def load_data(file_path): + data = pd.read_csv(file_path) + records = [] + for _, row in data.iterrows(): + problems = literal_eval(row["problems"]) + record = { + "id": row["id"], + "paragraph": row["paragraph"], + "question": problems["question"], + "choices": problems["choices"], + "answer": problems.get("answer", None), + "question_plus": problems.get("question_plus", None), + "target": problems.get("target", None), + "suggested_label": problems.get("suggested_label", None), + "is_label_issue": problems.get("is_label_issue", None), + } + records.append(record) + return data, records + + +def display_instance(record): + st.subheader("Paragraph") + st.write(record["paragraph"]) + + st.subheader("Question, Choices, Answer") + st.markdown("#### Question:") + st.write(record["question"]) + + st.markdown("#### Choices:") + for i, choice in enumerate(record["choices"], 1): + st.write(f"{i} : {choice}") + + st.markdown("#### Answer:") + st.write(str(record["answer"])) + + st.subheader("Question Plus") + st.write(str(record["question_plus"])) + + +def main(): + st.title("CSV ๋ฐ์ดํ„ฐ ์ธ์Šคํ„ด์Šค ๋ทฐ์–ด") + + data, records = load_data("../data/cleaned_output_with_labels_CL.csv") + + # 1055๋ฒˆ์งธ ์ธ๋ฑ์Šค๋ฅผ ๊ธฐ์ค€์œผ๋กœ ๋ฐ์ดํ„ฐ ๋ถ„ํ•  + split_index = 792 + before_split = data.iloc[:split_index] + after_split = data.iloc[split_index:] + + # 1055 ์ด์ „๊ณผ ์ดํ›„์˜ suggested_label ๊ฐœ์ˆ˜ ๊ณ„์‚ฐ + label_counts = { + "Before 1380": { + "Label 0": (before_split["suggested_label"] == 0).sum(), + "Not Label 0": (before_split["suggested_label"] != 0).sum(), + }, + "After 1380": { + "Label 1": (after_split["suggested_label"] == 1).sum(), + "Not Label 1": (after_split["suggested_label"] != 1).sum(), + }, + } + + # ๊ฒฐ๊ณผ๋ฅผ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์œผ๋กœ ๋ณ€ํ™˜ + label_counts_df = pd.DataFrame(label_counts) + + # ๋ผ๋ฒจ ๊ฐœ์ˆ˜ 
์ถœ๋ ฅ + st.subheader("Suggested Label ๊ฐœ์ˆ˜") + st.write(label_counts_df) + + # Before Split์—์„œ suggested_label์ด 1์ธ ํ–‰ ํ•„ํ„ฐ๋ง + before_split_label_1 = before_split[before_split["suggested_label"] == 1] + + # After Split์—์„œ suggested_label์ด 1์ธ ํ–‰ ํ•„ํ„ฐ๋ง + after_split_label_1 = after_split[after_split["suggested_label"] == 0] + + # ๊ฒฐ๊ณผ ์ถœ๋ ฅ + st.subheader("Before 1380์—์„œ suggested_label์ด 1์ธ ์ธ์Šคํ„ด์Šค") + st.write(before_split_label_1) + + st.subheader("After 1380์—์„œ suggested_label์ด 0์ธ ์ธ์Šคํ„ด์Šค") + st.write(after_split_label_1) + + # ์ธ์Šคํ„ด์Šค ์„ ํƒ ๊ธฐ๋Šฅ ์ถ”๊ฐ€ + instance_index = st.number_input("์ธ์Šคํ„ด์Šค ์„ ํƒ", min_value=0, max_value=len(data) - 1, value=0, step=1) + + st.write(f"์„ ํƒ๋œ ์ธ์Šคํ„ด์Šค (์ธ๋ฑ์Šค {instance_index}):") + st.write(data.iloc[instance_index]) + + display_instance(records[instance_index]) + + +if __name__ == "__main__": + main() diff --git a/data_viz/streamlit_app.py b/data_viz/streamlit_app.py new file mode 100644 index 0000000..51153d9 --- /dev/null +++ b/data_viz/streamlit_app.py @@ -0,0 +1,71 @@ +from ast import literal_eval +import re + +import pandas as pd +import streamlit as st + + +def load_data(file_path): + data = pd.read_csv(file_path) + records = [] + for _, row in data.iterrows(): + problems = literal_eval(row["problems"]) + record = { + "id": row["id"], + "paragraph": row["paragraph"], + "question": problems["question"], + "choices": problems["choices"], + "answer": problems.get("answer", None), + "question_plus": problems.get("question_plus", None), + "documents": row.get("documents", None), + } + records.append(record) + return data, records + + +def display_instance(left, right, record): + with left: + st.subheader("Paragraph") + st.write(record["paragraph"]) + + st.subheader("Question, Choices, Answer") + st.markdown("#### Question:") + st.write(record["question"]) + + st.markdown("#### Choices:") + for i, choice in enumerate(record["choices"], 1): + if i == record["answer"]: + st.markdown(f"{i} : {choice}", unsafe_allow_html=True) + else: + st.write(f"{i} : {choice}") + + st.markdown("#### Answer:") + st.write(str(record["answer"])) + + st.subheader("Question Plus") + st.write(str(record["question_plus"])) + with right: + st.subheader("Documents") + result = re.split(r"(?=\[)", str(record["documents"])) + for part in result: + st.write(part) + + +def main(file_path="../data/train.csv"): + st.set_page_config(layout="wide") + st.title("CSV ๋ฐ์ดํ„ฐ ์ธ์Šคํ„ด์Šค ๋ทฐ์–ด") + data, records = load_data(file_path) + + left, right = st.columns([0.5, 0.5]) + with left: + instance_index = st.number_input("์ธ์Šคํ„ด์Šค ์„ ํƒ", min_value=0, max_value=len(data) - 1, value=0, step=1) + + display_instance(left, right, records[instance_index]) + + with left: + st.write(f"์„ ํƒ๋œ ์ธ์Šคํ„ด์Šค (์ธ๋ฑ์Šค {instance_index}):") + st.write(data.iloc[instance_index]) + + +if __name__ == "__main__": + main("../data/train_retrieve.csv") diff --git a/ensemble/hard_voting.py b/ensemble/hard_voting.py new file mode 100644 index 0000000..8965761 --- /dev/null +++ b/ensemble/hard_voting.py @@ -0,0 +1,90 @@ +from collections import Counter +import csv +from glob import glob + + +""" +# ํ•˜๋“œ ๋ณดํŒ… ์•™์ƒ๋ธ” ์‚ฌ์šฉ ๋ฐฉ๋ฒ• (CSV ๋ฒ„์ „) + +1. ํŒŒ์ผ ์ค€๋น„: + - 'ensemble/results_hard' ํด๋” ์•ˆ์— ์•™์ƒ๋ธ”ํ•˜๊ณ  ์‹ถ์€ ๋ชจ๋“  CSV ํŒŒ์ผ๋“ค์„ ๋„ฃ์Šต๋‹ˆ๋‹ค. + +2. ์šฐ์„ ์ˆœ์œ„ ์„ค์ •: + - 'priority_order' ๋ฆฌ์ŠคํŠธ์— ๋ชจ๋ธ์˜ ์šฐ์„ ์ˆœ์œ„๋ฅผ ์ •์˜ํ•ฉ๋‹ˆ๋‹ค. 
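+ - Caveat: as written, the prediction dicts loaded below never receive a "filename" key, so on a tie the priority lookup finds no match and the first most-common answer is kept.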
+ - ์˜ˆ: priority_order = ['predictions1.csv', 'predictions2.csv', 'predictions3.csv'] + - ๋ฆฌ์ŠคํŠธ์˜ ์•ž์ชฝ์— ์žˆ๋Š” ๋ชจ๋ธ์ผ์ˆ˜๋ก ๋†’์€ ์šฐ์„ ์ˆœ์œ„๋ฅผ ๊ฐ€์ง‘๋‹ˆ๋‹ค. + +3. ์ฝ”๋“œ ์‹คํ–‰: + - ์„ค์ •์„ ๋งˆ์นœ ํ›„ ์ฝ”๋“œ๋ฅผ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค. + - ์ฝ”๋“œ๋Š” ์ž๋™์œผ๋กœ ํด๋” ๋‚ด์˜ ๋ชจ๋“  CSV ํŒŒ์ผ์„ ์ฝ์–ด ์•™์ƒ๋ธ”์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค. + +4. ํ•˜๋“œ ๋ณดํŒ… ๊ณผ์ •: + - ๊ฐ ์งˆ๋ฌธ์— ๋Œ€ํ•ด ๋ชจ๋“  ๋ชจ๋ธ์˜ ๋‹ต๋ณ€์„ ์ˆ˜์ง‘ํ•ฉ๋‹ˆ๋‹ค. + - ๊ฐ€์žฅ ๋งŽ์ด ๋‚˜์˜จ ๋‹ต๋ณ€(๋“ค)์„ ์„ ํƒํ•ฉ๋‹ˆ๋‹ค. + - ๋™์ ์ธ ๊ฒฝ์šฐ, ์šฐ์„ ์ˆœ์œ„๊ฐ€ ๊ฐ€์žฅ ๋†’์€ ๋ชจ๋ธ์˜ ๋‹ต๋ณ€์„ ์„ ํƒํ•ฉ๋‹ˆ๋‹ค. + +5. ๊ฒฐ๊ณผ ํ™•์ธ: + - ์•™์ƒ๋ธ” ๊ฒฐ๊ณผ๋Š” 'final_hard_predictions.csv' ํŒŒ์ผ๋กœ ์ €์žฅ๋ฉ๋‹ˆ๋‹ค. + - ์ด ํŒŒ์ผ์—๋Š” ๊ฐ ์งˆ๋ฌธ์— ๋Œ€ํ•œ ์ตœ์ข… ๋‹ต๋ณ€์ด ํฌํ•จ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. + +์ฃผ์˜: ๋ชจ๋ธ์˜ ์šฐ์„ ์ˆœ์œ„๋Š” ๊ฐ ๋ชจ๋ธ์˜ ์„ฑ๋Šฅ์ด๋‚˜ ํŠน์„ฑ์„ ๊ณ ๋ คํ•˜์—ฌ ์‹ ์ค‘ํžˆ ๊ฒฐ์ •ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +์šฐ์„ ์ˆœ์œ„ ์„ค์ •์— ๋”ฐ๋ผ ์ตœ์ข… ๊ฒฐ๊ณผ๊ฐ€ ํฌ๊ฒŒ ๋‹ฌ๋ผ์งˆ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +""" + +# ์šฐ์„ ์ˆœ์œ„ ์ง์ ‘ ์ •์˜ +priority_order = ["output (1).csv", "output (7).csv", "output (8).csv"] + + +def hard_voting_with_priority(predictions, priority_order): + result = {} + for id in predictions[0].keys(): + answers = [pred[id] for pred in predictions if id in pred] + answer_counts = Counter(answer for answer in answers if answer) + + if answer_counts: + max_count = max(answer_counts.values()) + top_answers = [ans for ans, count in answer_counts.items() if count == max_count] + + if len(top_answers) == 1: + result[id] = top_answers[0] + else: + for model in priority_order: + model_index = next((i for i, pred in enumerate(predictions) if pred.get("filename") == model), None) + if model_index is not None: + model_answer = predictions[model_index].get(id, "") + if model_answer in top_answers: + result[id] = model_answer + break + else: + result[id] = top_answers[0] + else: + result[id] = "" + return result + + +# ์˜ˆ์ธก ํŒŒ์ผ๋“ค์„ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค. +prediction_files = glob("./results_hard/*.csv") +predictions = [] + +# ๊ฐ prediction ํŒŒ์ผ์„ ์ฝ์–ด์™€์„œ predictions ๋ฆฌ์ŠคํŠธ์— ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค. +for file_name in prediction_files: + prediction = {} + with open(file_name, "r", encoding="utf-8") as file: + csv_reader = csv.reader(file) + next(csv_reader) # ํ—ค๋” ํ–‰์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค + for row in csv_reader: + prediction[row[0]] = row[1] # ์ฒซ ๋ฒˆ์งธ ์—ด์„ ํ‚ค๋กœ, ๋‘ ๋ฒˆ์งธ ์—ด์„ ๊ฐ’์œผ๋กœ ์‚ฌ์šฉ + # Remove filename key from prediction dictionary + predictions.append(prediction) + +# ํ•˜๋“œ ๋ณดํŒ…์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค. +final_predictions = hard_voting_with_priority(predictions, priority_order) + +# ๊ฒฐ๊ณผ๋ฅผ CSV ํŒŒ์ผ๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค. +with open("final_hard_predictions.csv", "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["id", "answer"]) # ํ—ค๋” ์ž‘์„ฑ + for id, answer in final_predictions.items(): + writer.writerow([id, answer]) + +print("์•™์ƒ๋ธ” ๊ฒฐ๊ณผ๊ฐ€ 'final_hard_predictions.csv' ํŒŒ์ผ๋กœ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") diff --git a/.github/.keep b/ensemble/results_hard/.gitkeep similarity index 100% rename from .github/.keep rename to ensemble/results_hard/.gitkeep diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..db9b2f8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,48 @@ +[tool.ruff] +line-length = 120 + +# Exclude the following files and directories. 
+exclude = [ + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "env", + "venv", + "**/*.ipynb", # Jupyter Notebook ํŒŒ์ผ ์ œ์™ธ +] + +[tool.ruff.lint] +# Never enforce `E501` (line length violations). +extend-select = ["C901", "E501", "E402"] +select = ["C", "E", "F", "I", "W"] + + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401", "F403", "F811"] + +[tool.ruff.lint.isort] +lines-after-imports = 2 + +# Setting the order of sections +section-order = ["standard-library", "third-party", "local-folder"] +combine-as-imports = true +force-sort-within-sections = true + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..49cc160 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,51 @@ +# CUDA Version: 12.2 +# Ubuntu 20.04.6 +# python 3.10.13 + +# Deep Learning +auto_gptq==0.7.1 +bitsandbytes==0.44.1 +evaluate==0.4.3 +huggingface-hub==0.26.2 +numpy==2.0.0 +optimum==1.23.3 +peft==0.5.0 +scikit-learn==1.5.2 +torch==2.5.1 # 2.5.1+cu124 +tqdm==4.67.0 +transformers==4.46.2 +trl==0.12.0 +wandb==0.18.5 + +# RAG +elasticsearch==8.16.0 +konlpy==0.6.0 +rank-bm25==0.2.2 +wikiextractor==3.0.6 +faiss-cpu==1.9.0 # faiss-gpu==1.7.2 + +# Utils +beautifulsoup4==4.12.3 +ipykernel==6.29.5 +ipywidgets==8.1.5 +loguru==0.7.2 +matplotlib==3.9.2 +python-dotenv==1.0.1 +reportlab==4.2.5 +streamlit==1.40.1 +pdfminer.six==20240706 + +# Google Drive API +google-api-python-client==2.151.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.1 + +# Automatically installed dependencies +# pandas==2.2.3 +# pyarrow==18.0.0 +# datasets==3.1.0 +# safetensors==0.4.5 +# scipy==1.14.1 +# tqdm==4.67.0 +# PyYAML==6.0.2 +# requests==2.32.3 diff --git a/script/run_script.bash b/script/run_script.bash new file mode 100755 index 0000000..c564b27 --- /dev/null +++ b/script/run_script.bash @@ -0,0 +1,11 @@ +#!/bin/bash + +# ์ฒซ ๋ฒˆ์งธ ์‹คํ—˜ +nohup python -u main.py --config config-normal.yaml & +PYTHON_PID_1=$! +wait $PYTHON_PID_1 + +# ๋‘ ๋ฒˆ์งธ ์‹คํ—˜ +nohup python -u main.py --config config-rag.yaml & +PYTHON_PID_2=$! +wait $PYTHON_PID_2 diff --git a/script/run_with_gpu_monitoring.bash b/script/run_with_gpu_monitoring.bash new file mode 100755 index 0000000..fecac05 --- /dev/null +++ b/script/run_with_gpu_monitoring.bash @@ -0,0 +1,31 @@ +#!/bin/bash + +# adamw_torch ์„ค์ • +nvidia-smi --query-gpu=timestamp,name,utilization.gpu,memory.used,memory.free --format=csv -l 1 > ../log/gpu_log_adamw_torch.csv & +NVIDIA_LOG_PID_1=$! +nohup python -u main.py --config config-adamw_torch.yaml & +PYTHON_PID_1=$! +wait $PYTHON_PID_1 + +# ์ฒซ ๋ฒˆ์งธ ๋ชจ๋‹ˆํ„ฐ๋ง ํ”„๋กœ์„ธ์Šค ์ข…๋ฃŒ +kill $NVIDIA_LOG_PID_1 + +# adafactor ์„ค์ • +nvidia-smi --query-gpu=timestamp,name,utilization.gpu,memory.used,memory.free --format=csv -l 1 > ../log/gpu_log_adafactor.csv & +NVIDIA_LOG_PID_2=$! +nohup python -u main.py --config config-adafactor.yaml & +PYTHON_PID_2=$! 
+wait $PYTHON_PID_2 + +# ๋‘ ๋ฒˆ์งธ ๋ชจ๋‹ˆํ„ฐ๋ง ํ”„๋กœ์„ธ์Šค ์ข…๋ฃŒ +kill $NVIDIA_LOG_PID_2 + +# adamw_bnb_8bit ์„ค์ • +nvidia-smi --query-gpu=timestamp,name,utilization.gpu,memory.used,memory.free --format=csv -l 1 > ../log/gpu_log_adamw_bnb_8bit.csv & +NVIDIA_LOG_PID_3=$! +nohup python -u main.py --config config-adamw_bnb_8bit.yaml & +PYTHON_PID_3=$! +wait $PYTHON_PID_3 + +# ์„ธ ๋ฒˆ์งธ ๋ชจ๋‹ˆํ„ฐ๋ง ํ”„๋กœ์„ธ์Šค ์ข…๋ฃŒ +kill $NVIDIA_LOG_PID_3 diff --git a/script/setup-gpu-server.bash b/script/setup-gpu-server.bash new file mode 100755 index 0000000..e73dcb0 --- /dev/null +++ b/script/setup-gpu-server.bash @@ -0,0 +1,111 @@ +#!/bin/bash + +########################################## +# GPU ์„œ๋ฒ„ ์ธ์Šคํ„ด์Šค ์ƒ์„ฑ ์‹œ ํ•„์š”ํ•œ ๊ฐœ๋ฐœ ํ™˜๊ฒฝ ์„ธํŒ… +# conda ๋ฏธ์„ค์น˜ ํ™˜๊ฒฝ์—์„œ๋Š” conda ์„ค์น˜ ๊ณผ์ •์„ ์ถ”๊ฐ€ +# ์œ ์ €๋ช… / ๋””๋ ‰ํ† ๋ฆฌ / ๊ถŒํ•œ ์„ค์ • ๋“ฑ ์ˆ˜์ •ํ•˜์—ฌ ์‚ฌ์šฉ +########################################## + +##################### Install ##################### +apt-get update +apt-get install -y sudo +sudo apt-get install -y wget git vim build-essential + +##################### Set root password ##################### +echo "root:root" | chpasswd + +##################### conda ##################### +export PATH="/opt/conda/bin:$PATH" +conda init bash +conda config --set auto_activate_base false +source ~/.bashrc +conda create -n main python=3.10.13 -y +sudo chmod -R 777 /opt/conda/env + +##################### Users: dir & permission ##################### +users=("camper") + +for i in "${!users[@]}"; do + user="${users[$i]}" + user_folder="/data/ephemeral/home/$user" + + # Create user with custom home directory and give sudo privileges + sudo mkdir -p $user_folder + sudo chmod 777 $user_folder + sudo adduser --disabled-password --home $user_folder --gecos "" $user + # Set user password same as username + echo "${user}:${user}" | sudo chpasswd + sudo chsh -s /bin/bash $user + echo "$user ALL=(ALL) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/$user + +done + +##################### Users: conda ##################### +for user in "${users[@]}"; do + user_folder="/data/ephemeral/home/$user" + + # Add conda to each user's PATH and initialize conda + su - $user bash -c 'export PATH="/opt/conda/bin:$PATH"; conda init bash; conda config --set auto_activate_base false; source ~/.bashrc;' + echo "cd $user_folder" | sudo tee -a $user_folder/.bashrc + echo 'conda activate main' | sudo tee -a $user_folder/.bashrc + + # Add local bin path to each user's .bashrc + echo "export PATH=\$PATH:/data/ephemeral/home/$user/.local/bin" | sudo tee -a $user_folder/.bashrc + + sudo chmod -R 777 $user_folder + sudo chown -R $user:$user $user_folder + +done + +##################### Git ##################### +users=("sujin" "seongmin" "sungjae" "gayeon" "yeseo" "minseo") +BASE_DIR="/data/ephemeral/home/camper" + +# ๊ฐ ์‚ฌ์šฉ์ž๋ณ„ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ +for user in "${users[@]}"; do + mkdir -p "$BASE_DIR/$user" +done + +# ๊ธ€๋กœ๋ฒŒ .gitconfig ์ƒ์„ฑ +cat << EOF > "$BASE_DIR/.gitconfig" +[user] + name = Camper User + email = camper@example.com + +# ์‚ฌ์šฉ์ž๋ณ„ ํด๋” ์„ค์ • ํฌํ•จ +EOF + +# includeIf ์„ค์ •์„ ๋™์ ์œผ๋กœ ์ถ”๊ฐ€ +for user in "${users[@]}"; do + cat << EOF >> "$BASE_DIR/.gitconfig" +[includeIf "gitdir:$BASE_DIR/$user/"] + path = $BASE_DIR/$user/.gitconfig +EOF +done + +# ๊ฐ ์‚ฌ์šฉ์ž ํด๋”์— .gitconfig ์ƒ์„ฑ +for user in "${users[@]}"; do + cat << EOF > "$BASE_DIR/$user/.gitconfig" +[user] + name = $user + email = $user@example.com +EOF +done + +# ๊ถŒํ•œ ์„ค์ • +chown -R camper:camper 
"$BASE_DIR" +chmod -R 755 "$BASE_DIR" + +echo "Git configuration setup completed!" + +echo "Setup complete!" + + + +##### git์€ ๊ฐ์ž ํด๋”์—์„œ ์„ธํŒ… ##### +# git clone https://"$token"@github.com/boostcampaitech7/level2-nlp-generationfornlp-nlp-02-lv3.git + +# git config --local user.email "$email" +# git config --local user.name "$username" +# git config --local credential.helper "cache --timeout=360000" +# git config --local commit.template .gitmessage.txt diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..06354ac --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[tool:pytest] +addopts = -ra -v -l diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_sample.py b/tests/test_sample.py new file mode 100644 index 0000000..d2b4018 --- /dev/null +++ b/tests/test_sample.py @@ -0,0 +1,2 @@ +def test_add(): + assert 1 + 2 == 3