Skip to content
This repository was archived by the owner on Feb 16, 2026. It is now read-only.

Commit 6629556

Browse files
authored
feat: add dataset distillation extraction (#48)
1 parent 9a03df7 commit 6629556

30 files changed

+4255
-1712
lines changed

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
<div align="center">
1212

13-
Search your codebase semantically or chat with it from cli. Keep the vector database superfast up to date to the latest code changes.
13+
Generate datasets from code for finetuning, search your codebase semantically or chat with your code from cli. Keep the vector database superfast up to date to the latest code changes.
1414
100% local support without any data leaks.
1515
Built with [langchain](https://github.com/langchain-ai/langchain), [treesitter](https://github.com/tree-sitter/tree-sitter), [sentence-transformers](https://github.com/UKPLab/sentence-transformers), [instructor-embedding](https://github.com/xlang-ai/instructor-embedding),
1616
[faiss](https://github.com/facebookresearch/faiss), [lama.cpp](https://github.com/ggerganov/llama.cpp), [Ollama](https://github.com/jmorganca/ollama), [Streamlit](https://github.com/streamlit/streamlit).
@@ -19,6 +19,8 @@ Built with [langchain](https://github.com/langchain-ai/langchain), [treesitter](
1919

2020
## ✨ Features
2121

22+
- 🗒️ &nbsp;Finetuning dataset generation
23+
- export in Alpaca, conversational, instruction or completion format
2224
- 🔎 &nbsp;Semantic code search
2325
- 💬 &nbsp;GPT-like chat with your codebase
2426
- ⚙️ &nbsp;Synchronize vector store and latest code changes with ease
@@ -32,6 +34,19 @@ Built with [langchain](https://github.com/langchain-ai/langchain), [treesitter](
3234
3335
## 🚀 Usage
3436

37+
#### Export finetuning dataset from codebase in conversational format:
38+
```
39+
codeqai dataset
40+
```
41+
Export in different format like Alpaca with:
42+
```
43+
codeqai dataset --format alpaca
44+
```
45+
Export dataset with model distillation:
46+
```
47+
codeqai dataset --distillation doc
48+
```
49+
3550
#### Start semantic search:
3651

3752
```

codeqai/app.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from codeqai.bootstrap import bootstrap
1616
from codeqai.cache import create_cache_dir, get_cache_path, save_vector_cache
1717
from codeqai.config import create_config, get_config_path, load_config
18-
from codeqai.constants import EmbeddingsModel, LlmHost
18+
from codeqai.constants import DistillationMode, EmbeddingsModel, LlmHost
19+
from codeqai.dataset_extractor import DatasetExtractor
1920
from codeqai.embeddings import Embeddings
2021
from codeqai.vector_store import VectorStore
2122

@@ -81,20 +82,33 @@ def run():
8182
"chat",
8283
"configure",
8384
"sync",
84-
"export-dataset (experimental)",
85+
"dataset",
8586
],
86-
help="Action to perform. 'search' will semantically search the codebase. 'chat' will chat with the codebase.",
87+
help="Action to perform. 'app' to start the streamlit app, 'search' to search the codebase, "
88+
+ "'chat' to chat with the model, 'configure' to start config wizard, "
89+
+ "'sync' to sync the vector store with the current git checkout, 'dataset' to export a dataset for model distillation.",
8790
)
8891
parser.add_argument(
8992
"--distillation",
90-
action="store_true",
91-
help="Use model distillation for finetuning dataset extraction.",
93+
type=DistillationMode,
94+
default=DistillationMode.NONE,
95+
help="Use model distillation for finetuning dataset extraction. Default is None."
96+
+ "Supported modes are, 'full', 'doc', 'code'.\n"
97+
+ "doc - Extracts only documentation for distillation.\n"
98+
+ "code - Extracts will chunk code blocks with inlined comments for distillation.\n"
99+
+ "full - Uses both doc and code mode",
92100
)
93101
parser.add_argument(
94102
"--format",
95103
type=str,
96-
default="Conversational",
97-
help="Format of the finetuning dataset. Supported formats are Conversational and Alpaca. Default is Conversational format.",
104+
default="conversational",
105+
help="Format of the finetuning dataset. Supported formats are conversational and alpaca. Default is Conversational format.",
106+
)
107+
parser.add_argument(
108+
"--max-tokens",
109+
type=int,
110+
default=1024,
111+
help="Token limit per code block for distillation dataset extraction. Default is 1024.",
98112
)
99113
args = parser.parse_args()
100114

@@ -149,10 +163,26 @@ def run():
149163
),
150164
)
151165

152-
if args.action == "extract-dataset":
166+
if args.action == "dataset":
167+
print(args.distillation)
168+
spinner = yaspin(
169+
text=f"Parsing codebase for {args.format} dataset export...",
170+
color="green",
171+
)
172+
spinner.start()
153173
repo_name = repo.repo_name()
154174
files = repo.load_files()
155-
documents = codeparser.parse_code_files_for_finetuning(files)
175+
documents = codeparser.parse_code_files_for_finetuning(
176+
files, args.max_tokens, spinner
177+
)
178+
dateset_extractor = DatasetExtractor(
179+
args.format,
180+
args.distillation,
181+
documents,
182+
config,
183+
args.max_tokens,
184+
)
185+
dateset_extractor.export()
156186
exit()
157187

158188
# check if faiss.index exists

codeqai/bootstrap.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@
88

99

1010
def bootstrap(config, repo_name, embeddings_model=None):
11+
"""
12+
Initializes the necessary components for the application.
13+
14+
Args:
15+
config (dict): Configuration dictionary containing settings for embeddings and LLM.
16+
repo_name (str): The name of the repository.
17+
embeddings_model (Embeddings, optional): Pre-initialized embeddings model. Defaults to None.
18+
19+
Returns:
20+
tuple: A tuple containing the vector store, memory, and QA chain.
21+
"""
1122
if embeddings_model is None:
1223
embeddings_model = Embeddings(
1324
model=EmbeddingsModel[config["embeddings"].upper().replace("-", "_")],

codeqai/cache.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,15 @@ def to_json(self):
2727

2828

2929
def load_vector_cache(filename) -> Dict[str, VectorCache]:
30+
"""
31+
Loads a vector cache from a JSON file.
32+
33+
Args:
34+
filename (str): The name of the file containing the vector cache.
35+
36+
Returns:
37+
Dict[str, VectorCache]: A dictionary where the keys are strings and the values are VectorCache objects.
38+
"""
3039
with open(
3140
get_cache_path() + "/" + filename, "r", encoding="utf-8"
3241
) as vector_cache_file:
@@ -38,13 +47,29 @@ def load_vector_cache(filename) -> Dict[str, VectorCache]:
3847

3948

4049
def save_vector_cache(vector_cache, filename):
50+
"""
51+
Saves a vector cache to a JSON file.
52+
53+
Args:
54+
vector_cache (Dict[str, VectorCache]): A dictionary where the keys are strings and the values are VectorCache objects.
55+
filename (str): The name of the file to save the vector cache to.
56+
"""
4157
with open(
4258
get_cache_path() + "/" + filename, "w", encoding="utf-8"
4359
) as vector_cache_file:
4460
json.dump(vector_cache, default=VectorCache.to_json, fp=vector_cache_file)
4561

4662

4763
def get_cache_path():
64+
"""
65+
Returns the cache directory path based on the operating system.
66+
67+
Returns:
68+
str: The path to the cache directory.
69+
70+
Raises:
71+
NotImplementedError: If the operating system is not supported.
72+
"""
4873
system = platform.system()
4974

5075
if system == "Linux" or system == "Darwin":
@@ -60,6 +85,12 @@ def get_cache_path():
6085

6186

6287
def create_cache_dir():
88+
"""
89+
Creates the cache directory if it does not already exist.
90+
91+
This function checks if the cache directory exists at the path returned by get_cache_path().
92+
If the directory does not exist, it creates the directory and any necessary parent directories.
93+
"""
6394
if not os.path.exists(get_cache_path()):
6495
path = Path(get_cache_path())
6596
path.mkdir(parents=True, exist_ok=True)

codeqai/codeparser.py

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,25 @@
1-
import ast
21
import os
32

3+
import inquirer
44
from langchain.schema import Document
55
from langchain.text_splitter import RecursiveCharacterTextSplitter
6+
from yaspin import yaspin
67

78
from codeqai import repo, utils
89
from codeqai.constants import Language
910
from codeqai.treesitter.treesitter import Treesitter, TreesitterMethodNode
1011

1112

1213
def parse_code_files_for_db(code_files: list[str]) -> list[Document]:
14+
"""
15+
Parses a list of code files and returns a list of Document objects for database storage.
16+
17+
Args:
18+
code_files (list[str]): List of paths to code files to be parsed.
19+
20+
Returns:
21+
list[Document]: List of Document objects containing parsed code information.
22+
"""
1323
documents = []
1424
code_splitter = None
1525
for code_file in code_files:
@@ -60,7 +70,21 @@ def parse_code_files_for_db(code_files: list[str]) -> list[Document]:
6070
return documents
6171

6272

63-
def parse_code_files_for_finetuning(code_files: list[str]) -> list[dict]:
73+
def parse_code_files_for_finetuning(
74+
code_files: list[str], max_tokens, spinner
75+
) -> list[dict]:
76+
"""
77+
Parses a list of code files for fine-tuning and returns a list of dictionaries containing method information.
78+
79+
Args:
80+
code_files (list[str]): List of paths to code files to be parsed.
81+
max_tokens (int): Maximum number of tokens allowed for output.
82+
83+
Returns:
84+
list[dict]: List of dictionaries containing method information, including method name, code, description, and language.
85+
"""
86+
input_tokens = 0
87+
output_tokens = 0
6488
documents = []
6589
for code_file in code_files:
6690
with open(code_file, "r", encoding="utf-8") as file:
@@ -84,10 +108,34 @@ def parse_code_files_for_finetuning(code_files: list[str]) -> list[dict]:
84108
)
85109

86110
document = {
111+
"method_name": node.name,
87112
"code": method_source_code,
88113
"description": node.doc_comment,
89-
"language": programming_language,
114+
"language": programming_language.value,
90115
}
91116
documents.append(document)
92117

118+
if node.doc_comment is not None:
119+
input_tokens += utils.count_tokens(node.doc_comment)
120+
output_tokens += max_tokens
121+
122+
spinner.stop()
123+
124+
print(f"Estimated input tokens for distillation needed: {input_tokens}.")
125+
print(f"Maximum output tokens for distillation needed: {output_tokens}.")
126+
questions = [
127+
inquirer.Confirm(
128+
"confirm",
129+
message="Proceed?",
130+
default=True,
131+
),
132+
]
133+
134+
confirm = inquirer.prompt(questions)
135+
136+
if confirm and confirm["confirm"]:
137+
pass
138+
else:
139+
exit()
140+
93141
return documents

codeqai/config.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,18 @@
88

99

1010
def get_config_path():
11+
"""
12+
Returns the configuration file path based on the operating system.
13+
14+
This function determines the appropriate configuration directory based on the operating system
15+
and constructs the full path to the configuration file.
16+
17+
Returns:
18+
str: The path to the configuration file.
19+
20+
Raises:
21+
NotImplementedError: If the operating system is not supported.
22+
"""
1123
system = platform.system()
1224

1325
if system == "Linux" or system == "Darwin":
@@ -25,17 +37,44 @@ def get_config_path():
2537

2638

2739
def load_config():
40+
"""
41+
Loads the configuration from the configuration file.
42+
43+
This function reads the configuration file specified by get_config_path() and parses its content
44+
using the YAML parser.
45+
46+
Returns:
47+
dict: The configuration dictionary loaded from the file.
48+
"""
2849
with open(get_config_path(), "r", encoding="utf-8") as config_file:
2950
config = yaml.safe_load(config_file)
3051
return config
3152

3253

3354
def save_config(config):
55+
"""
56+
Saves the configuration to the configuration file.
57+
58+
Args:
59+
config (dict): The configuration dictionary to be saved.
60+
61+
This function writes the provided configuration dictionary to the configuration file specified by get_config_path()
62+
using the YAML format.
63+
"""
3464
with open(get_config_path(), "w", encoding="utf-8") as config_file:
3565
yaml.dump(config, config_file, default_flow_style=False)
3666

3767

3868
def create_config():
69+
"""
70+
Creates a new configuration interactively by prompting the user for input.
71+
72+
This function prompts the user with a series of questions to configure the embeddings model and LLM host.
73+
Based on the user's responses, it constructs a configuration dictionary and saves it to the configuration file.
74+
75+
Returns:
76+
dict: The configuration dictionary created based on user input.
77+
"""
3978
os.makedirs(os.path.dirname(get_config_path()), exist_ok=True)
4079

4180
questions = [

codeqai/constants.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,14 @@ class LlmHost(Enum):
3939

4040

4141
class DatasetFormat(Enum):
42-
ALPACA = "Alpaca"
43-
CONVERSATIONAL = "Conversational"
42+
ALPACA = "alpaca"
43+
CONVERSATIONAL = "conversational"
44+
INSTRUCTION = "instruction"
45+
COMPLETION = "completion"
46+
47+
48+
class DistillationMode(Enum):
49+
NONE = "none"
50+
FULL = "full"
51+
DOCUMENTATION = "doc"
52+
CODE = "code"

0 commit comments

Comments
 (0)