|
15 | 15 | from codeqai.bootstrap import bootstrap |
16 | 16 | from codeqai.cache import create_cache_dir, get_cache_path, save_vector_cache |
17 | 17 | from codeqai.config import create_config, get_config_path, load_config |
18 | | -from codeqai.constants import EmbeddingsModel, LlmHost |
| 18 | +from codeqai.constants import DistillationMode, EmbeddingsModel, LlmHost |
| 19 | +from codeqai.dataset_extractor import DatasetExtractor |
19 | 20 | from codeqai.embeddings import Embeddings |
20 | 21 | from codeqai.vector_store import VectorStore |
21 | 22 |
|
@@ -81,20 +82,33 @@ def run(): |
81 | 82 | "chat", |
82 | 83 | "configure", |
83 | 84 | "sync", |
84 | | - "export-dataset (experimental)", |
| 85 | + "dataset", |
85 | 86 | ], |
86 | | - help="Action to perform. 'search' will semantically search the codebase. 'chat' will chat with the codebase.", |
| 87 | + help="Action to perform. 'app' to start the streamlit app, 'search' to search the codebase, " |
| 88 | + + "'chat' to chat with the model, 'configure' to start config wizard, " |
| 89 | + + "'sync' to sync the vector store with the current git checkout, 'dataset' to export a dataset for model distillation.", |
87 | 90 | ) |
88 | 91 | parser.add_argument( |
89 | 92 | "--distillation", |
90 | | - action="store_true", |
91 | | - help="Use model distillation for finetuning dataset extraction.", |
| 93 | + type=DistillationMode, |
| 94 | + default=DistillationMode.NONE, |
| 95 | + help="Use model distillation for finetuning dataset extraction. Default is None." |
| 96 | + + "Supported modes are, 'full', 'doc', 'code'.\n" |
| 97 | + + "doc - Extracts only documentation for distillation.\n" |
| 98 | + + "code - Extracts will chunk code blocks with inlined comments for distillation.\n" |
| 99 | + + "full - Uses both doc and code mode", |
92 | 100 | ) |
93 | 101 | parser.add_argument( |
94 | 102 | "--format", |
95 | 103 | type=str, |
96 | | - default="Conversational", |
97 | | - help="Format of the finetuning dataset. Supported formats are Conversational and Alpaca. Default is Conversational format.", |
| 104 | + default="conversational", |
| 105 | + help="Format of the finetuning dataset. Supported formats are conversational and alpaca. Default is Conversational format.", |
| 106 | + ) |
| 107 | + parser.add_argument( |
| 108 | + "--max-tokens", |
| 109 | + type=int, |
| 110 | + default=1024, |
| 111 | + help="Token limit per code block for distillation dataset extraction. Default is 1024.", |
98 | 112 | ) |
99 | 113 | args = parser.parse_args() |
100 | 114 |
|
@@ -149,10 +163,26 @@ def run(): |
149 | 163 | ), |
150 | 164 | ) |
151 | 165 |
|
152 | | - if args.action == "extract-dataset": |
| 166 | + if args.action == "dataset": |
| 167 | + print(args.distillation) |
| 168 | + spinner = yaspin( |
| 169 | + text=f"Parsing codebase for {args.format} dataset export...", |
| 170 | + color="green", |
| 171 | + ) |
| 172 | + spinner.start() |
153 | 173 | repo_name = repo.repo_name() |
154 | 174 | files = repo.load_files() |
155 | | - documents = codeparser.parse_code_files_for_finetuning(files) |
| 175 | + documents = codeparser.parse_code_files_for_finetuning( |
| 176 | + files, args.max_tokens, spinner |
| 177 | + ) |
| 178 | + dateset_extractor = DatasetExtractor( |
| 179 | + args.format, |
| 180 | + args.distillation, |
| 181 | + documents, |
| 182 | + config, |
| 183 | + args.max_tokens, |
| 184 | + ) |
| 185 | + dateset_extractor.export() |
156 | 186 | exit() |
157 | 187 |
|
158 | 188 | # check if faiss.index exists |
|
0 commit comments