1+ {
2+ "run_info" : {
3+ "created_at" : " 2025-10-23T16:18:17+00:00" ,
4+ "total_time" : 2331.184612270001 ,
5+ "experiment_name" : " delora/llama-3.2-3B-rank32" ,
6+ "peft_branch" : " main" ,
7+ "train_config" : {
8+ "model_id" : " meta-llama/Llama-3.2-3B" ,
9+ "dtype" : " bfloat16" ,
10+ "max_seq_length" : 768 ,
11+ "batch_size" : 4 ,
12+ "batch_size_eval" : 50 ,
13+ "max_steps" : 5000 ,
14+ "eval_steps" : 250 ,
15+ "compile" : false ,
16+ "query_template" : " Question: {query} Think step by step.\n Answer:" ,
17+ "seed" : 0 ,
18+ "grad_norm_clip" : 1.0 ,
19+ "optimizer_type" : " AdamW" ,
20+ "optimizer_kwargs" : {
21+ "lr" : 0.001
22+ },
23+ "lr_scheduler" : " cosine" ,
24+ "use_amp" : false ,
25+ "autocast_adapter_dtype" : true ,
26+ "generation_kwargs" : {
27+ "max_length" : 800 ,
28+ "max_new_tokens" : 300
29+ },
30+ "attn_implementation" : null
31+ },
32+ "peft_config" : {
33+ "task_type" : " CAUSAL_LM" ,
34+ "peft_type" : " DELORA" ,
35+ "auto_mapping" : null ,
36+ "peft_version" : " 0.17.2.dev0@UNKNOWN" ,
37+ "base_model_name_or_path" : " meta-llama/Llama-3.2-3B" ,
38+ "revision" : null ,
39+ "inference_mode" : false ,
40+ "r" : 32 ,
41+ "delora_lambda" : 15 ,
42+ "module_dropout" : 0.0 ,
43+ "target_modules" : [
44+ " q_proj" ,
45+ " v_proj"
46+ ],
47+ "exclude_modules" : null ,
48+ "bias" : " none" ,
49+ "init_weights" : true ,
50+ "layers_to_transform" : null ,
51+ "layers_pattern" : null ,
52+ "rank_pattern" : {},
53+ "lambda_pattern" : {},
54+ "modules_to_save" : null
55+ },
56+ "error_msg" : " "
57+ },
58+ "train_info" : {
59+ "accelerator_memory_reserved_avg" : 11956236845 ,
60+ "accelerator_memory_max" : 22361931776 ,
61+ "accelerator_memory_reserved_99th" : 17769252782 ,
62+ "train_time" : 2063.197599866002 ,
63+ "file_size" : 37417520 ,
64+ "num_trainable_params" : 9175096 ,
65+ "num_total_params" : 3221924920 ,
66+ "status" : " success" ,
67+ "metrics" : [
68+ {
69+ "step" : 250 ,
70+ "valid accuracy" : 0.32 ,
71+ "train loss" : 0.7512386105060578 ,
72+ "train samples" : 1000 ,
73+ "train time" : 37.84413140498509 ,
74+ "eval time" : 13.205585324998538 ,
75+ "tokens / sec" : 5594.500181132732 ,
76+ "mem allocated avg" : 6926794532.864 ,
77+ "mem reserved avg" : 12007369605.12 ,
78+ "elapsed time" : 112.85904153599768
79+ },
80+ {
81+ "step" : 500 ,
82+ "valid accuracy" : 0.38 ,
83+ "train loss" : 0.7050024774074555 ,
84+ "train samples" : 2000 ,
85+ "train time" : 37.53846677497859 ,
86+ "eval time" : 13.265299970000342 ,
87+ "tokens / sec" : 5540.849636902056 ,
88+ "mem allocated avg" : 6919349673.984 ,
89+ "mem reserved avg" : 11903770296.32 ,
90+ "elapsed time" : 212.84601919299894
91+ },
92+ {
93+ "step" : 750 ,
94+ "valid accuracy" : 0.32 ,
95+ "train loss" : 0.6706294032335282 ,
96+ "train samples" : 3000 ,
97+ "train time" : 37.80458352702772 ,
98+ "eval time" : 13.272025713999028 ,
99+ "tokens / sec" : 5671.29644072703 ,
100+ "mem allocated avg" : 6929633923.072 ,
101+ "mem reserved avg" : 12056694620.16 ,
102+ "elapsed time" : 313.49587832399993
103+ },
104+ {
105+ "step" : 1000 ,
106+ "valid accuracy" : 0.4 ,
107+ "train loss" : 0.6481547034978866 ,
108+ "train samples" : 4000 ,
109+ "train time" : 37.52610543700939 ,
110+ "eval time" : 13.21725967599923 ,
111+ "tokens / sec" : 5551.761835496328 ,
112+ "mem allocated avg" : 6919568891.904 ,
113+ "mem reserved avg" : 11917057851.392 ,
114+ "elapsed time" : 413.16383353999845
115+ },
116+ {
117+ "step" : 1250 ,
118+ "valid accuracy" : 0.38 ,
119+ "train loss" : 0.6453099972009659 ,
120+ "train samples" : 5000 ,
121+ "train time" : 37.5804522819999 ,
122+ "eval time" : 9.624667924999812 ,
123+ "tokens / sec" : 5549.108308626837 ,
124+ "mem allocated avg" : 6921147688.96 ,
125+ "mem reserved avg" : 11914943922.176 ,
126+ "elapsed time" : 509.47617638500014
127+ },
128+ {
129+ "step" : 1500 ,
130+ "valid accuracy" : 0.46 ,
131+ "train loss" : 0.6384247626066208 ,
132+ "train samples" : 6000 ,
133+ "train time" : 37.65730221097692 ,
134+ "eval time" : 9.775350372998219 ,
135+ "tokens / sec" : 5558.842182246954 ,
136+ "mem allocated avg" : 6921056847.872 ,
137+ "mem reserved avg" : 11953078534.144 ,
138+ "elapsed time" : 606.1567662300004
139+ },
140+ {
141+ "step" : 1750 ,
142+ "valid accuracy" : 0.48 ,
143+ "train loss" : 0.6297660274505615 ,
144+ "train samples" : 7000 ,
145+ "train time" : 37.82186047102368 ,
146+ "eval time" : 7.911249515000236 ,
147+ "tokens / sec" : 5535.290897717534 ,
148+ "mem allocated avg" : 6923910838.272 ,
149+ "mem reserved avg" : 11956249427.968 ,
150+ "elapsed time" : 701.1174360119985
151+ },
152+ {
153+ "step" : 2000 ,
154+ "valid accuracy" : 0.5 ,
155+ "train loss" : 0.6332990030050277 ,
156+ "train samples" : 8000 ,
157+ "train time" : 37.523248280005646 ,
158+ "eval time" : 8.530133835996821 ,
159+ "tokens / sec" : 5535.128474223041 ,
160+ "mem allocated avg" : 6920641826.816 ,
161+ "mem reserved avg" : 11907327066.112 ,
162+ "elapsed time" : 796.1569609649996
163+ },
164+ {
165+ "step" : 2250 ,
166+ "valid accuracy" : 0.4 ,
167+ "train loss" : 0.6243826431035996 ,
168+ "train samples" : 9000 ,
169+ "train time" : 38.08898475294336 ,
170+ "eval time" : 13.285918199999287 ,
171+ "tokens / sec" : 5643.311350885762 ,
172+ "mem allocated avg" : 6931386861.568 ,
173+ "mem reserved avg" : 12094938284.032 ,
174+ "elapsed time" : 897.2201951069983
175+ },
176+ {
177+ "step" : 2500 ,
178+ "valid accuracy" : 0.5 ,
179+ "train loss" : 0.6215927278995514 ,
180+ "train samples" : 10000 ,
181+ "train time" : 37.63880846399843 ,
182+ "eval time" : 13.24860273900049 ,
183+ "tokens / sec" : 5472.1976705773695 ,
184+ "mem allocated avg" : 6917278386.176 ,
185+ "mem reserved avg" : 11845175869.44 ,
186+ "elapsed time" : 998.0728250969987
187+ },
188+ {
189+ "step" : 2750 ,
190+ "valid accuracy" : 0.42 ,
191+ "train loss" : 0.6130854382514953 ,
192+ "train samples" : 11000 ,
193+ "train time" : 37.79084398697523 ,
194+ "eval time" : 13.198808683002426 ,
195+ "tokens / sec" : 5606.675523653974 ,
196+ "mem allocated avg" : 6926927112.192 ,
197+ "mem reserved avg" : 12020548108.288 ,
198+ "elapsed time" : 1098.4325272319984
199+ },
200+ {
201+ "step" : 3000 ,
202+ "valid accuracy" : 0.46 ,
203+ "train loss" : 0.604831589102745 ,
204+ "train samples" : 12000 ,
205+ "train time" : 37.568779274977715 ,
206+ "eval time" : 10.355002560001594 ,
207+ "tokens / sec" : 5555.969718159649 ,
208+ "mem allocated avg" : 6922721505.28 ,
209+ "mem reserved avg" : 11937609940.992 ,
210+ "elapsed time" : 1195.2514979959997
211+ },
212+ {
213+ "step" : 3250 ,
214+ "valid accuracy" : 0.4 ,
215+ "train loss" : 0.6124310380220414 ,
216+ "train samples" : 13000 ,
217+ "train time" : 37.70235535401662 ,
218+ "eval time" : 10.490295633000642 ,
219+ "tokens / sec" : 5593.841499282662 ,
220+ "mem allocated avg" : 6924630044.672 ,
221+ "mem reserved avg" : 11975081852.928 ,
222+ "elapsed time" : 1292.7081366849998
223+ },
224+ {
225+ "step" : 3500 ,
226+ "valid accuracy" : 0.54 ,
227+ "train loss" : 0.5956783784627915 ,
228+ "train samples" : 14000 ,
229+ "train time" : 37.79015436899135 ,
230+ "eval time" : 7.505472221000673 ,
231+ "tokens / sec" : 5550.387488549399 ,
232+ "mem allocated avg" : 6923355121.664 ,
233+ "mem reserved avg" : 11948884230.144 ,
234+ "elapsed time" : 1387.1216009819982
235+ },
236+ {
237+ "step" : 3750 ,
238+ "valid accuracy" : 0.48 ,
239+ "train loss" : 0.5921734108924865 ,
240+ "train samples" : 15000 ,
241+ "train time" : 37.99711803697937 ,
242+ "eval time" : 8.399906407001254 ,
243+ "tokens / sec" : 5703.143059142048 ,
244+ "mem allocated avg" : 6933243086.848 ,
245+ "mem reserved avg" : 12128694042.624 ,
246+ "elapsed time" : 1483.2807508709993
247+ },
248+ {
249+ "step" : 4000 ,
250+ "valid accuracy" : 0.52 ,
251+ "train loss" : 0.6020598074197769 ,
252+ "train samples" : 16000 ,
253+ "train time" : 37.42554273099813 ,
254+ "eval time" : 13.19645261199912 ,
255+ "tokens / sec" : 5460.78921203528 ,
256+ "mem allocated avg" : 6915014187.008 ,
257+ "mem reserved avg" : 11819355734.016 ,
258+ "elapsed time" : 1582.7408143280009
259+ },
260+ {
261+ "step" : 4250 ,
262+ "valid accuracy" : 0.5 ,
263+ "train loss" : 0.58726664686203 ,
264+ "train samples" : 17000 ,
265+ "train time" : 37.58307892599987 ,
266+ "eval time" : 9.69436509300067 ,
267+ "tokens / sec" : 5624.579093592081 ,
268+ "mem allocated avg" : 6926118213.632 ,
269+ "mem reserved avg" : 11987807371.264 ,
270+ "elapsed time" : 1679.2568312559997
271+ },
272+ {
273+ "step" : 4500 ,
274+ "valid accuracy" : 0.52 ,
275+ "train loss" : 0.5931945472955704 ,
276+ "train samples" : 18000 ,
277+ "train time" : 37.45943218199682 ,
278+ "eval time" : 7.795902468998975 ,
279+ "tokens / sec" : 5547.815006653474 ,
280+ "mem allocated avg" : 6920348925.952 ,
281+ "mem reserved avg" : 11897596280.832 ,
282+ "elapsed time" : 1773.5582212900008
283+ },
284+ {
285+ "step" : 4750 ,
286+ "valid accuracy" : 0.5 ,
287+ "train loss" : 0.5837668641805649 ,
288+ "train samples" : 19000 ,
289+ "train time" : 37.71794232197135 ,
290+ "eval time" : 10.624573600001895 ,
291+ "tokens / sec" : 5566.024737190049 ,
292+ "mem allocated avg" : 6922591481.856 ,
293+ "mem reserved avg" : 11951140765.696 ,
294+ "elapsed time" : 1871.3457676430007
295+ },
296+ {
297+ "step" : 5000 ,
298+ "valid accuracy" : 0.52 ,
299+ "train loss" : 0.5912798082828522 ,
300+ "train samples" : 20000 ,
301+ "train time" : 37.50696286400489 ,
302+ "eval time" : 9.267422332999558 ,
303+ "tokens / sec" : 5553.1022534454405 ,
304+ "mem allocated avg" : 6919856828.416 ,
305+ "mem reserved avg" : 11901413097.472 ,
306+ "elapsed time" : 1967.2812061679979
307+ },
308+ {
309+ "step" : 5000 ,
310+ "test accuracy" : 0.5056861258529188 ,
311+ "train loss" : 0.5912798082828522 ,
312+ "train samples" : 20000 ,
313+ "train total tokens" : 4198051
314+ }
315+ ]
316+ },
317+ "meta_info" : {
318+ "model_info" : {
319+ "sha" : " 13afe5124825b4f3751f836b40dafda64c1ed062" ,
320+ "created_at" : " 2024-09-18T15:23:48+00:00"
321+ },
322+ "dataset_info" : {
323+ "metamath" : {
324+ "sha" : " aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18" ,
325+ "created_at" : " 2023-09-21T17:22:46+00:00"
326+ },
327+ "gsm8k" : {
328+ "sha" : " e53f048856ff4f594e959d75785d2c2d37b678ee" ,
329+ "created_at" : " 2022-04-12T10:22:10+00:00"
330+ }
331+ },
332+ "package_info" : {
333+ "transformers-version" : " 4.57.1" ,
334+ "transformers-commit-hash" : null ,
335+ "peft-version" : " 0.17.2.dev0" ,
336+ "peft-commit-hash" : " a18ba67f242ab2eb74cdabab76ea2fd836b5cd83" ,
337+ "datasets-version" : " 4.2.0" ,
338+ "datasets-commit-hash" : null ,
339+ "bitsandbytes-version" : " 0.46.0" ,
340+ "bitsandbytes-commit-hash" : null ,
341+ "torch-version" : " 2.9.0+cu128" ,
342+ "torch-commit-hash" : null
343+ },
344+ "system_info" : {
345+ "system" : " Linux" ,
346+ "release" : " 6.14.0-1014-aws" ,
347+ "version" : " #14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025" ,
348+ "machine" : " x86_64" ,
349+ "processor" : " x86_64" ,
350+ "accelerator" : " NVIDIA L40S"
351+ },
352+ "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n"
353+ }
354+ }
0 commit comments