1+ {
2+ "run_info" : {
3+ "created_at" : " 2025-09-16T16:39:46+00:00" ,
4+ "total_time" : 2239.912140868604 ,
5+ "experiment_name" : " osf/llama-3.2-3B-default" ,
6+ "peft_branch" : " orthogonal-subspace-learning" ,
7+ "train_config" : {
8+ "model_id" : " meta-llama/Llama-3.2-3B" ,
9+ "dtype" : " bfloat16" ,
10+ "max_seq_length" : 768 ,
11+ "batch_size" : 4 ,
12+ "batch_size_eval" : 50 ,
13+ "max_steps" : 5000 ,
14+ "eval_steps" : 250 ,
15+ "compile" : false ,
16+ "query_template" : " Question: {query} Think step by step.\n Answer:" ,
17+ "seed" : 0 ,
18+ "grad_norm_clip" : 1.0 ,
19+ "optimizer_type" : " AdamW" ,
20+ "optimizer_kwargs" : {
21+ "lr" : 5e-05
22+ },
23+ "lr_scheduler" : " cosine" ,
24+ "use_amp" : false ,
25+ "autocast_adapter_dtype" : true ,
26+ "generation_kwargs" : {
27+ "max_length" : 800 ,
28+ "max_new_tokens" : 300
29+ },
30+ "attn_implementation" : null
31+ },
32+ "peft_config" : {
33+ "task_type" : null ,
34+ "peft_type" : " OSF" ,
35+ "auto_mapping" : null ,
36+ "base_model_name_or_path" : " meta-llama/Llama-3.2-3B" ,
37+ "revision" : null ,
38+ "inference_mode" : false ,
39+ "effective_rank" : null ,
40+ "target_modules" : [
41+ " q_proj" ,
42+ " k_proj" ,
43+ " v_proj" ,
44+ " o_proj" ,
45+ " gate_proj" ,
46+ " down_proj" ,
47+ " up_proj"
48+ ],
49+ "rank_pattern" : null
50+ },
51+ "error_msg" : " "
52+ },
53+ "train_info" : {
54+ "cuda_memory_reserved_avg" : 36947070287 ,
55+ "cuda_memory_max" : 48360325120 ,
56+ "cuda_memory_reserved_99th" : 43331459481 ,
57+ "train_time" : 1869.4566851742566 ,
58+ "file_size" : 4199070800 ,
59+ "num_trainable_params" : 2099492864 ,
60+ "num_total_params" : 5312242688 ,
61+ "status" : " success" ,
62+ "metrics" : [
63+ {
64+ "step" : 250 ,
65+ "valid accuracy" : 0.42 ,
66+ "train loss" : 0.8737268534898758 ,
67+ "train samples" : 1000 ,
68+ "train time" : 50.654031636193395 ,
69+ "eval time" : 17.10858508758247 ,
70+ "tokens / sec" : 4179.70679057898 ,
71+ "mem allocated avg" : 27892705824.768 ,
72+ "mem reserved avg" : 36960483672.064 ,
73+ "elapsed time" : 106.69924115203321
74+ },
75+ {
76+ "step" : 500 ,
77+ "valid accuracy" : 0.3 ,
78+ "train loss" : 0.706649267077446 ,
79+ "train samples" : 2000 ,
80+ "train time" : 49.455417566001415 ,
81+ "eval time" : 17.0339136980474 ,
82+ "tokens / sec" : 4205.707084009904 ,
83+ "mem allocated avg" : 27883097303.04 ,
84+ "mem reserved avg" : 36874945036.288 ,
85+ "elapsed time" : 196.98712424002588
86+ },
87+ {
88+ "step" : 750 ,
89+ "valid accuracy" : 0.4 ,
90+ "train loss" : 0.7112378623485566 ,
91+ "train samples" : 3000 ,
92+ "train time" : 49.469826178625226 ,
93+ "eval time" : 17.050548058003187 ,
94+ "tokens / sec" : 4333.975203912031 ,
95+ "mem allocated avg" : 27893185933.312 ,
96+ "mem reserved avg" : 37054301863.936 ,
97+ "elapsed time" : 287.14699434675276
98+ },
99+ {
100+ "step" : 1000 ,
101+ "valid accuracy" : 0.4 ,
102+ "train loss" : 0.6787356187105179 ,
103+ "train samples" : 4000 ,
104+ "train time" : 49.53192405030131 ,
105+ "eval time" : 17.035450777038932 ,
106+ "tokens / sec" : 4206.095442374253 ,
107+ "mem allocated avg" : 27886242983.936 ,
108+ "mem reserved avg" : 36932272783.36 ,
109+ "elapsed time" : 378.03659191541374
110+ },
111+ {
112+ "step" : 1250 ,
113+ "valid accuracy" : 0.48 ,
114+ "train loss" : 0.6607321311235428 ,
115+ "train samples" : 5000 ,
116+ "train time" : 49.44732685945928 ,
117+ "eval time" : 9.998461779206991 ,
118+ "tokens / sec" : 4217.3766155795065 ,
119+ "mem allocated avg" : 27885243686.912 ,
120+ "mem reserved avg" : 36913549410.304 ,
121+ "elapsed time" : 462.07444413751364
122+ },
123+ {
124+ "step" : 1500 ,
125+ "valid accuracy" : 0.42 ,
126+ "train loss" : 0.6361023392677307 ,
127+ "train samples" : 6000 ,
128+ "train time" : 49.50303632207215 ,
129+ "eval time" : 9.531860370188951 ,
130+ "tokens / sec" : 4228.649706213367 ,
131+ "mem allocated avg" : 27886243244.032 ,
132+ "mem reserved avg" : 36938178363.392 ,
133+ "elapsed time" : 545.6157620940357
134+ },
135+ {
136+ "step" : 1750 ,
137+ "valid accuracy" : 0.42 ,
138+ "train loss" : 0.6153428200483322 ,
139+ "train samples" : 7000 ,
140+ "train time" : 49.356958812102675 ,
141+ "eval time" : 17.035431072115898 ,
142+ "tokens / sec" : 4241.6511275946905 ,
143+ "mem allocated avg" : 27888012863.488 ,
144+ "mem reserved avg" : 36950635446.272 ,
145+ "elapsed time" : 636.3067722842097
146+ },
147+ {
148+ "step" : 2000 ,
149+ "valid accuracy" : 0.5 ,
150+ "train loss" : 0.6005183280706405 ,
151+ "train samples" : 8000 ,
152+ "train time" : 49.20968849770725 ,
153+ "eval time" : 17.04335389100015 ,
154+ "tokens / sec" : 4220.632284833034 ,
155+ "mem allocated avg" : 27884932820.992 ,
156+ "mem reserved avg" : 36899943088.128 ,
157+ "elapsed time" : 726.614170236513
158+ },
159+ {
160+ "step" : 2250 ,
161+ "valid accuracy" : 0.46 ,
162+ "train loss" : 0.5723800752162933 ,
163+ "train samples" : 9000 ,
164+ "train time" : 49.73068151436746 ,
165+ "eval time" : 17.04573674313724 ,
166+ "tokens / sec" : 4322.241189031371 ,
167+ "mem allocated avg" : 27895625330.688 ,
168+ "mem reserved avg" : 37090221883.392 ,
169+ "elapsed time" : 817.9893315602094
170+ },
171+ {
172+ "step" : 2500 ,
173+ "valid accuracy" : 0.6 ,
174+ "train loss" : 0.5600862271785736 ,
175+ "train samples" : 10000 ,
176+ "train time" : 48.890957264229655 ,
177+ "eval time" : 17.02940934151411 ,
178+ "tokens / sec" : 4212.783130566615 ,
179+ "mem allocated avg" : 27882288250.88 ,
180+ "mem reserved avg" : 36840266530.816 ,
181+ "elapsed time" : 906.5962386727333
182+ },
183+ {
184+ "step" : 2750 ,
185+ "valid accuracy" : 0.54 ,
186+ "train loss" : 0.5380131875276566 ,
187+ "train samples" : 11000 ,
188+ "train time" : 49.336590841412544 ,
189+ "eval time" : 10.081329967826605 ,
190+ "tokens / sec" : 4294.601560149747 ,
191+ "mem allocated avg" : 27892309329.92 ,
192+ "mem reserved avg" : 37012685979.648 ,
193+ "elapsed time" : 989.601529257372
194+ },
195+ {
196+ "step" : 3000 ,
197+ "valid accuracy" : 0.6 ,
198+ "train loss" : 0.5155149220228196 ,
199+ "train samples" : 12000 ,
200+ "train time" : 49.203675450757146 ,
201+ "eval time" : 11.957756957039237 ,
202+ "tokens / sec" : 4242.183090750958 ,
203+ "mem allocated avg" : 27887082600.448 ,
204+ "mem reserved avg" : 36930251128.832 ,
205+ "elapsed time" : 1074.3195775337517
206+ },
207+ {
208+ "step" : 3250 ,
209+ "valid accuracy" : 0.66 ,
210+ "train loss" : 0.5271206270456315 ,
211+ "train samples" : 13000 ,
212+ "train time" : 49.4996285866946 ,
213+ "eval time" : 17.057839507237077 ,
214+ "tokens / sec" : 4260.6582316193335 ,
215+ "mem allocated avg" : 27888553652.224 ,
216+ "mem reserved avg" : 36957832871.936 ,
217+ "elapsed time" : 1165.3889896385372
218+ },
219+ {
220+ "step" : 3500 ,
221+ "valid accuracy" : 0.6 ,
222+ "train loss" : 0.5041869692802429 ,
223+ "train samples" : 14000 ,
224+ "train time" : 49.48238063044846 ,
225+ "eval time" : 10.848188759759068 ,
226+ "tokens / sec" : 4238.882554307271 ,
227+ "mem allocated avg" : 27886496616.448 ,
228+ "mem reserved avg" : 36946550194.176 ,
229+ "elapsed time" : 1249.9889227095991
230+ },
231+ {
232+ "step" : 3750 ,
233+ "valid accuracy" : 0.64 ,
234+ "train loss" : 0.503728393316269 ,
235+ "train samples" : 15000 ,
236+ "train time" : 49.83149162121117 ,
237+ "eval time" : 10.790844598785043 ,
238+ "tokens / sec" : 4348.715901326916 ,
239+ "mem allocated avg" : 27898321977.344 ,
240+ "mem reserved avg" : 37120454426.624 ,
241+ "elapsed time" : 1335.144711509347
242+ },
243+ {
244+ "step" : 4000 ,
245+ "valid accuracy" : 0.6 ,
246+ "train loss" : 0.5094073206186295 ,
247+ "train samples" : 16000 ,
248+ "train time" : 49.31607539579272 ,
249+ "eval time" : 10.857380656525493 ,
250+ "tokens / sec" : 4144.145663655863 ,
251+ "mem allocated avg" : 27880284809.216 ,
252+ "mem reserved avg" : 36817315299.328 ,
253+ "elapsed time" : 1419.5142810810357
254+ },
255+ {
256+ "step" : 4250 ,
257+ "valid accuracy" : 0.62 ,
258+ "train loss" : 0.5039986494779587 ,
259+ "train samples" : 17000 ,
260+ "train time" : 49.57314972765744 ,
261+ "eval time" : 10.780956281349063 ,
262+ "tokens / sec" : 4264.183356541164 ,
263+ "mem allocated avg" : 27890458138.624 ,
264+ "mem reserved avg" : 36982679928.832 ,
265+ "elapsed time" : 1504.5707566738129
266+ },
267+ {
268+ "step" : 4500 ,
269+ "valid accuracy" : 0.6 ,
270+ "train loss" : 0.5099123200178146 ,
271+ "train samples" : 18000 ,
272+ "train time" : 49.25641443952918 ,
273+ "eval time" : 10.854705560952425 ,
274+ "tokens / sec" : 4219.105315818973 ,
275+ "mem allocated avg" : 27885825058.816 ,
276+ "mem reserved avg" : 36892141682.688 ,
277+ "elapsed time" : 1588.7291173245758
278+ },
279+ {
280+ "step" : 4750 ,
281+ "valid accuracy" : 0.64 ,
282+ "train loss" : 0.5009565546512603 ,
283+ "train samples" : 19000 ,
284+ "train time" : 49.56661003828049 ,
285+ "eval time" : 11.43848267942667 ,
286+ "tokens / sec" : 4235.492397762592 ,
287+ "mem allocated avg" : 27887769325.568 ,
288+ "mem reserved avg" : 36944511762.432 ,
289+ "elapsed time" : 1674.5024977531284
290+ },
291+ {
292+ "step" : 5000 ,
293+ "valid accuracy" : 0.58 ,
294+ "train loss" : 0.5061850098371505 ,
295+ "train samples" : 20000 ,
296+ "train time" : 49.38067917525768 ,
297+ "eval time" : 10.836303755640984 ,
298+ "tokens / sec" : 4217.843972149319 ,
299+ "mem allocated avg" : 27883190544.384 ,
300+ "mem reserved avg" : 36882184404.992 ,
301+ "elapsed time" : 1759.2127967737615
302+ },
303+ {
304+ "step" : 5000 ,
305+ "test accuracy" : 0.5572403335860501 ,
306+ "train loss" : 0.5061850098371505 ,
307+ "train samples" : 20000 ,
308+ "train total tokens" : 4198051
309+ }
310+ ]
311+ },
312+ "meta_info" : {
313+ "model_info" : {
314+ "sha" : " 13afe5124825b4f3751f836b40dafda64c1ed062" ,
315+ "created_at" : " 2024-09-18T15:23:48+00:00"
316+ },
317+ "dataset_info" : {
318+ "metamath" : {
319+ "sha" : " aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18" ,
320+ "created_at" : " 2023-09-21T17:22:46+00:00"
321+ },
322+ "gsm8k" : {
323+ "sha" : " e53f048856ff4f594e959d75785d2c2d37b678ee" ,
324+ "created_at" : " 2022-04-12T10:22:10+00:00"
325+ }
326+ },
327+ "package_info" : {
328+ "transformers-version" : " 4.56.1" ,
329+ "transformers-commit-hash" : null ,
330+ "peft-version" : " 0.16.1.dev0" ,
331+ "peft-commit-hash" : " 845479e2eabeb26da93a0e6465f2e9e0eab09abc" ,
332+ "datasets-version" : " 4.0.0" ,
333+ "datasets-commit-hash" : null ,
334+ "bitsandbytes-version" : " 0.47.0" ,
335+ "bitsandbytes-commit-hash" : null ,
336+ "torch-version" : " 2.8.0+cu128" ,
337+ "torch-commit-hash" : null
338+ },
339+ "system_info" : {
340+ "system" : " Linux" ,
341+ "release" : " 5.14.0-547.el9.x86_64" ,
342+ "version" : " #1 SMP PREEMPT_DYNAMIC Mon Dec 30 20:10:38 UTC 2024" ,
343+ "machine" : " x86_64" ,
344+ "processor" : " x86_64" ,
345+ "gpu" : " NVIDIA H100 80GB HBM3"
346+ },
347+ "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX512\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 91.0.2 (built against CUDA 12.9)\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=a1cb3cc05d46d198467bebbb6e8fba50a325d4e7, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.8.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n"
348+ }
349+ }
0 commit comments