Skip to content

Commit 2d435a5

Browse files
adding results of OSF method on MetaMathQA benchmark
1 parent 845479e commit 2d435a5

File tree

4 files changed

+425
-0
lines changed

4 files changed

+425
-0
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"task_type": null,
3+
"peft_type": "OSF",
4+
"auto_mapping": null,
5+
"base_model_name_or_path": "meta-llama/Llama-3.2-3B",
6+
"revision": null,
7+
"inference_mode": false,
8+
"effective_rank": null,
9+
"target_modules": [
10+
"q_proj",
11+
"k_proj",
12+
"v_proj",
13+
"o_proj",
14+
"gate_proj",
15+
"down_proj",
16+
"up_proj"
17+
],
18+
"rank_pattern": null
19+
}
20+
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"optimizer_kwargs": {
3+
"lr": 5e-5
4+
}
5+
}
6+
Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
{
2+
"run_info": {
3+
"created_at": "2025-09-16T16:39:46+00:00",
4+
"total_time": 2239.912140868604,
5+
"experiment_name": "osf/llama-3.2-3B-default",
6+
"peft_branch": "orthogonal-subspace-learning",
7+
"train_config": {
8+
"model_id": "meta-llama/Llama-3.2-3B",
9+
"dtype": "bfloat16",
10+
"max_seq_length": 768,
11+
"batch_size": 4,
12+
"batch_size_eval": 50,
13+
"max_steps": 5000,
14+
"eval_steps": 250,
15+
"compile": false,
16+
"query_template": "Question: {query} Think step by step.\nAnswer:",
17+
"seed": 0,
18+
"grad_norm_clip": 1.0,
19+
"optimizer_type": "AdamW",
20+
"optimizer_kwargs": {
21+
"lr": 5e-05
22+
},
23+
"lr_scheduler": "cosine",
24+
"use_amp": false,
25+
"autocast_adapter_dtype": true,
26+
"generation_kwargs": {
27+
"max_length": 800,
28+
"max_new_tokens": 300
29+
},
30+
"attn_implementation": null
31+
},
32+
"peft_config": {
33+
"task_type": null,
34+
"peft_type": "OSF",
35+
"auto_mapping": null,
36+
"base_model_name_or_path": "meta-llama/Llama-3.2-3B",
37+
"revision": null,
38+
"inference_mode": false,
39+
"effective_rank": null,
40+
"target_modules": [
41+
"q_proj",
42+
"k_proj",
43+
"v_proj",
44+
"o_proj",
45+
"gate_proj",
46+
"down_proj",
47+
"up_proj"
48+
],
49+
"rank_pattern": null
50+
},
51+
"error_msg": ""
52+
},
53+
"train_info": {
54+
"cuda_memory_reserved_avg": 36947070287,
55+
"cuda_memory_max": 48360325120,
56+
"cuda_memory_reserved_99th": 43331459481,
57+
"train_time": 1869.4566851742566,
58+
"file_size": 4199070800,
59+
"num_trainable_params": 2099492864,
60+
"num_total_params": 5312242688,
61+
"status": "success",
62+
"metrics": [
63+
{
64+
"step": 250,
65+
"valid accuracy": 0.42,
66+
"train loss": 0.8737268534898758,
67+
"train samples": 1000,
68+
"train time": 50.654031636193395,
69+
"eval time": 17.10858508758247,
70+
"tokens / sec": 4179.70679057898,
71+
"mem allocated avg": 27892705824.768,
72+
"mem reserved avg": 36960483672.064,
73+
"elapsed time": 106.69924115203321
74+
},
75+
{
76+
"step": 500,
77+
"valid accuracy": 0.3,
78+
"train loss": 0.706649267077446,
79+
"train samples": 2000,
80+
"train time": 49.455417566001415,
81+
"eval time": 17.0339136980474,
82+
"tokens / sec": 4205.707084009904,
83+
"mem allocated avg": 27883097303.04,
84+
"mem reserved avg": 36874945036.288,
85+
"elapsed time": 196.98712424002588
86+
},
87+
{
88+
"step": 750,
89+
"valid accuracy": 0.4,
90+
"train loss": 0.7112378623485566,
91+
"train samples": 3000,
92+
"train time": 49.469826178625226,
93+
"eval time": 17.050548058003187,
94+
"tokens / sec": 4333.975203912031,
95+
"mem allocated avg": 27893185933.312,
96+
"mem reserved avg": 37054301863.936,
97+
"elapsed time": 287.14699434675276
98+
},
99+
{
100+
"step": 1000,
101+
"valid accuracy": 0.4,
102+
"train loss": 0.6787356187105179,
103+
"train samples": 4000,
104+
"train time": 49.53192405030131,
105+
"eval time": 17.035450777038932,
106+
"tokens / sec": 4206.095442374253,
107+
"mem allocated avg": 27886242983.936,
108+
"mem reserved avg": 36932272783.36,
109+
"elapsed time": 378.03659191541374
110+
},
111+
{
112+
"step": 1250,
113+
"valid accuracy": 0.48,
114+
"train loss": 0.6607321311235428,
115+
"train samples": 5000,
116+
"train time": 49.44732685945928,
117+
"eval time": 9.998461779206991,
118+
"tokens / sec": 4217.3766155795065,
119+
"mem allocated avg": 27885243686.912,
120+
"mem reserved avg": 36913549410.304,
121+
"elapsed time": 462.07444413751364
122+
},
123+
{
124+
"step": 1500,
125+
"valid accuracy": 0.42,
126+
"train loss": 0.6361023392677307,
127+
"train samples": 6000,
128+
"train time": 49.50303632207215,
129+
"eval time": 9.531860370188951,
130+
"tokens / sec": 4228.649706213367,
131+
"mem allocated avg": 27886243244.032,
132+
"mem reserved avg": 36938178363.392,
133+
"elapsed time": 545.6157620940357
134+
},
135+
{
136+
"step": 1750,
137+
"valid accuracy": 0.42,
138+
"train loss": 0.6153428200483322,
139+
"train samples": 7000,
140+
"train time": 49.356958812102675,
141+
"eval time": 17.035431072115898,
142+
"tokens / sec": 4241.6511275946905,
143+
"mem allocated avg": 27888012863.488,
144+
"mem reserved avg": 36950635446.272,
145+
"elapsed time": 636.3067722842097
146+
},
147+
{
148+
"step": 2000,
149+
"valid accuracy": 0.5,
150+
"train loss": 0.6005183280706405,
151+
"train samples": 8000,
152+
"train time": 49.20968849770725,
153+
"eval time": 17.04335389100015,
154+
"tokens / sec": 4220.632284833034,
155+
"mem allocated avg": 27884932820.992,
156+
"mem reserved avg": 36899943088.128,
157+
"elapsed time": 726.614170236513
158+
},
159+
{
160+
"step": 2250,
161+
"valid accuracy": 0.46,
162+
"train loss": 0.5723800752162933,
163+
"train samples": 9000,
164+
"train time": 49.73068151436746,
165+
"eval time": 17.04573674313724,
166+
"tokens / sec": 4322.241189031371,
167+
"mem allocated avg": 27895625330.688,
168+
"mem reserved avg": 37090221883.392,
169+
"elapsed time": 817.9893315602094
170+
},
171+
{
172+
"step": 2500,
173+
"valid accuracy": 0.6,
174+
"train loss": 0.5600862271785736,
175+
"train samples": 10000,
176+
"train time": 48.890957264229655,
177+
"eval time": 17.02940934151411,
178+
"tokens / sec": 4212.783130566615,
179+
"mem allocated avg": 27882288250.88,
180+
"mem reserved avg": 36840266530.816,
181+
"elapsed time": 906.5962386727333
182+
},
183+
{
184+
"step": 2750,
185+
"valid accuracy": 0.54,
186+
"train loss": 0.5380131875276566,
187+
"train samples": 11000,
188+
"train time": 49.336590841412544,
189+
"eval time": 10.081329967826605,
190+
"tokens / sec": 4294.601560149747,
191+
"mem allocated avg": 27892309329.92,
192+
"mem reserved avg": 37012685979.648,
193+
"elapsed time": 989.601529257372
194+
},
195+
{
196+
"step": 3000,
197+
"valid accuracy": 0.6,
198+
"train loss": 0.5155149220228196,
199+
"train samples": 12000,
200+
"train time": 49.203675450757146,
201+
"eval time": 11.957756957039237,
202+
"tokens / sec": 4242.183090750958,
203+
"mem allocated avg": 27887082600.448,
204+
"mem reserved avg": 36930251128.832,
205+
"elapsed time": 1074.3195775337517
206+
},
207+
{
208+
"step": 3250,
209+
"valid accuracy": 0.66,
210+
"train loss": 0.5271206270456315,
211+
"train samples": 13000,
212+
"train time": 49.4996285866946,
213+
"eval time": 17.057839507237077,
214+
"tokens / sec": 4260.6582316193335,
215+
"mem allocated avg": 27888553652.224,
216+
"mem reserved avg": 36957832871.936,
217+
"elapsed time": 1165.3889896385372
218+
},
219+
{
220+
"step": 3500,
221+
"valid accuracy": 0.6,
222+
"train loss": 0.5041869692802429,
223+
"train samples": 14000,
224+
"train time": 49.48238063044846,
225+
"eval time": 10.848188759759068,
226+
"tokens / sec": 4238.882554307271,
227+
"mem allocated avg": 27886496616.448,
228+
"mem reserved avg": 36946550194.176,
229+
"elapsed time": 1249.9889227095991
230+
},
231+
{
232+
"step": 3750,
233+
"valid accuracy": 0.64,
234+
"train loss": 0.503728393316269,
235+
"train samples": 15000,
236+
"train time": 49.83149162121117,
237+
"eval time": 10.790844598785043,
238+
"tokens / sec": 4348.715901326916,
239+
"mem allocated avg": 27898321977.344,
240+
"mem reserved avg": 37120454426.624,
241+
"elapsed time": 1335.144711509347
242+
},
243+
{
244+
"step": 4000,
245+
"valid accuracy": 0.6,
246+
"train loss": 0.5094073206186295,
247+
"train samples": 16000,
248+
"train time": 49.31607539579272,
249+
"eval time": 10.857380656525493,
250+
"tokens / sec": 4144.145663655863,
251+
"mem allocated avg": 27880284809.216,
252+
"mem reserved avg": 36817315299.328,
253+
"elapsed time": 1419.5142810810357
254+
},
255+
{
256+
"step": 4250,
257+
"valid accuracy": 0.62,
258+
"train loss": 0.5039986494779587,
259+
"train samples": 17000,
260+
"train time": 49.57314972765744,
261+
"eval time": 10.780956281349063,
262+
"tokens / sec": 4264.183356541164,
263+
"mem allocated avg": 27890458138.624,
264+
"mem reserved avg": 36982679928.832,
265+
"elapsed time": 1504.5707566738129
266+
},
267+
{
268+
"step": 4500,
269+
"valid accuracy": 0.6,
270+
"train loss": 0.5099123200178146,
271+
"train samples": 18000,
272+
"train time": 49.25641443952918,
273+
"eval time": 10.854705560952425,
274+
"tokens / sec": 4219.105315818973,
275+
"mem allocated avg": 27885825058.816,
276+
"mem reserved avg": 36892141682.688,
277+
"elapsed time": 1588.7291173245758
278+
},
279+
{
280+
"step": 4750,
281+
"valid accuracy": 0.64,
282+
"train loss": 0.5009565546512603,
283+
"train samples": 19000,
284+
"train time": 49.56661003828049,
285+
"eval time": 11.43848267942667,
286+
"tokens / sec": 4235.492397762592,
287+
"mem allocated avg": 27887769325.568,
288+
"mem reserved avg": 36944511762.432,
289+
"elapsed time": 1674.5024977531284
290+
},
291+
{
292+
"step": 5000,
293+
"valid accuracy": 0.58,
294+
"train loss": 0.5061850098371505,
295+
"train samples": 20000,
296+
"train time": 49.38067917525768,
297+
"eval time": 10.836303755640984,
298+
"tokens / sec": 4217.843972149319,
299+
"mem allocated avg": 27883190544.384,
300+
"mem reserved avg": 36882184404.992,
301+
"elapsed time": 1759.2127967737615
302+
},
303+
{
304+
"step": 5000,
305+
"test accuracy": 0.5572403335860501,
306+
"train loss": 0.5061850098371505,
307+
"train samples": 20000,
308+
"train total tokens": 4198051
309+
}
310+
]
311+
},
312+
"meta_info": {
313+
"model_info": {
314+
"sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
315+
"created_at": "2024-09-18T15:23:48+00:00"
316+
},
317+
"dataset_info": {
318+
"metamath": {
319+
"sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
320+
"created_at": "2023-09-21T17:22:46+00:00"
321+
},
322+
"gsm8k": {
323+
"sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
324+
"created_at": "2022-04-12T10:22:10+00:00"
325+
}
326+
},
327+
"package_info": {
328+
"transformers-version": "4.56.1",
329+
"transformers-commit-hash": null,
330+
"peft-version": "0.16.1.dev0",
331+
"peft-commit-hash": "845479e2eabeb26da93a0e6465f2e9e0eab09abc",
332+
"datasets-version": "4.0.0",
333+
"datasets-commit-hash": null,
334+
"bitsandbytes-version": "0.47.0",
335+
"bitsandbytes-commit-hash": null,
336+
"torch-version": "2.8.0+cu128",
337+
"torch-commit-hash": null
338+
},
339+
"system_info": {
340+
"system": "Linux",
341+
"release": "5.14.0-547.el9.x86_64",
342+
"version": "#1 SMP PREEMPT_DYNAMIC Mon Dec 30 20:10:38 UTC 2024",
343+
"machine": "x86_64",
344+
"processor": "x86_64",
345+
"gpu": "NVIDIA H100 80GB HBM3"
346+
},
347+
"pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX512\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 91.0.2 (built against CUDA 12.9)\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=a1cb3cc05d46d198467bebbb6e8fba50a325d4e7, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.8.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n"
348+
}
349+
}

0 commit comments

Comments
 (0)