Add results of the OSF method on the MetaMathQA benchmark

NikhilNayak-debug · NikhilNayak-debug · commit 2d435a557664 · 2025-09-16T17:25:35.000Z
diff --git a/method_comparison/MetaMathQA/experiments/osf/llama-3.2-3B-default/adapter_config.json b/method_comparison/MetaMathQA/experiments/osf/llama-3.2-3B-default/adapter_config.json
@@ -0,0 +1,20 @@
+{
+  "task_type": null,
+  "peft_type": "OSF",
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+  "revision": null,
+  "inference_mode": false,
+  "effective_rank": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "rank_pattern": null
+}
+
diff --git a/method_comparison/MetaMathQA/experiments/osf/llama-3.2-3B-default/training_params.json b/method_comparison/MetaMathQA/experiments/osf/llama-3.2-3B-default/training_params.json
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 5e-5
+  }
+}
+
diff --git a/method_comparison/MetaMathQA/results/osf--llama-3.2-3B-default.json b/method_comparison/MetaMathQA/results/osf--llama-3.2-3B-default.json
@@ -0,0 +1,349 @@
+{
+  "run_info": {
+    "created_at": "2025-09-16T16:39:46+00:00",
+    "total_time": 2239.912140868604,
+    "experiment_name": "osf/llama-3.2-3B-default",
+    "peft_branch": "orthogonal-subspace-learning",
+    "train_config": {
+      "model_id": "meta-llama/Llama-3.2-3B",
+      "dtype": "bfloat16",
+      "max_seq_length": 768,
+      "batch_size": 4,
+      "batch_size_eval": 50,
+      "max_steps": 5000,
+      "eval_steps": 250,
+      "compile": false,
+      "query_template": "Question: {query} Think step by step.\nAnswer:",
+      "seed": 0,
+      "grad_norm_clip": 1.0,
+      "optimizer_type": "AdamW",
+      "optimizer_kwargs": {
+        "lr": 5e-05
+      },
+      "lr_scheduler": "cosine",
+      "use_amp": false,
+      "autocast_adapter_dtype": true,
+      "generation_kwargs": {
+        "max_length": 800,
+        "max_new_tokens": 300
+      },
+      "attn_implementation": null
+    },
+    "peft_config": {
+      "task_type": null,
+      "peft_type": "OSF",
+      "auto_mapping": null,
+      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+      "revision": null,
+      "inference_mode": false,
+      "effective_rank": null,
+      "target_modules": [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "gate_proj",
+        "down_proj",
+        "up_proj"
+      ],
+      "rank_pattern": null
+    },
+    "error_msg": ""
+  },
+  "train_info": {
+    "cuda_memory_reserved_avg": 36947070287,
+    "cuda_memory_max": 48360325120,
+    "cuda_memory_reserved_99th": 43331459481,
+    "train_time": 1869.4566851742566,
+    "file_size": 4199070800,
+    "num_trainable_params": 2099492864,
+    "num_total_params": 5312242688,
+    "status": "success",
+    "metrics": [
+      {
+        "step": 250,
+        "valid accuracy": 0.42,
+        "train loss": 0.8737268534898758,
+        "train samples": 1000,
+        "train time": 50.654031636193395,
+        "eval time": 17.10858508758247,
+        "tokens / sec": 4179.70679057898,
+        "mem allocated avg": 27892705824.768,
+        "mem reserved avg": 36960483672.064,
+        "elapsed time": 106.69924115203321
+      },
+      {
+        "step": 500,
+        "valid accuracy": 0.3,
+        "train loss": 0.706649267077446,
+        "train samples": 2000,
+        "train time": 49.455417566001415,
+        "eval time": 17.0339136980474,
+        "tokens / sec": 4205.707084009904,
+        "mem allocated avg": 27883097303.04,
+        "mem reserved avg": 36874945036.288,
+        "elapsed time": 196.98712424002588
+      },
+      {
+        "step": 750,
+        "valid accuracy": 0.4,
+        "train loss": 0.7112378623485566,
+        "train samples": 3000,
+        "train time": 49.469826178625226,
+        "eval time": 17.050548058003187,
+        "tokens / sec": 4333.975203912031,
+        "mem allocated avg": 27893185933.312,
+        "mem reserved avg": 37054301863.936,
+        "elapsed time": 287.14699434675276
+      },
+      {
+        "step": 1000,
+        "valid accuracy": 0.4,
+        "train loss": 0.6787356187105179,
+        "train samples": 4000,
+        "train time": 49.53192405030131,
+        "eval time": 17.035450777038932,
+        "tokens / sec": 4206.095442374253,
+        "mem allocated avg": 27886242983.936,
+        "mem reserved avg": 36932272783.36,
+        "elapsed time": 378.03659191541374
+      },
+      {
+        "step": 1250,
+        "valid accuracy": 0.48,
+        "train loss": 0.6607321311235428,
+        "train samples": 5000,
+        "train time": 49.44732685945928,
+        "eval time": 9.998461779206991,
+        "tokens / sec": 4217.3766155795065,
+        "mem allocated avg": 27885243686.912,
+        "mem reserved avg": 36913549410.304,
+        "elapsed time": 462.07444413751364
+      },
+      {
+        "step": 1500,
+        "valid accuracy": 0.42,
+        "train loss": 0.6361023392677307,
+        "train samples": 6000,
+        "train time": 49.50303632207215,
+        "eval time": 9.531860370188951,
+        "tokens / sec": 4228.649706213367,
+        "mem allocated avg": 27886243244.032,
+        "mem reserved avg": 36938178363.392,
+        "elapsed time": 545.6157620940357
+      },
+      {
+        "step": 1750,
+        "valid accuracy": 0.42,
+        "train loss": 0.6153428200483322,
+        "train samples": 7000,
+        "train time": 49.356958812102675,
+        "eval time": 17.035431072115898,
+        "tokens / sec": 4241.6511275946905,
+        "mem allocated avg": 27888012863.488,
+        "mem reserved avg": 36950635446.272,
+        "elapsed time": 636.3067722842097
+      },
+      {
+        "step": 2000,
+        "valid accuracy": 0.5,
+        "train loss": 0.6005183280706405,
+        "train samples": 8000,
+        "train time": 49.20968849770725,
+        "eval time": 17.04335389100015,
+        "tokens / sec": 4220.632284833034,
+        "mem allocated avg": 27884932820.992,
+        "mem reserved avg": 36899943088.128,
+        "elapsed time": 726.614170236513
+      },
+      {
+        "step": 2250,
+        "valid accuracy": 0.46,
+        "train loss": 0.5723800752162933,
+        "train samples": 9000,
+        "train time": 49.73068151436746,
+        "eval time": 17.04573674313724,
+        "tokens / sec": 4322.241189031371,
+        "mem allocated avg": 27895625330.688,
+        "mem reserved avg": 37090221883.392,
+        "elapsed time": 817.9893315602094
+      },
+      {
+        "step": 2500,
+        "valid accuracy": 0.6,
+        "train loss": 0.5600862271785736,
+        "train samples": 10000,
+        "train time": 48.890957264229655,
+        "eval time": 17.02940934151411,
+        "tokens / sec": 4212.783130566615,
+        "mem allocated avg": 27882288250.88,
+        "mem reserved avg": 36840266530.816,
+        "elapsed time": 906.5962386727333
+      },
+      {
+        "step": 2750,
+        "valid accuracy": 0.54,
+        "train loss": 0.5380131875276566,
+        "train samples": 11000,
+        "train time": 49.336590841412544,
+        "eval time": 10.081329967826605,
+        "tokens / sec": 4294.601560149747,
+        "mem allocated avg": 27892309329.92,
+        "mem reserved avg": 37012685979.648,
+        "elapsed time": 989.601529257372
+      },
+      {
+        "step": 3000,
+        "valid accuracy": 0.6,
+        "train loss": 0.5155149220228196,
+        "train samples": 12000,
+        "train time": 49.203675450757146,
+        "eval time": 11.957756957039237,
+        "tokens / sec": 4242.183090750958,
+        "mem allocated avg": 27887082600.448,
+        "mem reserved avg": 36930251128.832,
+        "elapsed time": 1074.3195775337517
+      },
+      {
+        "step": 3250,
+        "valid accuracy": 0.66,
+        "train loss": 0.5271206270456315,
+        "train samples": 13000,
+        "train time": 49.4996285866946,
+        "eval time": 17.057839507237077,
+        "tokens / sec": 4260.6582316193335,
+        "mem allocated avg": 27888553652.224,
+        "mem reserved avg": 36957832871.936,
+        "elapsed time": 1165.3889896385372
+      },
+      {
+        "step": 3500,
+        "valid accuracy": 0.6,
+        "train loss": 0.5041869692802429,
+        "train samples": 14000,
+        "train time": 49.48238063044846,
+        "eval time": 10.848188759759068,
+        "tokens / sec": 4238.882554307271,
+        "mem allocated avg": 27886496616.448,
+        "mem reserved avg": 36946550194.176,
+        "elapsed time": 1249.9889227095991
+      },
+      {
+        "step": 3750,
+        "valid accuracy": 0.64,
+        "train loss": 0.503728393316269,
+        "train samples": 15000,
+        "train time": 49.83149162121117,
+        "eval time": 10.790844598785043,
+        "tokens / sec": 4348.715901326916,
+        "mem allocated avg": 27898321977.344,
+        "mem reserved avg": 37120454426.624,
+        "elapsed time": 1335.144711509347
+      },
+      {
+        "step": 4000,
+        "valid accuracy": 0.6,
+        "train loss": 0.5094073206186295,
+        "train samples": 16000,
+        "train time": 49.31607539579272,
+        "eval time": 10.857380656525493,
+        "tokens / sec": 4144.145663655863,
+        "mem allocated avg": 27880284809.216,
+        "mem reserved avg": 36817315299.328,
+        "elapsed time": 1419.5142810810357
+      },
+      {
+        "step": 4250,
+        "valid accuracy": 0.62,
+        "train loss": 0.5039986494779587,
+        "train samples": 17000,
+        "train time": 49.57314972765744,
+        "eval time": 10.780956281349063,
+        "tokens / sec": 4264.183356541164,
+        "mem allocated avg": 27890458138.624,
+        "mem reserved avg": 36982679928.832,
+        "elapsed time": 1504.5707566738129
+      },
+      {
+        "step": 4500,
+        "valid accuracy": 0.6,
+        "train loss": 0.5099123200178146,
+        "train samples": 18000,
+        "train time": 49.25641443952918,
+        "eval time": 10.854705560952425,
+        "tokens / sec": 4219.105315818973,
+        "mem allocated avg": 27885825058.816,
+        "mem reserved avg": 36892141682.688,
+        "elapsed time": 1588.7291173245758
+      },
+      {
+        "step": 4750,
+        "valid accuracy": 0.64,
+        "train loss": 0.5009565546512603,
+        "train samples": 19000,
+        "train time": 49.56661003828049,
+        "eval time": 11.43848267942667,
+        "tokens / sec": 4235.492397762592,
+        "mem allocated avg": 27887769325.568,
+        "mem reserved avg": 36944511762.432,
+        "elapsed time": 1674.5024977531284
+      },
+      {
+        "step": 5000,
+        "valid accuracy": 0.58,
+        "train loss": 0.5061850098371505,
+        "train samples": 20000,
+        "train time": 49.38067917525768,
+        "eval time": 10.836303755640984,
+        "tokens / sec": 4217.843972149319,
+        "mem allocated avg": 27883190544.384,
+        "mem reserved avg": 36882184404.992,
+        "elapsed time": 1759.2127967737615
+      },
+      {
+        "step": 5000,
+        "test accuracy": 0.5572403335860501,
+        "train loss": 0.5061850098371505,
+        "train samples": 20000,
+        "train total tokens": 4198051
+      }
+    ]
+  },
+  "meta_info": {
+    "model_info": {
+      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
+      "created_at": "2024-09-18T15:23:48+00:00"
+    },
+    "dataset_info": {
+      "metamath": {
+        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
+        "created_at": "2023-09-21T17:22:46+00:00"
+      },
+      "gsm8k": {
+        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
+        "created_at": "2022-04-12T10:22:10+00:00"
+      }
+    },
+    "package_info": {
+      "transformers-version": "4.56.1",
+      "transformers-commit-hash": null,
+      "peft-version": "0.16.1.dev0",
+      "peft-commit-hash": "845479e2eabeb26da93a0e6465f2e9e0eab09abc",
+      "datasets-version": "4.0.0",
+      "datasets-commit-hash": null,
+      "bitsandbytes-version": "0.47.0",
+      "bitsandbytes-commit-hash": null,
+      "torch-version": "2.8.0+cu128",
+      "torch-commit-hash": null
+    },
+    "system_info": {
+      "system": "Linux",
+      "release": "5.14.0-547.el9.x86_64",
+      "version": "#1 SMP PREEMPT_DYNAMIC Mon Dec 30 20:10:38 UTC 2024",
+      "machine": "x86_64",
+      "processor": "x86_64",
+      "gpu": "NVIDIA H100 80GB HBM3"
+    },
+    "pytorch_info": "PyTorch built with:\n  - GCC 13.3\n  - C++ Version: 201703\n  - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n  - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n  - OpenMP 201511 (a.k.a. OpenMP 4.5)\n  - LAPACK is enabled (usually provided by MKL)\n  - NNPACK is enabled\n  - CPU capability usage: AVX512\n  - CUDA Runtime 12.8\n  - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n  - CuDNN 91.0.2  (built against CUDA 12.9)\n    - Built with CuDNN 90.8\n  - Magma 2.6.1\n  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=a1cb3cc05d46d198467bebbb6e8fba50a325d4e7, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.8.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n"
+  }
+}
diff --git a/src/peft/tuners/osf/model.py b/src/peft/tuners/osf/model.py

-Original file line number
+Diff line change
@@ -0,0 +1,6 @@
 +{
 +  "optimizer_kwargs": {
 +    "lr": 5e-5
 +  }
 +}
++