diff --git a/method_comparison/MetaMathQA/results/delora--llama-3.2-3B-rank32.json b/method_comparison/MetaMathQA/results/delora--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000..317ae09c61 --- /dev/null +++ b/method_comparison/MetaMathQA/results/delora--llama-3.2-3B-rank32.json @@ -0,0 +1,354 @@ +{ + "run_info": { + "created_at": "2025-10-23T16:18:17+00:00", + "total_time": 2331.184612270001, + "experiment_name": "delora/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "DELORA", + "auto_mapping": null, + "peft_version": "0.17.2.dev0@UNKNOWN", + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "delora_lambda": 15, + "module_dropout": 0.0, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "bias": "none", + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "lambda_pattern": {}, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11956236845, + "accelerator_memory_max": 22361931776, + "accelerator_memory_reserved_99th": 17769252782, + "train_time": 2063.197599866002, + "file_size": 37417520, + "num_trainable_params": 9175096, + "num_total_params": 3221924920, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.32, + "train loss": 0.7512386105060578, + "train samples": 1000, + "train time": 37.84413140498509, + "eval time": 13.205585324998538, + "tokens / sec": 5594.500181132732, + "mem allocated avg": 6926794532.864, + "mem reserved avg": 12007369605.12, + "elapsed time": 112.85904153599768 + }, + { + "step": 500, + "valid accuracy": 0.38, + "train loss": 0.7050024774074555, + "train samples": 2000, + "train time": 37.53846677497859, + "eval time": 13.265299970000342, + "tokens / sec": 5540.849636902056, + "mem allocated avg": 6919349673.984, + "mem reserved avg": 11903770296.32, + "elapsed time": 212.84601919299894 + }, + { + "step": 750, + "valid accuracy": 0.32, + "train loss": 0.6706294032335282, + "train samples": 3000, + "train time": 37.80458352702772, + "eval time": 13.272025713999028, + "tokens / sec": 5671.29644072703, + "mem allocated avg": 6929633923.072, + "mem reserved avg": 12056694620.16, + "elapsed time": 313.49587832399993 + }, + { + "step": 1000, + "valid accuracy": 0.4, + "train loss": 0.6481547034978866, + "train samples": 4000, + "train time": 37.52610543700939, + "eval time": 13.21725967599923, + "tokens / sec": 5551.761835496328, + "mem allocated avg": 6919568891.904, + "mem reserved avg": 11917057851.392, + "elapsed time": 413.16383353999845 + }, + { + "step": 1250, + "valid accuracy": 0.38, + "train loss": 0.6453099972009659, + "train samples": 5000, + "train time": 37.5804522819999, + "eval time": 9.624667924999812, + "tokens / sec": 5549.108308626837, + "mem allocated avg": 6921147688.96, + "mem reserved avg": 11914943922.176, + "elapsed time": 509.47617638500014 + }, + { + "step": 1500, + "valid accuracy": 0.46, + "train loss": 0.6384247626066208, + "train samples": 6000, + "train time": 37.65730221097692, + "eval time": 9.775350372998219, + "tokens / sec": 5558.842182246954, + "mem allocated avg": 6921056847.872, + "mem reserved avg": 11953078534.144, + "elapsed time": 606.1567662300004 + }, + { + "step": 1750, + "valid accuracy": 0.48, + "train loss": 0.6297660274505615, + "train samples": 7000, + "train time": 37.82186047102368, + "eval time": 7.911249515000236, + "tokens / sec": 5535.290897717534, + "mem allocated avg": 6923910838.272, + "mem reserved avg": 11956249427.968, + "elapsed time": 701.1174360119985 + }, + { + "step": 2000, + "valid accuracy": 0.5, + "train loss": 0.6332990030050277, + "train samples": 8000, + "train time": 37.523248280005646, + "eval time": 8.530133835996821, + "tokens / sec": 5535.128474223041, + "mem allocated avg": 6920641826.816, + "mem reserved avg": 11907327066.112, + "elapsed time": 796.1569609649996 + }, + { + "step": 2250, + "valid accuracy": 0.4, + "train loss": 0.6243826431035996, + "train samples": 9000, + "train time": 38.08898475294336, + "eval time": 13.285918199999287, + "tokens / sec": 5643.311350885762, + "mem allocated avg": 6931386861.568, + "mem reserved avg": 12094938284.032, + "elapsed time": 897.2201951069983 + }, + { + "step": 2500, + "valid accuracy": 0.5, + "train loss": 0.6215927278995514, + "train samples": 10000, + "train time": 37.63880846399843, + "eval time": 13.24860273900049, + "tokens / sec": 5472.1976705773695, + "mem allocated avg": 6917278386.176, + "mem reserved avg": 11845175869.44, + "elapsed time": 998.0728250969987 + }, + { + "step": 2750, + "valid accuracy": 0.42, + "train loss": 0.6130854382514953, + "train samples": 11000, + "train time": 37.79084398697523, + "eval time": 13.198808683002426, + "tokens / sec": 5606.675523653974, + "mem allocated avg": 6926927112.192, + "mem reserved avg": 12020548108.288, + "elapsed time": 1098.4325272319984 + }, + { + "step": 3000, + "valid accuracy": 0.46, + "train loss": 0.604831589102745, + "train samples": 12000, + "train time": 37.568779274977715, + "eval time": 10.355002560001594, + "tokens / sec": 5555.969718159649, + "mem allocated avg": 6922721505.28, + "mem reserved avg": 11937609940.992, + "elapsed time": 1195.2514979959997 + }, + { + "step": 3250, + "valid accuracy": 0.4, + "train loss": 0.6124310380220414, + "train samples": 13000, + "train time": 37.70235535401662, + "eval time": 10.490295633000642, + "tokens / sec": 5593.841499282662, + "mem allocated avg": 6924630044.672, + "mem reserved avg": 11975081852.928, + "elapsed time": 1292.7081366849998 + }, + { + "step": 3500, + "valid accuracy": 0.54, + "train loss": 0.5956783784627915, + "train samples": 14000, + "train time": 37.79015436899135, + "eval time": 7.505472221000673, + "tokens / sec": 5550.387488549399, + "mem allocated avg": 6923355121.664, + "mem reserved avg": 11948884230.144, + "elapsed time": 1387.1216009819982 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.5921734108924865, + "train samples": 15000, + "train time": 37.99711803697937, + "eval time": 8.399906407001254, + "tokens / sec": 5703.143059142048, + "mem allocated avg": 6933243086.848, + "mem reserved avg": 12128694042.624, + "elapsed time": 1483.2807508709993 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.6020598074197769, + "train samples": 16000, + "train time": 37.42554273099813, + "eval time": 13.19645261199912, + "tokens / sec": 5460.78921203528, + "mem allocated avg": 6915014187.008, + "mem reserved avg": 11819355734.016, + "elapsed time": 1582.7408143280009 + }, + { + "step": 4250, + "valid accuracy": 0.5, + "train loss": 0.58726664686203, + "train samples": 17000, + "train time": 37.58307892599987, + "eval time": 9.69436509300067, + "tokens / sec": 5624.579093592081, + "mem allocated avg": 6926118213.632, + "mem reserved avg": 11987807371.264, + "elapsed time": 1679.2568312559997 + }, + { + "step": 4500, + "valid accuracy": 0.52, + "train loss": 0.5931945472955704, + "train samples": 18000, + "train time": 37.45943218199682, + "eval time": 7.795902468998975, + "tokens / sec": 5547.815006653474, + "mem allocated avg": 6920348925.952, + "mem reserved avg": 11897596280.832, + "elapsed time": 1773.5582212900008 + }, + { + "step": 4750, + "valid accuracy": 0.5, + "train loss": 0.5837668641805649, + "train samples": 19000, + "train time": 37.71794232197135, + "eval time": 10.624573600001895, + "tokens / sec": 5566.024737190049, + "mem allocated avg": 6922591481.856, + "mem reserved avg": 11951140765.696, + "elapsed time": 1871.3457676430007 + }, + { + "step": 5000, + "valid accuracy": 0.52, + "train loss": 0.5912798082828522, + "train samples": 20000, + "train time": 37.50696286400489, + "eval time": 9.267422332999558, + "tokens / sec": 5553.1022534454405, + "mem allocated avg": 6919856828.416, + "mem reserved avg": 11901413097.472, + "elapsed time": 1967.2812061679979 + }, + { + "step": 5000, + "test accuracy": 0.5056861258529188, + "train loss": 0.5912798082828522, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.57.1", + "transformers-commit-hash": null, + "peft-version": "0.17.2.dev0", + "peft-commit-hash": "a18ba67f242ab2eb74cdabab76ea2fd836b5cd83", + "datasets-version": "4.2.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.9.0+cu128", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1014-aws", + "version": "#14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n" + } +} \ No newline at end of file diff --git a/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank10-target-mlp.json b/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank10-target-mlp.json new file mode 100644 index 0000000000..7fdd1804b7 --- /dev/null +++ b/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank10-target-mlp.json @@ -0,0 +1,373 @@ +{ + "run_info": { + "created_at": "2025-10-23T16:57:13+00:00", + "total_time": 2248.6777099889987, + "experiment_name": "lora/llama-3.2-3B-rank10-target-mlp", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "peft_version": "0.17.2.dev0@UNKNOWN", + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 10, + "target_modules": [ + "down_proj", + "up_proj", + "gate_proj" + ], + "exclude_modules": null, + "lora_alpha": 20, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "alora_invocation_tokens": null, + "use_qalora": false, + "qalora_group_size": 16, + "layer_replication": null, + "lora_bias": false, + "target_parameters": null, + "arrow_config": null, + "ensure_weight_tying": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12694032954, + "accelerator_memory_max": 24712839168, + "accelerator_memory_reserved_99th": 19381941698, + "train_time": 2051.9836875680085, + "file_size": 37868984, + "num_trainable_params": 9461760, + "num_total_params": 3222211584, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.3, + "train loss": 0.9570077481269836, + "train samples": 1000, + "train time": 36.60406935900755, + "eval time": 11.010158622000745, + "tokens / sec": 5784.029035774408, + "mem allocated avg": 6935100323.84, + "mem reserved avg": 12750096957.44, + "elapsed time": 110.24424074899798 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.6960296107530594, + "train samples": 2000, + "train time": 36.118105334990105, + "eval time": 12.692174800999055, + "tokens / sec": 5758.74614880479, + "mem allocated avg": 6925969729.536, + "mem reserved avg": 12626524372.992, + "elapsed time": 209.50398311299796 + }, + { + "step": 750, + "valid accuracy": 0.34, + "train loss": 0.6628359504938126, + "train samples": 3000, + "train time": 36.51641339201524, + "eval time": 7.421836968998832, + "tokens / sec": 5871.359755360898, + "mem allocated avg": 6938496946.176, + "mem reserved avg": 12801720451.072, + "elapsed time": 304.07620789499924 + }, + { + "step": 1000, + "valid accuracy": 0.44, + "train loss": 0.6412760821580887, + "train samples": 4000, + "train time": 36.199786640008824, + "eval time": 12.675621071000933, + "tokens / sec": 5755.172042084423, + "mem allocated avg": 6928097030.144, + "mem reserved avg": 12653737017.344, + "elapsed time": 402.97702367999955 + }, + { + "step": 1250, + "valid accuracy": 0.36, + "train loss": 0.6370910699367524, + "train samples": 5000, + "train time": 36.12392316597834, + "eval time": 7.926442895997752, + "tokens / sec": 5772.850281012721, + "mem allocated avg": 6928575430.656, + "mem reserved avg": 12650138304.512, + "elapsed time": 497.5677833490008 + }, + { + "step": 1500, + "valid accuracy": 0.54, + "train loss": 0.6281076629161835, + "train samples": 6000, + "train time": 36.30386043796898, + "eval time": 7.837062710997998, + "tokens / sec": 5766.081002808941, + "mem allocated avg": 6928918276.096, + "mem reserved avg": 12675237019.648, + "elapsed time": 592.1144442160003 + }, + { + "step": 1750, + "valid accuracy": 0.36, + "train loss": 0.6193128414154053, + "train samples": 7000, + "train time": 36.462194165000255, + "eval time": 12.642417379000108, + "tokens / sec": 5741.700542008469, + "mem allocated avg": 6929703954.432, + "mem reserved avg": 12699194884.096, + "elapsed time": 691.625672238999 + }, + { + "step": 2000, + "valid accuracy": 0.4, + "train loss": 0.6216564847230911, + "train samples": 8000, + "train time": 36.21432654597447, + "eval time": 7.853862869000295, + "tokens / sec": 5735.1887998338925, + "mem allocated avg": 6928315158.528, + "mem reserved avg": 12636641034.24, + "elapsed time": 785.9735525059987 + }, + { + "step": 2250, + "valid accuracy": 0.48, + "train loss": 0.6131566362380981, + "train samples": 9000, + "train time": 36.94326955001088, + "eval time": 7.9556675359999645, + "tokens / sec": 5818.3263857851125, + "mem allocated avg": 6937815918.592, + "mem reserved avg": 12847388033.024, + "elapsed time": 881.850712686999 + }, + { + "step": 2500, + "valid accuracy": 0.48, + "train loss": 0.6090103325843811, + "train samples": 10000, + "train time": 35.94672909302972, + "eval time": 7.34414544700121, + "tokens / sec": 5729.784188902412, + "mem allocated avg": 6922815086.592, + "mem reserved avg": 12567728619.52, + "elapsed time": 975.391175255998 + }, + { + "step": 2750, + "valid accuracy": 0.54, + "train loss": 0.5998001435995102, + "train samples": 11000, + "train time": 36.57762499699311, + "eval time": 8.164194213000883, + "tokens / sec": 5792.6396264770565, + "mem allocated avg": 6934571272.192, + "mem reserved avg": 12769894072.32, + "elapsed time": 1070.6943081120007 + }, + { + "step": 3000, + "valid accuracy": 0.38, + "train loss": 0.5910915687084198, + "train samples": 12000, + "train time": 36.188985478995164, + "eval time": 12.692775247996906, + "tokens / sec": 5767.80468524468, + "mem allocated avg": 6928919042.048, + "mem reserved avg": 12673836122.112, + "elapsed time": 1169.909223751998 + }, + { + "step": 3250, + "valid accuracy": 0.48, + "train loss": 0.5988883073329926, + "train samples": 13000, + "train time": 36.27180437299103, + "eval time": 8.65234920200237, + "tokens / sec": 5814.461222586507, + "mem allocated avg": 6930257350.656, + "mem reserved avg": 12713606512.64, + "elapsed time": 1265.7605457559985 + }, + { + "step": 3500, + "valid accuracy": 0.56, + "train loss": 0.5822008575201034, + "train samples": 14000, + "train time": 36.417341429965745, + "eval time": 12.674946688999626, + "tokens / sec": 5759.618680659888, + "mem allocated avg": 6928901287.936, + "mem reserved avg": 12692374945.792, + "elapsed time": 1365.5030611420007 + }, + { + "step": 3750, + "valid accuracy": 0.6, + "train loss": 0.5811240552663803, + "train samples": 15000, + "train time": 36.86457888804216, + "eval time": 12.64913076099765, + "tokens / sec": 5878.352785695116, + "mem allocated avg": 6940823429.12, + "mem reserved avg": 12892720070.656, + "elapsed time": 1465.9017574809986 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.5901038019657135, + "train samples": 16000, + "train time": 36.045212401033496, + "eval time": 12.659105679998902, + "tokens / sec": 5669.906941487191, + "mem allocated avg": 6920815292.416, + "mem reserved avg": 12551395999.744, + "elapsed time": 1564.9818188249992 + }, + { + "step": 4250, + "valid accuracy": 0.48, + "train loss": 0.5774346487522125, + "train samples": 17000, + "train time": 36.271750094994786, + "eval time": 7.057205803001125, + "tokens / sec": 5827.923920030812, + "mem allocated avg": 6933466675.2, + "mem reserved avg": 12724444594.176, + "elapsed time": 1659.218996085001 + }, + { + "step": 4500, + "valid accuracy": 0.52, + "train loss": 0.5839375752210617, + "train samples": 18000, + "train time": 36.09228529396205, + "eval time": 7.738268649998645, + "tokens / sec": 5757.961800073832, + "mem allocated avg": 6927241906.176, + "mem reserved avg": 12634778763.264, + "elapsed time": 1753.44311027 + }, + { + "step": 4750, + "valid accuracy": 0.5, + "train loss": 0.5752255419492721, + "train samples": 19000, + "train time": 36.37206586197499, + "eval time": 7.730858285998693, + "tokens / sec": 5771.984489324259, + "mem allocated avg": 6928856346.624, + "mem reserved avg": 12691150209.024, + "elapsed time": 1848.2997361499984 + }, + { + "step": 5000, + "valid accuracy": 0.52, + "train loss": 0.5811339800357819, + "train samples": 20000, + "train time": 36.14202177399784, + "eval time": 8.250298782000755, + "tokens / sec": 5762.820942956926, + "mem allocated avg": 6926857431.04, + "mem reserved avg": 12628051099.648, + "elapsed time": 1943.1900490109983 + }, + { + "step": 5000, + "test accuracy": 0.5261561789234268, + "train loss": 0.5811339800357819, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.57.1", + "transformers-commit-hash": null, + "peft-version": "0.17.2.dev0", + "peft-commit-hash": "a18ba67f242ab2eb74cdabab76ea2fd836b5cd83", + "datasets-version": "4.2.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.9.0+cu128", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1014-aws", + "version": "#14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n" + } +} \ No newline at end of file diff --git a/method_comparison/MetaMathQA/results/oft--llama-3.2-3B-rank32.json b/method_comparison/MetaMathQA/results/oft--llama-3.2-3B-rank32.json index b57f300fa3..0b7741e369 100644 --- a/method_comparison/MetaMathQA/results/oft--llama-3.2-3B-rank32.json +++ b/method_comparison/MetaMathQA/results/oft--llama-3.2-3B-rank32.json @@ -1,7 +1,7 @@ { "run_info": { - "created_at": "2025-07-31T14:11:12+00:00", - "total_time": 2493.9155955019996, + "created_at": "2025-10-23T17:34:45+00:00", + "total_time": 2374.6856670790003, "experiment_name": "oft/llama-3.2-3B-rank32", "peft_branch": "main", "train_config": { @@ -34,6 +34,7 @@ "task_type": null, "peft_type": "OFT", "auto_mapping": null, + "peft_version": "0.17.2.dev0@UNKNOWN", "base_model_name_or_path": "meta-llama/Llama-3.2-3B", "revision": null, "inference_mode": false, @@ -60,10 +61,10 @@ "error_msg": "" }, "train_info": { - "accelerator_memory_reserved_avg": 12057354384, - "accelerator_memory_max": 22294822912, - "accelerator_memory_reserved_99th": 17939310837, - "train_time": 2214.446992367006, + "accelerator_memory_reserved_avg": 12097176784, + "accelerator_memory_max": 22328377344, + "accelerator_memory_reserved_99th": 17958185205, + "train_time": 2166.5656557240145, "file_size": 32693568, "num_trainable_params": 8171520, "num_total_params": 3220921344, @@ -72,247 +73,247 @@ { "step": 250, "valid accuracy": 0.36, - "train loss": 0.9631057088375091, + "train loss": 0.9631274998188019, "train samples": 1000, - "train time": 43.418166981995455, - "eval time": 16.96007740999994, - "tokens / sec": 4876.276791873667, - "mem allocated avg": 6903823460.352, - "mem reserved avg": 12108561383.424, - "elapsed time": 113.91408998500083 + "train time": 40.319602065053914, + "eval time": 14.108862943998247, + "tokens / sec": 5251.019086408657, + "mem allocated avg": 6909552105.472, + "mem reserved avg": 12148658929.664, + "elapsed time": 117.40419055000166 }, { "step": 500, - "valid accuracy": 0.36, - "train loss": 0.7144306401014328, + "valid accuracy": 0.3, + "train loss": 0.7145850785970688, "train samples": 2000, - "train time": 42.455775934988196, - "eval time": 16.150497423999695, - "tokens / sec": 4899.097835792689, - "mem allocated avg": 6896105342.976, - "mem reserved avg": 11994249822.208, - "elapsed time": 220.49977440600014 + "train time": 39.82235778199902, + "eval time": 8.958179848999862, + "tokens / sec": 5223.07094769814, + "mem allocated avg": 6901974622.208, + "mem reserved avg": 12035630825.472, + "elapsed time": 217.32610749300147 }, { "step": 750, - "valid accuracy": 0.52, - "train loss": 0.6711842056512832, + "valid accuracy": 0.46, + "train loss": 0.6711596403121948, "train samples": 3000, - "train time": 43.15603912099323, - "eval time": 10.51256339000065, - "tokens / sec": 4968.041654585135, - "mem allocated avg": 6906686986.24, - "mem reserved avg": 12155101380.608, - "elapsed time": 322.5515955810006 + "train time": 40.14594141800262, + "eval time": 8.506328391002171, + "tokens / sec": 5340.539851031025, + "mem allocated avg": 6912328740.864, + "mem reserved avg": 12194418786.304, + "elapsed time": 317.6191419630013 }, { "step": 1000, "valid accuracy": 0.48, - "train loss": 0.6508683092594146, + "train loss": 0.651293668627739, "train samples": 4000, - "train time": 42.42713372799517, - "eval time": 16.934662378998837, - "tokens / sec": 4910.442485595753, - "mem allocated avg": 6897939019.776, - "mem reserved avg": 12025262505.984, - "elapsed time": 429.7382754350001 + "train time": 39.88486097396162, + "eval time": 9.90862209899933, + "tokens / sec": 5223.435531993199, + "mem allocated avg": 6903443197.952, + "mem reserved avg": 12063405506.56, + "elapsed time": 418.50864810500207 }, { "step": 1250, - "valid accuracy": 0.4, - "train loss": 0.6453732433319092, + "valid accuracy": 0.36, + "train loss": 0.6456290460824966, "train samples": 5000, - "train time": 42.549762738994104, - "eval time": 16.92903551499876, - "tokens / sec": 4901.03790423462, - "mem allocated avg": 6897900118.016, - "mem reserved avg": 12017234608.128, - "elapsed time": 537.135011331 + "train time": 39.799740495029255, + "eval time": 10.214905517997977, + "tokens / sec": 5239.682405116313, + "mem allocated avg": 6904099018.752, + "mem reserved avg": 12058431062.016, + "elapsed time": 519.874058526002 }, { "step": 1500, - "valid accuracy": 0.5, - "train loss": 0.636857116818428, + "valid accuracy": 0.44, + "train loss": 0.6369200776815415, "train samples": 6000, - "train time": 42.7670685170051, - "eval time": 16.97714005600028, - "tokens / sec": 4894.677312679627, - "mem allocated avg": 6899436058.624, - "mem reserved avg": 12045822984.192, - "elapsed time": 644.8122739440005 + "train time": 39.7944654230123, + "eval time": 9.540699907996895, + "tokens / sec": 5260.304360790541, + "mem allocated avg": 6905092661.248, + "mem reserved avg": 12085794701.312, + "elapsed time": 620.4396147330008 }, { "step": 1750, - "valid accuracy": 0.48, - "train loss": 0.6280697054862976, + "valid accuracy": 0.46, + "train loss": 0.6281714961528778, "train samples": 7000, - "train time": 42.93359049599712, - "eval time": 11.770931148001182, - "tokens / sec": 4876.251848060996, - "mem allocated avg": 6900382935.04, - "mem reserved avg": 12059630632.96, - "elapsed time": 747.525349122001 + "train time": 39.897877080999024, + "eval time": 10.18648028700045, + "tokens / sec": 5247.271667486872, + "mem allocated avg": 6906448510.976, + "mem reserved avg": 12100340547.584, + "elapsed time": 721.9082820210024 }, { "step": 2000, - "valid accuracy": 0.4, - "train loss": 0.6299525223970414, + "valid accuracy": 0.42, + "train loss": 0.6302315661907196, "train samples": 8000, - "train time": 42.82682755300084, - "eval time": 11.5680384089992, - "tokens / sec": 4849.670448808364, - "mem allocated avg": 6896952041.472, - "mem reserved avg": 12003611508.736, - "elapsed time": 849.5279627600012 + "train time": 39.71084841699121, + "eval time": 14.071537550997164, + "tokens / sec": 5230.20807359866, + "mem allocated avg": 6903141050.368, + "mem reserved avg": 12043474173.952, + "elapsed time": 826.8578335800012 }, { "step": 2250, - "valid accuracy": 0.42, - "train loss": 0.6208749743700027, + "valid accuracy": 0.44, + "train loss": 0.6209213199615479, "train samples": 9000, - "train time": 43.43083962600576, - "eval time": 16.986704689999897, - "tokens / sec": 4949.20203825146, - "mem allocated avg": 6908628027.392, - "mem reserved avg": 12188169273.344, - "elapsed time": 958.0240945160003 + "train time": 40.21075651299543, + "eval time": 14.178777003002324, + "tokens / sec": 5345.534842910316, + "mem allocated avg": 6914497898.496, + "mem reserved avg": 12228820467.712, + "elapsed time": 933.0094860480021 }, { "step": 2500, - "valid accuracy": 0.42, - "train loss": 0.6179436918497085, + "valid accuracy": 0.44, + "train loss": 0.618088245511055, "train samples": 10000, - "train time": 42.63891591101674, - "eval time": 17.232789900999705, - "tokens / sec": 4830.493355643306, - "mem allocated avg": 6893492830.208, - "mem reserved avg": 11953867063.296, - "elapsed time": 1065.2266578140006 + "train time": 39.52404374004254, + "eval time": 14.292836533997615, + "tokens / sec": 5211.182371790845, + "mem allocated avg": 6899276843.008, + "mem reserved avg": 11993117360.128, + "elapsed time": 1037.8300729750008 }, { "step": 2750, - "valid accuracy": 0.42, - "train loss": 0.6097300077676773, + "valid accuracy": 0.5, + "train loss": 0.6095741709470749, "train samples": 11000, - "train time": 43.157022238001446, - "eval time": 17.135427543998958, - "tokens / sec": 4909.537058222485, - "mem allocated avg": 6904392247.296, - "mem reserved avg": 12124977889.28, - "elapsed time": 1173.5244531360004 + "train time": 40.033341915019264, + "eval time": 8.408460123999248, + "tokens / sec": 5292.613353383542, + "mem allocated avg": 6909805750.272, + "mem reserved avg": 12163313827.84, + "elapsed time": 1137.8264588340026 }, { "step": 3000, - "valid accuracy": 0.42, - "train loss": 0.600518134355545, + "valid accuracy": 0.38, + "train loss": 0.6007885160446167, "train samples": 12000, - "train time": 42.90499155000907, - "eval time": 17.038416949999373, - "tokens / sec": 4864.958422301702, - "mem allocated avg": 6898886381.568, - "mem reserved avg": 12038994657.28, - "elapsed time": 1281.100714346001 + "train time": 39.80941545598034, + "eval time": 9.015956413000822, + "tokens / sec": 5243.257094061238, + "mem allocated avg": 6905287532.544, + "mem reserved avg": 12079830401.024, + "elapsed time": 1237.902389021001 }, { "step": 3250, - "valid accuracy": 0.54, - "train loss": 0.6095727566480637, + "valid accuracy": 0.56, + "train loss": 0.609751238822937, "train samples": 13000, - "train time": 42.991201876006016, - "eval time": 17.145920277998812, - "tokens / sec": 4905.678157318666, - "mem allocated avg": 6900920473.6, - "mem reserved avg": 12070426771.456, - "elapsed time": 1389.080374264 + "train time": 40.0327758529711, + "eval time": 9.789832267997554, + "tokens / sec": 5268.208249524811, + "mem allocated avg": 6907088541.696, + "mem reserved avg": 12110599815.168, + "elapsed time": 1339.3388089530017 }, { "step": 3500, - "valid accuracy": 0.54, - "train loss": 0.59402192902565, + "valid accuracy": 0.52, + "train loss": 0.5943620399236679, "train samples": 14000, - "train time": 43.139979139998104, - "eval time": 10.18719298600081, - "tokens / sec": 4862.079309758545, - "mem allocated avg": 6899826102.272, - "mem reserved avg": 12054404530.176, - "elapsed time": 1490.7450829120007 + "train time": 39.922039763983776, + "eval time": 8.802732422998815, + "tokens / sec": 5253.990057622979, + "mem allocated avg": 6905655146.496, + "mem reserved avg": 12095215108.096, + "elapsed time": 1439.3830861440001 }, { "step": 3750, - "valid accuracy": 0.58, - "train loss": 0.5927710949182511, + "valid accuracy": 0.48, + "train loss": 0.5927145059108734, "train samples": 15000, - "train time": 43.49427866901169, - "eval time": 10.884315328999946, - "tokens / sec": 4982.333461582249, - "mem allocated avg": 6910839183.36, - "mem reserved avg": 12223619530.752, - "elapsed time": 1593.6702795590008 + "train time": 40.492691420033225, + "eval time": 9.00371527400057, + "tokens / sec": 5351.65711145565, + "mem allocated avg": 6916861732.864, + "mem reserved avg": 12265587736.576, + "elapsed time": 1540.9954331820009 }, { "step": 4000, - "valid accuracy": 0.52, - "train loss": 0.6036465883255004, + "valid accuracy": 0.5, + "train loss": 0.6037785897254944, "train samples": 16000, - "train time": 42.54699739801072, - "eval time": 10.508950370000093, - "tokens / sec": 4803.464697829781, - "mem allocated avg": 6892073494.528, - "mem reserved avg": 11931788247.04, - "elapsed time": 1694.1543825910012 + "train time": 39.58210096696348, + "eval time": 9.008053338999161, + "tokens / sec": 5163.268118854439, + "mem allocated avg": 6898274762.752, + "mem reserved avg": 11974511427.584, + "elapsed time": 1640.5221296710006 }, { "step": 4250, "valid accuracy": 0.5, - "train loss": 0.5904108211994171, + "train loss": 0.5905539064407349, "train samples": 17000, - "train time": 42.904117188016244, - "eval time": 10.362485865000053, - "tokens / sec": 4927.009663749569, - "mem allocated avg": 6902539771.904, - "mem reserved avg": 12087044603.904, - "elapsed time": 1795.3652429800004 + "train time": 40.03998009499628, + "eval time": 10.12545333899834, + "tokens / sec": 5279.448179006884, + "mem allocated avg": 6908281157.632, + "mem reserved avg": 12122973011.968, + "elapsed time": 1742.3377487470025 }, { "step": 4500, "valid accuracy": 0.56, - "train loss": 0.5975252593755722, + "train loss": 0.5975803916454315, "train samples": 18000, - "train time": 42.7045542899923, - "eval time": 9.970661539999128, - "tokens / sec": 4866.413043179837, - "mem allocated avg": 6897064284.16, - "mem reserved avg": 12006883065.856, - "elapsed time": 1895.7771126360003 + "train time": 39.89842279496588, + "eval time": 8.936802754000382, + "tokens / sec": 5208.677071471134, + "mem allocated avg": 6903550846.976, + "mem reserved avg": 12046091419.648, + "elapsed time": 1842.5112857700005 }, { "step": 4750, - "valid accuracy": 0.54, - "train loss": 0.588557964682579, + "valid accuracy": 0.56, + "train loss": 0.5887055099010468, "train samples": 19000, - "train time": 42.698231221012975, - "eval time": 10.72399718899942, - "tokens / sec": 4916.8078863342525, - "mem allocated avg": 6900484192.256, - "mem reserved avg": 12052575813.632, - "elapsed time": 1997.1282366079995 + "train time": 39.961028160010756, + "eval time": 9.079531961000612, + "tokens / sec": 5253.59355518503, + "mem allocated avg": 6905698629.632, + "mem reserved avg": 12090920140.8, + "elapsed time": 1943.151558054 }, { "step": 5000, "valid accuracy": 0.56, - "train loss": 0.5946548076868057, + "train loss": 0.5947723392248153, "train samples": 20000, - "train time": 42.98944765599845, - "eval time": 10.321189939999385, - "tokens / sec": 4844.909887343902, - "mem allocated avg": 6896923324.416, - "mem reserved avg": 12004861411.328, - "elapsed time": 2098.129397994 + "train time": 39.70571685399773, + "eval time": 8.965388607000932, + "tokens / sec": 5245.592234636347, + "mem allocated avg": 6902749710.336, + "mem reserved avg": 12042400432.128, + "elapsed time": 2043.0771329560012 }, { "step": 5000, - "test accuracy": 0.5056861258529188, - "train loss": 0.5946548076868057, + "test accuracy": 0.4935557240333586, + "train loss": 0.5947723392248153, "train samples": 20000, "train total tokens": 4198051 } @@ -334,25 +335,25 @@ } }, "package_info": { - "transformers-version": "4.52.4", + "transformers-version": "4.57.1", "transformers-commit-hash": null, - "peft-version": "0.16.1.dev0", - "peft-commit-hash": "25e5c6b25c4589eb2683484ede1ba3d985d8a760", - "datasets-version": "3.6.0", + "peft-version": "0.17.2.dev0", + "peft-commit-hash": "a18ba67f242ab2eb74cdabab76ea2fd836b5cd83", + "datasets-version": "4.2.0", "datasets-commit-hash": null, "bitsandbytes-version": "0.46.0", "bitsandbytes-commit-hash": null, - "torch-version": "2.7.1+cu126", + "torch-version": "2.9.0+cu128", "torch-commit-hash": null }, "system_info": { "system": "Linux", - "release": "6.8.0-1031-aws", - "version": "#33-Ubuntu SMP Fri Jun 20 18:11:07 UTC 2025", + "release": "6.14.0-1014-aws", + "version": "#14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025", "machine": "x86_64", "processor": "x86_64", "accelerator": "NVIDIA L40S" }, - "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n" } } \ No newline at end of file diff --git a/method_comparison/MetaMathQA/results/osf--llama-3.2-3B-rank128.json b/method_comparison/MetaMathQA/results/osf--llama-3.2-3B-rank128.json new file mode 100644 index 0000000000..69f9ffc565 --- /dev/null +++ b/method_comparison/MetaMathQA/results/osf--llama-3.2-3B-rank128.json @@ -0,0 +1,361 @@ +{ + "run_info": { + "created_at": "2025-10-23T18:14:24+00:00", + "total_time": 4667.161105344003, + "experiment_name": "osf/llama-3.2-3B-rank128", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 5e-05 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "OSF", + "auto_mapping": null, + "peft_version": "0.17.2.dev0@UNKNOWN", + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "effective_rank": null, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "down_proj", + "up_proj" + ], + "rank_pattern": { + "q_proj": 2944, + "o_proj": 2944, + "k_proj": 896, + "v_proj": 896, + "gate_proj": 2944, + "down_proj": 2944, + "up_proj": 2944 + }, + "init_weights": null, + "modules_to_save": null, + "target_svd_config": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 27568475262, + "accelerator_memory_max": 38503710720, + "accelerator_memory_reserved_99th": 33747495813, + "train_time": 3807.1486767399983, + "file_size": 389155304, + "num_trainable_params": 194535936, + "num_total_params": 3407285760, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.3, + "train loss": 1.1347286381721498, + "train samples": 1000, + "train time": 71.47102643898688, + "eval time": 40.76489851000224, + "tokens / sec": 2962.3052941703513, + "mem allocated avg": 16401750667.264, + "mem reserved avg": 27622234914.816, + "elapsed time": 192.81275960900166 + }, + { + "step": 500, + "valid accuracy": 0.4, + "train loss": 0.7763967347145081, + "train samples": 2000, + "train time": 70.88684602297872, + "eval time": 40.57877984700099, + "tokens / sec": 2934.183302958298, + "mem allocated avg": 16395229587.456, + "mem reserved avg": 27508837711.872, + "elapsed time": 373.68713150500116 + }, + { + "step": 750, + "valid accuracy": 0.32, + "train loss": 0.6953229109048843, + "train samples": 3000, + "train time": 71.95635187800144, + "eval time": 32.978734361000534, + "tokens / sec": 2979.597970218205, + "mem allocated avg": 16406115661.824, + "mem reserved avg": 27673027936.256, + "elapsed time": 548.7281182350016 + }, + { + "step": 1000, + "valid accuracy": 0.4, + "train loss": 0.6674379595518112, + "train samples": 4000, + "train time": 71.01955180798177, + "eval time": 40.51733253599741, + "tokens / sec": 2933.5020384708405, + "mem allocated avg": 16397466488.832, + "mem reserved avg": 27537417699.328, + "elapsed time": 729.8881993149989 + }, + { + "step": 1250, + "valid accuracy": 0.42, + "train loss": 0.6616734237670898, + "train samples": 5000, + "train time": 71.39272207195972, + "eval time": 40.68997940099507, + "tokens / sec": 2920.9980225968384, + "mem allocated avg": 16395791054.848, + "mem reserved avg": 27535354101.76, + "elapsed time": 911.6590812729992 + }, + { + "step": 1500, + "valid accuracy": 0.42, + "train loss": 0.6527736356258392, + "train samples": 6000, + "train time": 71.3452017439995, + "eval time": 40.91994198199973, + "tokens / sec": 2934.058561515047, + "mem allocated avg": 16398943606.784, + "mem reserved avg": 27551267291.136, + "elapsed time": 1093.6544228519997 + }, + { + "step": 1750, + "valid accuracy": 0.4, + "train loss": 0.6452968027591706, + "train samples": 7000, + "train time": 71.2096087580212, + "eval time": 27.92585728400445, + "tokens / sec": 2939.98244971986, + "mem allocated avg": 16398630768.64, + "mem reserved avg": 27576273731.584, + "elapsed time": 1262.488236760997 + }, + { + "step": 2000, + "valid accuracy": 0.44, + "train loss": 0.647414596915245, + "train samples": 8000, + "train time": 71.19596286901651, + "eval time": 27.970824908996292, + "tokens / sec": 2917.2440631515974, + "mem allocated avg": 16394856886.272, + "mem reserved avg": 27520036503.552, + "elapsed time": 1431.4772036520008 + }, + { + "step": 2250, + "valid accuracy": 0.46, + "train loss": 0.6401616543531418, + "train samples": 9000, + "train time": 72.0709888140409, + "eval time": 40.758223525001085, + "tokens / sec": 2982.448326810298, + "mem allocated avg": 16407562051.584, + "mem reserved avg": 27707186348.032, + "elapsed time": 1614.6744573789983 + }, + { + "step": 2500, + "valid accuracy": 0.4, + "train loss": 0.6391781423091888, + "train samples": 10000, + "train time": 70.6605427990362, + "eval time": 40.550873344996944, + "tokens / sec": 2914.8799576276306, + "mem allocated avg": 16391433314.304, + "mem reserved avg": 27455117066.24, + "elapsed time": 1795.4127528829995 + }, + { + "step": 2750, + "valid accuracy": 0.44, + "train loss": 0.633193033695221, + "train samples": 11000, + "train time": 71.95110527896759, + "eval time": 41.14912619200186, + "tokens / sec": 2944.7914549540083, + "mem allocated avg": 16402253942.784, + "mem reserved avg": 27629960822.784, + "elapsed time": 1978.6382485249997 + }, + { + "step": 3000, + "valid accuracy": 0.46, + "train loss": 0.6262783712148666, + "train samples": 12000, + "train time": 71.53329248691443, + "eval time": 27.86414769000112, + "tokens / sec": 2917.9559998329883, + "mem allocated avg": 16397906978.816, + "mem reserved avg": 27544262803.456, + "elapsed time": 2147.756047652998 + }, + { + "step": 3250, + "valid accuracy": 0.5, + "train loss": 0.6382041232585907, + "train samples": 13000, + "train time": 71.6970354819714, + "eval time": 27.99712078000448, + "tokens / sec": 2941.558163210698, + "mem allocated avg": 16398972514.304, + "mem reserved avg": 27574889611.264, + "elapsed time": 2317.192205391999 + }, + { + "step": 3500, + "valid accuracy": 0.44, + "train loss": 0.6242904909849167, + "train samples": 14000, + "train time": 71.18509741093294, + "eval time": 40.980086228999426, + "tokens / sec": 2946.5436956441617, + "mem allocated avg": 16399184805.888, + "mem reserved avg": 27556719886.336, + "elapsed time": 2498.944009259998 + }, + { + "step": 3750, + "valid accuracy": 0.46, + "train loss": 0.6247457062005997, + "train samples": 15000, + "train time": 72.02823552303016, + "eval time": 41.01247237699863, + "tokens / sec": 3008.584042444186, + "mem allocated avg": 16410260455.424, + "mem reserved avg": 27745530675.2, + "elapsed time": 2682.3083391710024 + }, + { + "step": 4000, + "valid accuracy": 0.48, + "train loss": 0.6386832315921783, + "train samples": 16000, + "train time": 71.0258735000898, + "eval time": 41.09184825400007, + "tokens / sec": 2877.4443724334005, + "mem allocated avg": 16391636211.712, + "mem reserved avg": 27441292640.256, + "elapsed time": 2863.960687703002 + }, + { + "step": 4250, + "valid accuracy": 0.48, + "train loss": 0.6240871007442474, + "train samples": 17000, + "train time": 71.68756263409159, + "eval time": 40.60824938499718, + "tokens / sec": 2948.7541804004964, + "mem allocated avg": 16400376035.328, + "mem reserved avg": 27598243495.936, + "elapsed time": 3046.081177453998 + }, + { + "step": 4500, + "valid accuracy": 0.44, + "train loss": 0.633060937166214, + "train samples": 18000, + "train time": 71.06215882203833, + "eval time": 40.82050051999977, + "tokens / sec": 2924.453794324497, + "mem allocated avg": 16395753859.072, + "mem reserved avg": 27509492023.296, + "elapsed time": 3227.7807077830003 + }, + { + "step": 4750, + "valid accuracy": 0.46, + "train loss": 0.6253616527318955, + "train samples": 19000, + "train time": 70.68259103103628, + "eval time": 40.62497806800093, + "tokens / sec": 2970.1655943514734, + "mem allocated avg": 16399317602.304, + "mem reserved avg": 27571760660.48, + "elapsed time": 3408.8530542459994 + }, + { + "step": 5000, + "valid accuracy": 0.48, + "train loss": 0.6308260992765427, + "train samples": 20000, + "train time": 71.07603502904385, + "eval time": 28.169818383001257, + "tokens / sec": 2930.382933078504, + "mem allocated avg": 16396032215.04, + "mem reserved avg": 27510599319.552, + "elapsed time": 3577.845173487003 + }, + { + "step": 5000, + "test accuracy": 0.4359363153904473, + "train loss": 0.6308260992765427, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.57.1", + "transformers-commit-hash": null, + "peft-version": "0.17.2.dev0", + "peft-commit-hash": "a18ba67f242ab2eb74cdabab76ea2fd836b5cd83", + "datasets-version": "4.2.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.9.0+cu128", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1014-aws", + "version": "#14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n" + } +} \ No newline at end of file diff --git a/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-sample_vocab-lr_0.001.json b/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-sample_vocab-lr_0.001.json new file mode 100644 index 0000000000..b01361552e --- /dev/null +++ b/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-sample_vocab-lr_0.001.json @@ -0,0 +1,349 @@ +{ + "run_info": { + "created_at": "2025-10-23T19:32:16+00:00", + "total_time": 2686.3397733460006, + "experiment_name": "prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "PROMPT_TUNING", + "auto_mapping": null, + "peft_version": "0.17.2.dev0@UNKNOWN", + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 200, + "token_dim": 3072, + "num_transformer_submodules": 1, + "num_attention_heads": 24, + "num_layers": 28, + "modules_to_save": null, + "prompt_tuning_init": "SAMPLE_VOCAB", + "prompt_tuning_init_text": null, + "tokenizer_name_or_path": null, + "tokenizer_kwargs": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 15333902725, + "accelerator_memory_max": 24423432192, + "accelerator_memory_reserved_99th": 20718058209, + "train_time": 2404.491197405987, + "file_size": 2457728, + "num_trainable_params": 614400, + "num_total_params": 3213364224, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.26, + "train loss": 1.2232583401203156, + "train samples": 1000, + "train time": 46.17508273195563, + "eval time": 16.505391887003498, + "tokens / sec": 4585.135260699362, + "mem allocated avg": 7082755428.352, + "mem reserved avg": 15369582411.776, + "elapsed time": 130.58785935799824 + }, + { + "step": 500, + "valid accuracy": 0.26, + "train loss": 0.7836624360084534, + "train samples": 2000, + "train time": 45.018037280060526, + "eval time": 13.687452188001771, + "tokens / sec": 4620.259179804925, + "mem allocated avg": 7075105931.264, + "mem reserved avg": 15270336790.528, + "elapsed time": 244.8983392040027 + }, + { + "step": 750, + "valid accuracy": 0.34, + "train loss": 0.7472633671760559, + "train samples": 3000, + "train time": 45.978240520918916, + "eval time": 16.433850564004388, + "tokens / sec": 4663.097099212682, + "mem allocated avg": 7084822022.144, + "mem reserved avg": 15411735166.976, + "elapsed time": 363.6571200059989 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.7188941253423691, + "train samples": 4000, + "train time": 45.29402179891622, + "eval time": 16.422749457000464, + "tokens / sec": 4599.635707443073, + "mem allocated avg": 7077147275.264, + "mem reserved avg": 15320416780.288, + "elapsed time": 481.36312461700436 + }, + { + "step": 1250, + "valid accuracy": 0.38, + "train loss": 0.7124129735231399, + "train samples": 5000, + "train time": 45.307238408975536, + "eval time": 10.875751104998926, + "tokens / sec": 4602.752392842549, + "mem allocated avg": 7076391862.272, + "mem reserved avg": 15302725206.016, + "elapsed time": 593.3289284760031 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.7025347559452056, + "train samples": 6000, + "train time": 45.58526100008021, + "eval time": 16.42347687900474, + "tokens / sec": 4592.07637309857, + "mem allocated avg": 7078312007.68, + "mem reserved avg": 15317346549.76, + "elapsed time": 711.5082658690008 + }, + { + "step": 1750, + "valid accuracy": 0.38, + "train loss": 0.6954681335687637, + "train samples": 7000, + "train time": 45.64173767795728, + "eval time": 16.42899393199332, + "tokens / sec": 4586.920013369872, + "mem allocated avg": 7079384836.096, + "mem reserved avg": 15339685412.864, + "elapsed time": 829.8873879540042 + }, + { + "step": 2000, + "valid accuracy": 0.38, + "train loss": 0.6959483157396317, + "train samples": 8000, + "train time": 45.487343653003336, + "eval time": 13.574101327998505, + "tokens / sec": 4566.017342854592, + "mem allocated avg": 7076109684.736, + "mem reserved avg": 15293246078.976, + "elapsed time": 945.0783910430036 + }, + { + "step": 2250, + "valid accuracy": 0.36, + "train loss": 0.6886743805408477, + "train samples": 9000, + "train time": 46.25566355796764, + "eval time": 13.330924835005135, + "tokens / sec": 4646.955279987001, + "mem allocated avg": 7087138603.008, + "mem reserved avg": 15455901188.096, + "elapsed time": 1060.9912824740022 + }, + { + "step": 2500, + "valid accuracy": 0.34, + "train loss": 0.685915477514267, + "train samples": 10000, + "train time": 45.102773971921124, + "eval time": 9.97469689600257, + "tokens / sec": 4566.614907726638, + "mem allocated avg": 7072584992.768, + "mem reserved avg": 15242411114.496, + "elapsed time": 1172.0626167860028 + }, + { + "step": 2750, + "valid accuracy": 0.34, + "train loss": 0.6786098405122757, + "train samples": 11000, + "train time": 45.94233982402511, + "eval time": 16.441506881994428, + "tokens / sec": 4611.889616671175, + "mem allocated avg": 7083189243.904, + "mem reserved avg": 15378675662.848, + "elapsed time": 1290.5899199240012 + }, + { + "step": 3000, + "valid accuracy": 0.34, + "train loss": 0.6700806043148041, + "train samples": 12000, + "train time": 45.345923172040784, + "eval time": 16.428019475999463, + "tokens / sec": 4603.081939871026, + "mem allocated avg": 7077922359.296, + "mem reserved avg": 15325441556.48, + "elapsed time": 1408.37935114 + }, + { + "step": 3250, + "valid accuracy": 0.38, + "train loss": 0.6773221861124039, + "train samples": 13000, + "train time": 45.62251189197559, + "eval time": 10.20512724499713, + "tokens / sec": 4622.739767143219, + "mem allocated avg": 7079430316.032, + "mem reserved avg": 15336581627.904, + "elapsed time": 1520.2662671619983 + }, + { + "step": 3500, + "valid accuracy": 0.36, + "train loss": 0.6638141021728515, + "train samples": 14000, + "train time": 45.362639506965934, + "eval time": 16.423270223000145, + "tokens / sec": 4623.849103132338, + "mem allocated avg": 7078731702.272, + "mem reserved avg": 15340423610.368, + "elapsed time": 1637.9249663660012 + }, + { + "step": 3750, + "valid accuracy": 0.4, + "train loss": 0.659807546377182, + "train samples": 15000, + "train time": 46.37028079503216, + "eval time": 16.430752983003913, + "tokens / sec": 4673.316535603474, + "mem allocated avg": 7089302118.4, + "mem reserved avg": 15476906262.528, + "elapsed time": 1757.2157563939982 + }, + { + "step": 4000, + "valid accuracy": 0.44, + "train loss": 0.6735307123661042, + "train samples": 16000, + "train time": 45.140112428038265, + "eval time": 10.74987911900098, + "tokens / sec": 4527.525276455804, + "mem allocated avg": 7071258566.656, + "mem reserved avg": 15226523090.944, + "elapsed time": 1869.2519954439995 + }, + { + "step": 4250, + "valid accuracy": 0.38, + "train loss": 0.6553376598358154, + "train samples": 17000, + "train time": 45.46981849010626, + "eval time": 10.198734415003855, + "tokens / sec": 4648.995905844576, + "mem allocated avg": 7081538940.928, + "mem reserved avg": 15362930245.632, + "elapsed time": 1981.2125737099996 + }, + { + "step": 4500, + "valid accuracy": 0.44, + "train loss": 0.6633048733472824, + "train samples": 18000, + "train time": 45.337251453973295, + "eval time": 11.732473295996897, + "tokens / sec": 4583.824412271184, + "mem allocated avg": 7076373809.152, + "mem reserved avg": 15290343620.608, + "elapsed time": 2094.4324725890037 + }, + { + "step": 4750, + "valid accuracy": 0.38, + "train loss": 0.6535381546020508, + "train samples": 19000, + "train time": 45.22358582002926, + "eval time": 16.417741852004838, + "tokens / sec": 4642.245770502773, + "mem allocated avg": 7078582056.96, + "mem reserved avg": 15336782954.496, + "elapsed time": 2212.095038350999 + }, + { + "step": 5000, + "valid accuracy": 0.38, + "train loss": 0.6601177526712417, + "train samples": 20000, + "train time": 45.29508365698712, + "eval time": 10.317947228002595, + "tokens / sec": 4598.291540363922, + "mem allocated avg": 7075333005.312, + "mem reserved avg": 15280059187.2, + "elapsed time": 2323.7172720720046 + }, + { + "step": 5000, + "test accuracy": 0.3912054586808188, + "train loss": 0.6601177526712417, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.57.1", + "transformers-commit-hash": null, + "peft-version": "0.17.2.dev0", + "peft-commit-hash": "a18ba67f242ab2eb74cdabab76ea2fd836b5cd83", + "datasets-version": "4.2.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.9.0+cu128", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1014-aws", + "version": "#14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n" + } +} \ No newline at end of file diff --git a/method_comparison/MetaMathQA/results/road--llama-3.2-3B-lr_0.001.json b/method_comparison/MetaMathQA/results/road--llama-3.2-3B-lr_0.001.json new file mode 100644 index 0000000000..36f2a4fa36 --- /dev/null +++ b/method_comparison/MetaMathQA/results/road--llama-3.2-3B-lr_0.001.json @@ -0,0 +1,347 @@ +{ + "run_info": { + "created_at": "2025-10-23T20:17:06+00:00", + "total_time": 2179.066774046005, + "experiment_name": "road/llama-3.2-3B-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "ROAD", + "auto_mapping": null, + "peft_version": "0.17.2.dev0@UNKNOWN", + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "variant": "road_2", + "group_size": 64, + "init_weights": true, + "target_modules": [ + "v_proj", + "q_proj" + ], + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11905060883, + "accelerator_memory_max": 22817013760, + "accelerator_memory_reserved_99th": 18119540080, + "train_time": 1957.655842856002, + "file_size": 931480, + "num_trainable_params": 229376, + "num_total_params": 3212979200, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.2, + "train loss": 1.1802424001693725, + "train samples": 1000, + "train time": 35.46842726102477, + "eval time": 12.287958328997775, + "tokens / sec": 5969.224359509503, + "mem allocated avg": 6782454061.056, + "mem reserved avg": 11957121843.2, + "elapsed time": 106.62609866999992 + }, + { + "step": 500, + "valid accuracy": 0.32, + "train loss": 0.8729077637195587, + "train samples": 2000, + "train time": 34.766947779018665, + "eval time": 12.224552476000099, + "tokens / sec": 5982.549901188677, + "mem allocated avg": 6774821242.88, + "mem reserved avg": 11843875635.2, + "elapsed time": 200.47699543899944 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.7739069720506668, + "train samples": 3000, + "train time": 35.457704256870784, + "eval time": 9.125866555994435, + "tokens / sec": 6046.668967815496, + "mem allocated avg": 6785753763.84, + "mem reserved avg": 12008544010.24, + "elapsed time": 292.6061148650042 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.7330719463825226, + "train samples": 4000, + "train time": 34.75414815990371, + "eval time": 12.21972047399322, + "tokens / sec": 5994.564995276156, + "mem allocated avg": 6776829087.744, + "mem reserved avg": 11869007904.768, + "elapsed time": 386.3837600199986 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.7227181429862977, + "train samples": 5000, + "train time": 34.886374148009054, + "eval time": 12.182244757997978, + "tokens / sec": 5977.634680957555, + "mem allocated avg": 6777144475.648, + "mem reserved avg": 11868110323.712, + "elapsed time": 480.67707514700305 + }, + { + "step": 1500, + "valid accuracy": 0.42, + "train loss": 0.7143287745714187, + "train samples": 6000, + "train time": 34.91908030194463, + "eval time": 12.299323118000757, + "tokens / sec": 5994.7455141979335, + "mem allocated avg": 6777638397.952, + "mem reserved avg": 11895960502.272, + "elapsed time": 575.0448827479995 + }, + { + "step": 1750, + "valid accuracy": 0.44, + "train loss": 0.703706993818283, + "train samples": 7000, + "train time": 34.91686388308881, + "eval time": 12.217809029003547, + "tokens / sec": 5995.813389798628, + "mem allocated avg": 6779787098.112, + "mem reserved avg": 11905875836.928, + "elapsed time": 669.3053597020044 + }, + { + "step": 2000, + "valid accuracy": 0.32, + "train loss": 0.7052462505102157, + "train samples": 8000, + "train time": 34.839815382081724, + "eval time": 7.910366256000998, + "tokens / sec": 5961.455240856959, + "mem allocated avg": 6776694951.936, + "mem reserved avg": 11852641730.56, + "elapsed time": 759.2260113459997 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.6979660025835037, + "train samples": 9000, + "train time": 35.489929292030865, + "eval time": 12.21494767999684, + "tokens / sec": 6056.591384876774, + "mem allocated avg": 6788274759.68, + "mem reserved avg": 12044455641.088, + "elapsed time": 854.9085956600029 + }, + { + "step": 2500, + "valid accuracy": 0.4, + "train loss": 0.6967317589521408, + "train samples": 10000, + "train time": 34.54721695394983, + "eval time": 12.251032476997352, + "tokens / sec": 5961.898472879782, + "mem allocated avg": 6773104156.672, + "mem reserved avg": 11792201809.92, + "elapsed time": 948.5542301430032 + }, + { + "step": 2750, + "valid accuracy": 0.36, + "train loss": 0.6901429216861725, + "train samples": 11000, + "train time": 35.248878062957374, + "eval time": 12.229859940001916, + "tokens / sec": 6010.999828748116, + "mem allocated avg": 6784006227.968, + "mem reserved avg": 11973513183.232, + "elapsed time": 1043.7242833290002 + }, + { + "step": 3000, + "valid accuracy": 0.42, + "train loss": 0.6821614302396775, + "train samples": 12000, + "train time": 34.93150638397492, + "eval time": 7.433937801004504, + "tokens / sec": 5975.4365501900265, + "mem allocated avg": 6778632783.872, + "mem reserved avg": 11887857106.944, + "elapsed time": 1133.1159700200005 + }, + { + "step": 3250, + "valid accuracy": 0.42, + "train loss": 0.691840036034584, + "train samples": 13000, + "train time": 34.95601586808334, + "eval time": 12.231384652004635, + "tokens / sec": 6033.324873060365, + "mem allocated avg": 6780070547.456, + "mem reserved avg": 11917074628.608, + "elapsed time": 1227.7147229480033 + }, + { + "step": 3500, + "valid accuracy": 0.42, + "train loss": 0.6793323725461959, + "train samples": 14000, + "train time": 35.1950762630222, + "eval time": 7.8389490870031295, + "tokens / sec": 5959.640446080646, + "mem allocated avg": 6779174166.528, + "mem reserved avg": 11899424997.376, + "elapsed time": 1318.1422208670047 + }, + { + "step": 3750, + "valid accuracy": 0.44, + "train loss": 0.6769173287153244, + "train samples": 15000, + "train time": 35.7101883490468, + "eval time": 12.202735990998917, + "tokens / sec": 6068.380202362734, + "mem allocated avg": 6789275869.184, + "mem reserved avg": 12087027826.688, + "elapsed time": 1414.2409790489983 + }, + { + "step": 4000, + "valid accuracy": 0.42, + "train loss": 0.693774617433548, + "train samples": 16000, + "train time": 34.639687986935314, + "eval time": 6.860093137001968, + "tokens / sec": 5899.966537720583, + "mem allocated avg": 6771220369.408, + "mem reserved avg": 11770911522.816, + "elapsed time": 1502.9286165810045 + }, + { + "step": 4250, + "valid accuracy": 0.42, + "train loss": 0.6743522936105728, + "train samples": 17000, + "train time": 35.23996867898677, + "eval time": 8.069594663997123, + "tokens / sec": 5998.558112398354, + "mem allocated avg": 6781373200.384, + "mem reserved avg": 11933046538.24, + "elapsed time": 1593.8458517020044 + }, + { + "step": 4500, + "valid accuracy": 0.44, + "train loss": 0.6836657630205154, + "train samples": 18000, + "train time": 34.69980391602439, + "eval time": 12.216164864999882, + "tokens / sec": 5989.025197460252, + "mem allocated avg": 6776051484.672, + "mem reserved avg": 11847029751.808, + "elapsed time": 1687.6608126920037 + }, + { + "step": 4750, + "valid accuracy": 0.38, + "train loss": 0.6764673949480057, + "train samples": 19000, + "train time": 35.06901030093286, + "eval time": 12.21244175699394, + "tokens / sec": 5986.453515467914, + "mem allocated avg": 6778213396.48, + "mem reserved avg": 11901983522.816, + "elapsed time": 1782.2256710229994 + }, + { + "step": 5000, + "valid accuracy": 0.38, + "train loss": 0.683658688902855, + "train samples": 20000, + "train time": 34.891452592011774, + "eval time": 12.243604967006831, + "tokens / sec": 5969.370276309009, + "mem allocated avg": 6775612205.056, + "mem reserved avg": 11845553356.8, + "elapsed time": 1876.6560215470017 + }, + { + "step": 5000, + "test accuracy": 0.39651250947687644, + "train loss": 0.683658688902855, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.57.1", + "transformers-commit-hash": null, + "peft-version": "0.17.2.dev0", + "peft-commit-hash": "a18ba67f242ab2eb74cdabab76ea2fd836b5cd83", + "datasets-version": "4.2.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.9.0+cu128", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1014-aws", + "version": "#14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n" + } +} \ No newline at end of file diff --git a/method_comparison/MetaMathQA/results/waveft--llama-3.2-3B-n_frequency-5000.json b/method_comparison/MetaMathQA/results/waveft--llama-3.2-3B-n_frequency-5000.json new file mode 100644 index 0000000000..04e997d6ff --- /dev/null +++ b/method_comparison/MetaMathQA/results/waveft--llama-3.2-3B-n_frequency-5000.json @@ -0,0 +1,358 @@ +{ + "run_info": { + "created_at": "2025-10-23T20:53:30+00:00", + "total_time": 3265.5213168810005, + "experiment_name": "waveft/llama-3.2-3B-n_frequency-5000", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "WAVEFT", + "auto_mapping": null, + "peft_version": "0.17.2.dev0@UNKNOWN", + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "n_frequency": 5000, + "scaling": 25.0, + "wavelet_family": "db1", + "use_idwt": true, + "random_loc_seed": 777, + "fan_in_fan_out": false, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "bias": "none", + "modules_to_save": null, + "layers_to_transform": null, + "layers_pattern": null, + "n_frequency_pattern": {}, + "proportional_parameters": false, + "init_weights": true + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 14582950762, + "accelerator_memory_max": 24914165760, + "accelerator_memory_reserved_99th": 20564693483, + "train_time": 2783.132204494017, + "file_size": 1127304, + "num_trainable_params": 280000, + "num_total_params": 3213029824, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 1.057445770263672, + "train samples": 1000, + "train time": 66.99218972800008, + "eval time": 29.364740246994188, + "tokens / sec": 3160.3534809000257, + "mem allocated avg": 6784647395.328, + "mem reserved avg": 14636350963.712, + "elapsed time": 149.00784370000474 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.7512865424156189, + "train samples": 2000, + "train time": 65.21943290103809, + "eval time": 29.0264903579955, + "tokens / sec": 3189.156831762169, + "mem allocated avg": 6776712450.048, + "mem reserved avg": 14526627971.072, + "elapsed time": 283.5782854230056 + }, + { + "step": 750, + "valid accuracy": 0.34, + "train loss": 0.6967935096025467, + "train samples": 3000, + "train time": 66.11754105806904, + "eval time": 22.38963912300096, + "tokens / sec": 3242.724949672555, + "mem allocated avg": 6786416594.944, + "mem reserved avg": 14687840239.616, + "elapsed time": 412.45540837700537 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.6762894586324691, + "train samples": 4000, + "train time": 65.32631866908923, + "eval time": 22.478863092997926, + "tokens / sec": 3189.1587379250154, + "mem allocated avg": 6778948349.952, + "mem reserved avg": 14555082129.408, + "elapsed time": 540.8314235460057 + }, + { + "step": 1250, + "valid accuracy": 0.38, + "train loss": 0.6724052220582962, + "train samples": 5000, + "train time": 65.4501353219166, + "eval time": 29.181654504995095, + "tokens / sec": 3186.2118997051034, + "mem allocated avg": 6778097270.784, + "mem reserved avg": 14548866170.88, + "elapsed time": 675.7093939700062 + }, + { + "step": 1500, + "valid accuracy": 0.48, + "train loss": 0.6683271112442016, + "train samples": 6000, + "train time": 65.66567935897183, + "eval time": 19.425667663999775, + "tokens / sec": 3187.8296553616533, + "mem allocated avg": 6779162421.248, + "mem reserved avg": 14573251854.336, + "elapsed time": 801.0151559639999 + }, + { + "step": 1750, + "valid accuracy": 0.54, + "train loss": 0.6589902213811875, + "train samples": 7000, + "train time": 65.90766002406599, + "eval time": 24.991644790003193, + "tokens / sec": 3176.48965118098, + "mem allocated avg": 6781186451.456, + "mem reserved avg": 14585163677.696, + "elapsed time": 932.4019877410028 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.6641829339265823, + "train samples": 8000, + "train time": 65.52569843604579, + "eval time": 29.105937477994303, + "tokens / sec": 3169.6876944045835, + "mem allocated avg": 6777191518.208, + "mem reserved avg": 14533355634.688, + "elapsed time": 1067.1376723650028 + }, + { + "step": 2250, + "valid accuracy": 0.4, + "train loss": 0.6568171486854554, + "train samples": 9000, + "train time": 66.60324803898402, + "eval time": 18.03150882799673, + "tokens / sec": 3227.290054595945, + "mem allocated avg": 6789178621.952, + "mem reserved avg": 14712368529.408, + "elapsed time": 1192.7977026480003 + }, + { + "step": 2500, + "valid accuracy": 0.42, + "train loss": 0.6552880892753601, + "train samples": 10000, + "train time": 64.93178476598405, + "eval time": 29.03553620800085, + "tokens / sec": 3172.0520349519234, + "mem allocated avg": 6774276726.784, + "mem reserved avg": 14475549736.96, + "elapsed time": 1327.1052960740053 + }, + { + "step": 2750, + "valid accuracy": 0.42, + "train loss": 0.6487538056373596, + "train samples": 11000, + "train time": 66.10884880107187, + "eval time": 20.587333617004333, + "tokens / sec": 3205.0323646925253, + "mem allocated avg": 6784980387.84, + "mem reserved avg": 14651936997.376, + "elapsed time": 1454.7400554460037 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.6414109219312668, + "train samples": 12000, + "train time": 65.51606800403533, + "eval time": 17.688484279002296, + "tokens / sec": 3185.951269040499, + "mem allocated avg": 6780026255.36, + "mem reserved avg": 14562925477.888, + "elapsed time": 1578.3467425210038 + }, + { + "step": 3250, + "valid accuracy": 0.4, + "train loss": 0.6511869001388549, + "train samples": 13000, + "train time": 65.77464669098845, + "eval time": 22.121913303002657, + "tokens / sec": 3206.4178313388766, + "mem allocated avg": 6781515575.296, + "mem reserved avg": 14588720447.488, + "elapsed time": 1706.5400248380029 + }, + { + "step": 3500, + "valid accuracy": 0.46, + "train loss": 0.637642817735672, + "train samples": 14000, + "train time": 65.76092355793662, + "eval time": 29.041672920000565, + "tokens / sec": 3189.584158063204, + "mem allocated avg": 6779834134.528, + "mem reserved avg": 14574015217.664, + "elapsed time": 1842.0509184040056 + }, + { + "step": 3750, + "valid accuracy": 0.42, + "train loss": 0.6350828701257706, + "train samples": 15000, + "train time": 66.55924862711254, + "eval time": 20.789683652998065, + "tokens / sec": 3255.790960232193, + "mem allocated avg": 6791231805.44, + "mem reserved avg": 14752080199.68, + "elapsed time": 1970.2113445850046 + }, + { + "step": 4000, + "valid accuracy": 0.38, + "train loss": 0.65046697640419, + "train samples": 16000, + "train time": 65.04778776894818, + "eval time": 19.624021877003543, + "tokens / sec": 3141.890093571505, + "mem allocated avg": 6772911845.376, + "mem reserved avg": 14460534128.64, + "elapsed time": 2095.6094995790045 + }, + { + "step": 4250, + "valid accuracy": 0.42, + "train loss": 0.6331748945713043, + "train samples": 17000, + "train time": 65.85189565200562, + "eval time": 23.701296111001284, + "tokens / sec": 3210.067043735313, + "mem allocated avg": 6782308450.304, + "mem reserved avg": 14607057944.576, + "elapsed time": 2225.6684009890014 + }, + { + "step": 4500, + "valid accuracy": 0.4, + "train loss": 0.641278461933136, + "train samples": 18000, + "train time": 65.11867782095214, + "eval time": 23.630613847002678, + "tokens / sec": 3191.3731505944966, + "mem allocated avg": 6778411657.216, + "mem reserved avg": 14525831053.312, + "elapsed time": 2354.2676229330027 + }, + { + "step": 4750, + "valid accuracy": 0.4, + "train loss": 0.6345745379924774, + "train samples": 19000, + "train time": 65.44978067000193, + "eval time": 23.75194463099615, + "tokens / sec": 3207.634889695251, + "mem allocated avg": 6780527521.792, + "mem reserved avg": 14582739369.984, + "elapsed time": 2484.0077965070013 + }, + { + "step": 5000, + "valid accuracy": 0.44, + "train loss": 0.6398445825576782, + "train samples": 20000, + "train time": 65.40377733100468, + "eval time": 19.663959343997703, + "tokens / sec": 3184.5255503502062, + "mem allocated avg": 6777134090.24, + "mem reserved avg": 14518717513.728, + "elapsed time": 2609.523235476001 + }, + { + "step": 5000, + "test accuracy": 0.4162244124336619, + "train loss": 0.6398445825576782, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.57.1", + "transformers-commit-hash": null, + "peft-version": "0.17.2.dev0", + "peft-commit-hash": "a18ba67f242ab2eb74cdabab76ea2fd836b5cd83", + "datasets-version": "4.2.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.9.0+cu128", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1014-aws", + "version": "#14~24.04.1-Ubuntu SMP Tue Sep 23 14:51:14 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 13.3\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.8\n - NVCC architecture flags: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_120,code=sm_120\n - CuDNN 90.7.1\n - Built with CuDNN 90.8\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=0fabc3ba44823f257e70ce397d989c8de5e362c1, CUDA_VERSION=12.8, CUDNN_VERSION=9.8.0, CXX_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF, \n" + } +} \ No newline at end of file