1- epoch , framework_config , gradient_accumulation_steps , mem_nvidia_mem_reserved , model_name_or_path , num_gpus , per_device_train_batch_size , torch_dtype , train_loss , train_runtime , train_samples_per_second , train_steps_per_second , train_tokens_per_second
2- 0.25 , none , 16.0 , 71199.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.9438143467903136 , 2371.9316 , 5.396 , 0.042 , 1505.608
3- 0.25 , none , 8.0 , 46829.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.9437569552659988 , 1355.7096 , 9.442 , 0.074 , 1317.096
4- 0.25 , none , 4.0 , 37996.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9437739425897598 , 708.3914 , 18.069 , 0.141 , 1260.32
5- 0.25 , moe-scattermoe-granite-ep1 , 16.0 , 71187.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.9439476370811464 , 742.739 , 17.234 , 0.135 , 4808.149
6- 0.25 , moe-scattermoe-granite-ep1 , 8.0 , 52503.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.9506204092502594 , 485.5103 , 26.364 , 0.206 , 3677.78
7- 0.25 , moe-scattermoe-granite-ep1 , 4.0 , 51145.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9572784686088562 , 262.9566 , 48.677 , 0.38 , 3395.238
8- 0.25 , moe-scattermoe-granite-ep2 , 8.0 , 40193.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.9437192791700364 , 577.2164 , 22.175 , 0.173 , 3093.467
9- 0.25 , moe-scattermoe-granite-ep2 , 4.0 , 40878.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9509018939733506 , 300.285 , 42.626 , 0.333 , 2973.176
10- 0.25 , moe-scattermoe-granite-ep4 , 4.0 , 31777.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9434539985656738 , 307.1264 , 41.677 , 0.326 , 2906.946
11- 0.25 , moe-scattermoe-granite-ep1-padding-free , 16.0 , 48401.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.9437484860420228 , 631.9756 , 20.254 , 0.158 , 3924.202
12- 0.25 , moe-scattermoe-granite-ep1-padding-free , 8.0 , 42452.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.9506663566827774 , 454.3444 , 28.172 , 0.22 , 2729.207
13- 0.25 , moe-scattermoe-granite-ep1-padding-free , 4.0 , 38560.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.957276314496994 , 241.2967 , 53.047 , 0.414 , 2569.451
14- 0.25 , moe-scattermoe-granite-ep2-padding-free , 8.0 , 31012.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.943688799738884 , 546.507 , 23.421 , 0.183 , 2268.955
15- 0.25 , moe-scattermoe-granite-ep2-padding-free , 4.0 , 28133.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9505942213535308 , 283.5444 , 45.143 , 0.353 , 2186.607
16- 0.25 , moe-scattermoe-granite-ep4-padding-free , 4.0 , 21585.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9441865116357804 , 284.6079 , 44.974 , 0.351 , 2178.436
17- 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 16.0 , 42651.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.9437448275089264 , 615.4528 , 20.798 , 0.162 , 4029.554
18- 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 8.0 , 37743.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.950773031115532 , 433.4811 , 29.528 , 0.231 , 2860.563
19- 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 4.0 , 35153.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9572476959228516 , 232.0428 , 55.162 , 0.431 , 2671.921
20- 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 8.0 , 26075.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.9437651455402374 , 524.7751 , 24.391 , 0.191 , 2362.917
21- 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 4.0 , 24665.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.9507779973745346 , 274.126 , 46.694 , 0.365 , 2261.733
22- 0.25 , moe-scattermoe-granite-ep4-padding-free-foak , 4.0 , 18368.0 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.943427557349205 , 278.1245 , 46.023 , 0.36 , 2229.217
23- , none ,, 65607.25 , mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.8599078696966171 , 4180.9544 , 3.062 , 0.024 , 80.364
24- , moe-scattermoe-granite-ep8 ,, 52004.75 , mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.8588122856616974 , 1071.1967 , 11.949 , 0.093 , 313.668
25- , moe-scattermoe-granite-ep8-foak ,, 51961.25 , mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.8599798053503036 , 1043.6675 , 12.264 , 0.096 , 321.942
1+ epoch , framework_config , gradient_accumulation_steps , mem_nvidia_mem_reserved , model_name_or_path , num_gpus , per_device_train_batch_size , torch_dtype , train_loss , train_runtime , train_samples_per_second , train_steps_per_second , train_tokens_per_second
2+ 0.25 , none , 16 , 72072 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.938093501 , 1986.7714 , 6.443 , 0.05 , 1797.489
3+ 0.25 , none , 8 , 49689 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.937983845 , 1082.5484 , 11.824 , 0.092 , 1649.441
4+ 0.25 , none , 4 , 41754.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.93852025 , 569.5617 , 22.473 , 0.176 , 1567.521
5+ 0.25 , moe-scattermoe-granite-ep1 , 16 , 72068 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.938054211 , 660.687 , 19.374 , 0.151 , 5405.283
6+ 0.25 , moe-scattermoe-granite-ep1 , 8 , 53917 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.944801819 , 362.751 , 35.286 , 0.276 , 4922.385
7+ 0.25 , moe-scattermoe-granite-ep1 , 4 , 53070 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.95192752 , 202.3782 , 63.248 , 0.494 , 4411.543
8+ 0.25 , moe-scattermoe-granite-ep2 , 8 , 41880 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.938050581 , 441.5269 , 28.99 , 0.226 , 4044.147
9+ 0.25 , moe-scattermoe-granite-ep2 , 4 , 43092 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.945302382 , 235.4383 , 54.367 , 0.425 , 3792.076
10+ 0.25 , moe-scattermoe-granite-ep4 , 4 , 33673.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938171822 , 259.2932 , 49.365 , 0.386 , 3443.207
11+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 16 , 49580 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.937993399 , 505.6847 , 25.312 , 0.198 , 4904.241
12+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 8 , 43821 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.944808855 , 311.785 , 41.054 , 0.321 , 3977.099
13+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 4 , 40070.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.951866873 , 169.9554 , 75.314 , 0.588 , 3648.016
14+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 16 , 49114 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.938123143 , 476.8099 , 26.845 , 0.21 , 5201.235
15+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 8 , 43865 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.944894351 , 296.5204 , 43.167 , 0.337 , 4181.837
16+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 4 , 40070.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.951975068 , 163.756 , 78.165 , 0.611 , 3786.12
17+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 8 , 32276 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.937930156 , 356.1296 , 35.942 , 0.281 , 3481.878
18+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 4 , 29787 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.945339936 , 192.7168 , 66.419 , 0.519 , 3217.156
19+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 8 , 32376 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.938017525 , 342.9327 , 37.325 , 0.292 , 3615.87
20+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 4 , 29734.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.945357794 , 184.554 , 69.356 , 0.542 , 3359.451
21+ 0.25 , moe-scattermoe-granite-ep4-padding-free , 4 , 23386.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938359724 , 191.205 , 66.944 , 0.523 , 3242.593
22+ 0.25 , moe-scattermoe-granite-ep4-padding-free-foak , 4 , 23359.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938333818 , 183.9191 , 69.596 , 0.544 , 3371.048
23+ 0.25 , none , 16 , 81018 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878051637 , 4223.9158 , 3.03 , 0.024 , 839.411
24+ 0.25 , none , 8 , 74462 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.877874975 , 2247.4716 , 5.695 , 0.044 , 788.798
25+ 0.25 , none , 4 , 63033 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.878253661 , 1155.5903 , 11.077 , 0.087 , 767.054
26+ 0.25 , moe-scattermoe-granite-ep1 , 16 , 81018 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878006854 , 907.8407 , 14.099 , 0.11 , 3905.531
27+ 0.25 , moe-scattermoe-granite-ep1 , 8 , 73870 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.879557709 , 492.5063 , 25.99 , 0.203 , 3599.548
28+ 0.25 , moe-scattermoe-granite-ep1 , 4 , 74108.5 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.881521969 , 277.8191 , 46.073 , 0.36 , 3190.565
29+ 0.25 , moe-scattermoe-granite-ep2 , 8 , 54168 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.877982622 , 563.0434 , 22.734 , 0.178 , 3148.603
30+ 0.25 , moe-scattermoe-granite-ep2 , 4 , 54582 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.880103117 , 299.2522 , 42.773 , 0.334 , 2962.05
31+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 16 , 77632 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878018975 , 726.1255 , 17.628 , 0.138 , 3410.98
32+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 8 , 68019 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.879643369 , 429.5618 , 29.798 , 0.233 , 2882.938
33+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 4 , 63879 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.88148216 , 239.3677 , 53.474 , 0.418 , 2586.815
34+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 16 , 72666 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878073001 , 688.38 , 18.594 , 0.145 , 3598.013
35+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 8 , 63074 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.879622684 , 419.7876 , 30.492 , 0.238 , 2950.063
36+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 4 , 60126.5 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.881447418 , 231.7976 , 55.221 , 0.431 , 2671.296
37+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 8 , 45093 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.8779908 , 471.1344 , 27.168 , 0.212 , 2628.549
38+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 4 , 42590 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.879999972 , 250.48 , 51.102 , 0.399 , 2472.054
39+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 8 , 40281 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.878110015 , 461.6668 , 27.726 , 0.217 , 2682.454
40+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 4 , 38934.5 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.880085612 , 250.2941 , 51.14 , 0.4 , 2473.889
41+ 0.25 , moe-scattermoe-granite-ep8 , 16 , 56845 , mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.86557077 , 779.9315 , 16.412 , 0.128 , 430.807
42+ 0.25 , moe-scattermoe-granite-ep8-foak , 16 , 56769.25 , mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.86551428 , 734.0756 , 17.437 , 0.136 , 457.719
0 commit comments