@@ -490,3 +490,47 @@ def test_benchmark_blockwise(dim1, dim2, gtype, optim_name):
     params = (k - k // 5) * dim1 * dim2
     print(optim_name, gtype, s / params)
     # assert s < 3.9
+
+dim1 = [10 * 1024]
+gtype = [torch.float16]
+# mode = ['torch', 'bnb']
+mode = ['bnb']
+optimizer_names = ['paged_adamw']
+# optimizer_names = ['paged_adamw8bit_blockwise']
+values = list(product(dim1, gtype, optimizer_names, mode))
+names = ['dim1_{0}_gtype_{1}_optim_{2}_mode_{3}'.format(*vals) for vals in values]
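+# Benchmark wall-clock time per optimizer step for the bnb (paged) optimizer vs. the plain torch optimizer, selected via `mode`.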
+@pytest.mark.parametrize("dim1, gtype, optim_name, mode", values, ids=names)
+def test_stream_optimizer_bench(dim1, gtype, optim_name, mode):
+    layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)]))
+    layers1 = layers1.to(gtype)
+    layers1 = layers1.cuda()
+
+    large_tensor = None
+    if mode == 'torch':
+        optim = str2optimizers[optim_name][0](layers1.parameters())
+    else:
+        optim = str2optimizers[optim_name][1](layers1.parameters())
+        # ~18 GB dummy allocation (4.5e9 fp32 elements) to create GPU memory pressure so paged optimizer state spills to CPU
+        large_tensor = torch.empty((int(4.5e9),), device='cuda')
+
+    torch.cuda.synchronize()
+    time.sleep(5)
+
+    num_batches = 5
+    batches = torch.randn(num_batches, 128, dim1, device='cuda').to(gtype)
+    lbls = torch.randint(0, 10, size=(num_batches, 128)).cuda()
+
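+    # the first two batches serve as warmup; timing starts at i == 2 and covers the remaining steps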
+    for i in range(num_batches):
+        print(i)
+        b = batches[i]
+        if i == 2:
+            torch.cuda.synchronize()
+            t0 = time.time()
+
+        out1 = layers1(b)
+
+        loss1 = torch.nn.functional.cross_entropy(out1, lbls[i]).mean()
+        loss1.backward()
+        optim.step()
+    torch.cuda.synchronize()
+    print(mode, time.time() - t0)
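
For context: the 'bnb' branch above obtains a paged optimizer through str2optimizers. A minimal standalone sketch of the same idea, assuming bitsandbytes >= 0.39 where the paged optimizer is exposed as bnb.optim.PagedAdamW (the class str2optimizers['paged_adamw'] presumably resolves to in this test file):

    import torch
    import bitsandbytes as bnb

    model = torch.nn.Linear(1024, 1024).cuda()
    # Paged optimizer: its state lives in paged (unified) memory and can be
    # evicted to CPU when GPU memory runs low, e.g. when a large tensor fills the device.
    optim = bnb.optim.PagedAdamW(model.parameters(), lr=1e-3)

    out = model(torch.randn(16, 1024, device='cuda'))
    out.mean().backward()
    optim.step()
    optim.zero_grad()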