@@ -327,9 +327,9 @@ def common_all_to_all_embedding_trainable_v2(self, base_opt, test_opt, name):
    shutil.rmtree(save_dir)
    hvd.join()  # Sync to avoid file conflicts
    # base_model.save(save_dir, options=save_options)
-   de.keras.models.de_hvd_save_model(base_model,
-                                     save_dir,
-                                     options=save_options)
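+   # de_save_model handles the Horovod multi-rank save itself, replacing the
+   # HVD-specific de_hvd_save_model helper.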
+   de.keras.models.de_save_model(base_model,
+                                 save_dir,
+                                 options=save_options)
    ckpt = de.train.DECheckpoint(
        my_model=base_model)  # Test custom model key "my_model"
    ckpt.save(save_dir + '/ckpt/test')
@@ -407,31 +407,38 @@ def call(self, x):
      return self.l2(out)

    def check_TFRADynamicEmbedding_directory(save_dir,
-                                            save_it,
+                                            save_it=None,
                                             should_be_exist=True):
      hvd_size = hvd.size()
      if hvd_size <= 1:
        hvd_size = 1
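+     # With save_it=None the helper checks the SavedModel layout under
+     # 'variables/'; with an iteration number it checks the
+     # 'TFRADynamicEmbedding-{save_it}' checkpoint layout.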
+     base_dir = os.path.join(save_dir, 'variables', 'TFRADynamicEmbedding')
+     if save_it is not None:
+       base_dir = os.path.join(save_dir, f'TFRADynamicEmbedding-{save_it}')
      for tag in ['keys', 'values']:
        for rank in range(hvd_size):
          self.assertTrue(not (os.path.exists(
-             save_dir +
-             f'/TFRADynamicEmbedding-{save_it}/{name}-parameter_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
-         ) ^ should_be_exist))
+             base_dir +
+             f'/{name}-parameter_mht_1of1_rank{rank}_size{hvd_size}-{tag}') ^
+                              should_be_exist))
          self.assertTrue(not (os.path.exists(
-             save_dir +
-             f'/TFRADynamicEmbedding-{save_it}/{name}-parameter_DynamicEmbedding_keras_adam_lazy_build-shadow_m_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
+             base_dir +
+             f'/{name}-parameter_DynamicEmbedding_keras_adam_lazy_build-shadow_m_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
          ) ^ should_be_exist))
-         # f'/TFRADynamicEmbedding-{save_it}/{name}-parameter_no_compile_model_DynamicEmbedding_keras_adam_lazy_build-shadow_m_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
+         # f'/{name}-parameter_no_compile_model_DynamicEmbedding_keras_adam_lazy_build-shadow_m_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
          self.assertTrue(not (os.path.exists(
-             save_dir +
-             f'/TFRADynamicEmbedding-{save_it}/{name}-parameter_DynamicEmbedding_keras_adam_lazy_build-shadow_v_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
+             base_dir +
+             f'/{name}-parameter_DynamicEmbedding_keras_adam_lazy_build-shadow_v_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
          ) ^ should_be_exist))
-         # f'/TFRADynamicEmbedding-{save_it}/{name}-parameter_no_compile_model_DynamicEmbedding_keras_adam_lazy_build-shadow_v_mht_1of1_rank{rank}_size{hvd_size}-{tag}'
+         # f'/{name}-parameter_no_compile_model_DynamicEmbedding_keras_adam_lazy_build-shadow_v_mht_1of1_rank{rank}_size{hvd_size}-{tag}'

    with tf.device("/{}:{}".format(_device, _device_id)):
      x = tf.reshape(tf.range(0, 32, dtype=tf.int64), [32, 1])
      y = tf.random.uniform(shape=[32, 1])
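+     # Baseline exports keyed by layer / optimizer-slot name, compared against
+     # the values recovered after each restore below.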
+     base_de_emb_standard = {}
+     base_de_opt_standard = {}
+     new_de_emb_compared = {}
+     new_de_opt_compared = {}

      save_dir = self.get_temp_dir()
@@ -454,13 +461,16 @@ def check_TFRADynamicEmbedding_directory(save_dir,
454
461
l .params .upsert (x * 10 , tf .random .uniform (shape = [32 , 1 , dim ]))
455
462
emb_size = l .params .size ()
456
463
emb_keys , emb_values = l .params .export ()
464
+ base_de_emb_standard [l .name ] = (emb_size , emb_keys , emb_values )
457
465
break
458
466
for v in base_opt .variables ():
459
467
if name in v .name :
460
468
v .params .upsert (x * 10 , tf .random .uniform (shape = [32 , 1 , dim ]))
461
469
opt_size = v .params .size ()
462
- opt_keys , opt_values = l .params .export ()
463
- break
470
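+         # Export from the optimizer slot variable v itself, and keep looping
+         # so every matching slot (e.g. Adam m and v) is recorded.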
+         opt_keys, opt_values = v.params.export()
+         base_de_opt_standard[v._shared_name.split('/')[-1]] = (opt_size,
+                                                                opt_keys,
+                                                                opt_values)
      manager.save()
      if hvd.rank() == 0:
        check_TFRADynamicEmbedding_directory(save_dir,
@@ -491,31 +501,102 @@ def check_TFRADynamicEmbedding_directory(save_dir,
      new_model.compile(optimizer=new_opt, loss='mean_absolute_error')
      new_model(x)  # Build variables
      try:
-       new_opt._create_all_weights(new_model.variables)
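+       # The fresh optimizer has no slot variables yet; create them explicitly
+       # so they can be restored and exported further down.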
+       new_opt._create_all_weights([
+           new_model.variables[0]
+       ])  # Create DE slot variable from DE shadow variable
      except:
        # TODO(MoFHejia) raise ValueError: Cannot convert a partially known TensorShape <unknown> to a Tensor.
        pass
      for l in new_model.layers:
        if name in l.name:
          new_emb_size = l.params.size()
          new_emb_keys, new_emb_values = l.params.export()
+         new_de_emb_compared[l.name] = (new_emb_size, new_emb_keys,
+                                        new_emb_values)
          break
      for v in new_opt.variables():
        if name in v.name:
          new_opt_size = v.params.size()
-         new_opt_keys, new_opt_values = l.params.export()
+         new_opt_keys, new_opt_values = v.params.export()
+         new_de_opt_compared[v._shared_name.split('/')[-1]] = (new_opt_size,
+                                                               new_opt_keys,
+                                                               new_opt_values)
+
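+     # Export order from the hash tables is unspecified, so keys/values are
+     # sorted before comparison; float values use assertAllClose.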
+     for de_l_name in base_de_emb_standard.keys():
+       self.assertEqual(base_de_emb_standard[de_l_name][0],
+                        new_de_emb_compared[de_l_name][0])
+       self.assertAllEqual(np.sort(base_de_emb_standard[de_l_name][1], axis=0),
+                           np.sort(new_de_emb_compared[de_l_name][1], axis=0))
+       self.assertAllClose(np.sort(base_de_emb_standard[de_l_name][2], axis=0),
+                           np.sort(new_de_emb_compared[de_l_name][2], axis=0))
+     for opt_v_name in base_de_opt_standard.keys():
+       self.assertEqual(base_de_opt_standard[opt_v_name][0],
+                        new_de_opt_compared[opt_v_name][0])
+       self.assertAllEqual(
+           np.sort(base_de_opt_standard[opt_v_name][1], axis=0),
+           np.sort(new_de_opt_compared[opt_v_name][1], axis=0))
+       self.assertAllClose(
+           np.sort(base_de_opt_standard[opt_v_name][2], axis=0),
+           np.sort(new_de_opt_compared[opt_v_name][2], axis=0))
+
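+     # Second round trip: save the restored model as a SavedModel, then reload
+     # it into a completely fresh model.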
+     extra_save_dir = self.get_temp_dir() + '/extra_save_dir'
+     de.keras.models.de_save_model(new_model, extra_save_dir)
+     if hvd.rank() == 0:
+       check_TFRADynamicEmbedding_directory(extra_save_dir)
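+     # Drop the restored objects and reset the Keras/TF graph state so the
+     # reload below starts from a clean session.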
+     del new_opt
+     del new_model
+     del new_ckpt
+     tf.keras.backend.clear_session()
+     tf.compat.v1.reset_default_graph()
+     new_saved_model = NoCompileModel('zeros')
+     new_saved_opt = Adam(1.2)
+     new_saved_opt = de.DynamicEmbeddingOptimizer(new_saved_opt,
+                                                  synchronous=True)
+     new_saved_model.compile(optimizer=new_saved_opt,
+                             loss='mean_absolute_error')
+     new_saved_model(x)  # Build variables
+     try:
+       new_saved_opt._create_all_weights([
+           new_saved_model.variables[0]
+       ])  # Create DE slot variable from DE shadow variable
+     except:
+       # TODO(MoFHejia) raise ValueError: Cannot convert a partially known TensorShape <unknown> to a Tensor.
+       pass
+     extra_save_dir = hvd.broadcast_object(
+         extra_save_dir, root_rank=0, name='de_utest_hvd_broadcast_filepath'
+     )  # All ranks should share the same save directory
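+     # 'variables/variables' is the checkpoint prefix inside a SavedModel
+     # directory.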
+     new_saved_model.load_weights(extra_save_dir + '/variables/variables')
+     for l in new_saved_model.layers:
+       if name in l.name:
+         new_emb_size = l.params.size()
+         new_emb_keys, new_emb_values = l.params.export()
+         new_de_emb_compared[l.name] = (new_emb_size, new_emb_keys,
+                                        new_emb_values)
          break
-
-     self.assertEqual(emb_size, new_emb_size)
-     self.assertEqual(opt_size, new_opt_size)
-     self.assertAllEqual(np.sort(emb_keys, axis=0),
-                         np.sort(new_emb_keys, axis=0))
-     self.assertAllClose(np.sort(emb_values, axis=0),
-                         np.sort(new_emb_values, axis=0))
-     self.assertAllEqual(np.sort(opt_keys, axis=0),
-                         np.sort(new_opt_keys, axis=0))
-     self.assertAllClose(np.sort(opt_values, axis=0),
-                         np.sort(new_opt_values, axis=0))
+     for v in new_saved_opt.variables():
+       if name in v.name:
+         new_opt_size = v.params.size()
+         new_opt_keys, new_opt_values = v.params.export()
+         new_de_opt_compared[v._shared_name.split('/')[-1]] = (new_opt_size,
+                                                               new_opt_keys,
+                                                               new_opt_values)
+
+     for de_l_name in base_de_emb_standard.keys():
+       self.assertEqual(base_de_emb_standard[de_l_name][0],
+                        new_de_emb_compared[de_l_name][0])
+       self.assertAllEqual(np.sort(base_de_emb_standard[de_l_name][1], axis=0),
+                           np.sort(new_de_emb_compared[de_l_name][1], axis=0))
+       self.assertAllClose(np.sort(base_de_emb_standard[de_l_name][2], axis=0),
+                           np.sort(new_de_emb_compared[de_l_name][2], axis=0))
+     for opt_v_name in base_de_opt_standard.keys():
+       self.assertEqual(base_de_opt_standard[opt_v_name][0],
+                        new_de_opt_compared[opt_v_name][0])
+       self.assertAllEqual(
+           np.sort(base_de_opt_standard[opt_v_name][1], axis=0),
+           np.sort(new_de_opt_compared[opt_v_name][1], axis=0))
+       self.assertAllClose(
+           np.sort(base_de_opt_standard[opt_v_name][2], axis=0),
+           np.sort(new_de_opt_compared[opt_v_name][2], axis=0))


if __name__ == "__main__":