101101 SAFE_PEFT_WEIGHTS_INDEX_NAME ,
102102 SAFE_WEIGHTS_INDEX_NAME ,
103103)
104- from ..utils .import_utils import is_datasets_available , is_paddle_cuda_available
104+ from ..utils .import_utils import is_datasets_available
105105from ..utils .log import logger
106106from .argparser import strtobool
107107from .integrations import get_reporting_integration_callbacks
@@ -1259,19 +1259,27 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
12591259 logs ["learning_rate" ] = float ("{0:.3e}" .format (self ._get_learning_rate ()))
12601260 logs ["global_step" ] = int (self .state .global_step )
12611261
1262- divisor = 2 ** 30
1263- # TODO(@gexiao): replace these codes with unified APIs in Paddle
1264- current_device = framework ._current_expected_place_ ()
1265- if str (current_device ) != "Place(cpu)" :
1266- device_id = current_device .get_device_id ()
1267- current_memory_allocated = core .device_memory_stat_current_value ("Allocated" , device_id )
1268- current_memory_reserved = core .device_memory_stat_current_value ("Reserved" , device_id )
1269- max_memory_allocated = core .device_memory_stat_peak_value ("Allocated" , device_id )
1270- max_memory_reserved = core .device_memory_stat_peak_value ("Reserved" , device_id )
1271- logs ["current_memory_allocated" ] = current_memory_allocated / divisor
1272- logs ["current_memory_reserved" ] = current_memory_reserved / divisor
1273- logs ["max_memory_allocated" ] = max_memory_allocated / divisor
1274- logs ["max_memory_reserved" ] = max_memory_reserved / divisor
1262+ # Add additional memory in log.
1263+ if not self .args .skip_memory_metrics :
1264+ shift_bits_for_MB = 20
1265+ logs .update (
1266+ {
1267+ "cpu_mem_used" : self ._memory_tracker .cpu_mem_used () >> shift_bits_for_MB ,
1268+ "cpu_mem_used_peak" : self ._memory_tracker .cpu_mem_used_peak >> shift_bits_for_MB ,
1269+ }
1270+ )
1271+ # TODO(@gexiao): replace these codes with unified APIs in Paddle
1272+ current_device = framework ._current_expected_place_ ()
1273+ if str (current_device ) != "Place(cpu)" :
1274+ device_id = current_device .get_device_id ()
1275+ current_memory_allocated = core .device_memory_stat_current_value ("Allocated" , device_id )
1276+ current_memory_reserved = core .device_memory_stat_current_value ("Reserved" , device_id )
1277+ max_memory_allocated = core .device_memory_stat_peak_value ("Allocated" , device_id )
1278+ max_memory_reserved = core .device_memory_stat_peak_value ("Reserved" , device_id )
1279+ logs ["current_memory_allocated" ] = current_memory_allocated >> shift_bits_for_MB
1280+ logs ["current_memory_reserved" ] = current_memory_reserved >> shift_bits_for_MB
1281+ logs ["max_memory_allocated" ] = max_memory_allocated >> shift_bits_for_MB
1282+ logs ["max_memory_reserved" ] = max_memory_reserved >> shift_bits_for_MB
12751283
12761284 total_train_batch_size = (
12771285 self .args .train_batch_size * self .args .gradient_accumulation_steps * self .args .dataset_world_size
@@ -1294,22 +1302,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
12941302 self ._globalstep_last_logged = self .state .global_step
12951303 self ._globalstep_last_start_time = time .time ()
12961304
1297- # Add additional memory in log.
1298- if not self .args .skip_memory_metrics :
1299- logs .update (
1300- {
1301- "cpu_mem_used" : self ._memory_tracker .cpu_mem_used () >> 20 ,
1302- "cpu_mem_used_peak" : self ._memory_tracker .cpu_mem_used_peak >> 20 ,
1303- }
1304- )
1305- if is_paddle_cuda_available ():
1306- logs .update (
1307- {
1308- "gpu_max_memory_allocated" : paddle .device .cuda .max_memory_allocated () >> 20 ,
1309- "gpu_max_memory_reserved" : paddle .device .cuda .max_memory_reserved () >> 20 ,
1310- }
1311- )
1312-
13131305 self .log (logs , ** kwargs )
13141306
13151307 metrics = None
0 commit comments