Skip to content

Commit 4821ce2

Browse files
committed
[Trainer] remove redundant memory metrics and set enable as default
1 parent 09a0ce7 commit 4821ce2

File tree

2 files changed: +23 −31 lines changed

paddlenlp/trainer/trainer.py

Lines changed: 22 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@
101101
SAFE_PEFT_WEIGHTS_INDEX_NAME,
102102
SAFE_WEIGHTS_INDEX_NAME,
103103
)
104-
from ..utils.import_utils import is_datasets_available, is_paddle_cuda_available
104+
from ..utils.import_utils import is_datasets_available
105105
from ..utils.log import logger
106106
from .argparser import strtobool
107107
from .integrations import get_reporting_integration_callbacks
@@ -1259,19 +1259,27 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
12591259
logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate()))
12601260
logs["global_step"] = int(self.state.global_step)
12611261

1262-
divisor = 2**30
1263-
# TODO(@gexiao): replace these codes with unified APIs in Paddle
1264-
current_device = framework._current_expected_place_()
1265-
if str(current_device) != "Place(cpu)":
1266-
device_id = current_device.get_device_id()
1267-
current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
1268-
current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
1269-
max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
1270-
max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
1271-
logs["current_memory_allocated"] = current_memory_allocated / divisor
1272-
logs["current_memory_reserved"] = current_memory_reserved / divisor
1273-
logs["max_memory_allocated"] = max_memory_allocated / divisor
1274-
logs["max_memory_reserved"] = max_memory_reserved / divisor
1262+
# Add additional memory in log.
1263+
if not self.args.skip_memory_metrics:
1264+
shift_bits_for_MB = 20
1265+
logs.update(
1266+
{
1267+
"cpu_mem_used": self._memory_tracker.cpu_mem_used() >> shift_bits_for_MB,
1268+
"cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> shift_bits_for_MB,
1269+
}
1270+
)
1271+
# TODO(@gexiao): replace these codes with unified APIs in Paddle
1272+
current_device = framework._current_expected_place_()
1273+
if str(current_device) != "Place(cpu)":
1274+
device_id = current_device.get_device_id()
1275+
current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
1276+
current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
1277+
max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
1278+
max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
1279+
logs["current_memory_allocated"] = current_memory_allocated >> shift_bits_for_MB
1280+
logs["current_memory_reserved"] = current_memory_reserved >> shift_bits_for_MB
1281+
logs["max_memory_allocated"] = max_memory_allocated >> shift_bits_for_MB
1282+
logs["max_memory_reserved"] = max_memory_reserved >> shift_bits_for_MB
12751283

12761284
total_train_batch_size = (
12771285
self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size
@@ -1294,22 +1302,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
12941302
self._globalstep_last_logged = self.state.global_step
12951303
self._globalstep_last_start_time = time.time()
12961304

1297-
# Add additional memory in log.
1298-
if not self.args.skip_memory_metrics:
1299-
logs.update(
1300-
{
1301-
"cpu_mem_used": self._memory_tracker.cpu_mem_used() >> 20,
1302-
"cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> 20,
1303-
}
1304-
)
1305-
if is_paddle_cuda_available():
1306-
logs.update(
1307-
{
1308-
"gpu_max_memory_allocated": paddle.device.cuda.max_memory_allocated() >> 20,
1309-
"gpu_max_memory_reserved": paddle.device.cuda.max_memory_reserved() >> 20,
1310-
}
1311-
)
1312-
13131305
self.log(logs, **kwargs)
13141306

13151307
metrics = None

scripts/distribute/ci_case_dy.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ function llm_gpt_recompute_bs32_bf16_MP2-SD4-stage1() {
451451
>>${log_path}/$FUNCNAME 2>&1
452452
loss=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
453453
ips=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
454-
mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'gpu_max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
454+
mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
455455
echo "result: loss=$loss ips=$ips mem=$mem"
456456
loss_base=8.93362617
457457
ips_base=64.75564390065037

0 commit comments

Comments (0)