paddlenlp/trainer/trainer.py (52 changes: 22 additions & 30 deletions)
@@ -101,7 +101,7 @@
     SAFE_PEFT_WEIGHTS_INDEX_NAME,
     SAFE_WEIGHTS_INDEX_NAME,
 )
-from ..utils.import_utils import is_datasets_available, is_paddle_cuda_available
+from ..utils.import_utils import is_datasets_available
 from ..utils.log import logger
 from .argparser import strtobool
 from .integrations import get_reporting_integration_callbacks
@@ -1259,19 +1259,27 @@
logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate()))
logs["global_step"] = int(self.state.global_step)

divisor = 2**30
# TODO(@gexiao): replace these codes with unified APIs in Paddle
current_device = framework._current_expected_place_()
if str(current_device) != "Place(cpu)":
device_id = current_device.get_device_id()
current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
logs["current_memory_allocated"] = current_memory_allocated / divisor
logs["current_memory_reserved"] = current_memory_reserved / divisor
logs["max_memory_allocated"] = max_memory_allocated / divisor
logs["max_memory_reserved"] = max_memory_reserved / divisor
# Add additional memory in log.
if not self.args.skip_memory_metrics:
shift_bits_for_MB = 20
logs.update(
{
"cpu_mem_used": self._memory_tracker.cpu_mem_used() >> shift_bits_for_MB,
"cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> shift_bits_for_MB,

+                    }
+                )
+                # TODO(@gexiao): replace these codes with unified APIs in Paddle
+                current_device = framework._current_expected_place_()
+                if str(current_device) != "Place(cpu)":
+                    device_id = current_device.get_device_id()
+                    current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
+                    current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
+                    max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
+                    max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
+                    logs["current_memory_allocated"] = current_memory_allocated >> shift_bits_for_MB
+                    logs["current_memory_reserved"] = current_memory_reserved >> shift_bits_for_MB
+                    logs["max_memory_allocated"] = max_memory_allocated >> shift_bits_for_MB
+                    logs["max_memory_reserved"] = max_memory_reserved >> shift_bits_for_MB

             total_train_batch_size = (
                 self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size
@@ -1294,22 +1302,6 @@
             self._globalstep_last_logged = self.state.global_step
             self._globalstep_last_start_time = time.time()

-            # Add additional memory in log.
-            if not self.args.skip_memory_metrics:
-                logs.update(
-                    {
-                        "cpu_mem_used": self._memory_tracker.cpu_mem_used() >> 20,
-                        "cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> 20,
-                    }
-                )
-                if is_paddle_cuda_available():
-                    logs.update(
-                        {
-                            "gpu_max_memory_allocated": paddle.device.cuda.max_memory_allocated() >> 20,
-                            "gpu_max_memory_reserved": paddle.device.cuda.max_memory_reserved() >> 20,
-                        }
-                    )

             self.log(logs, **kwargs)

         metrics = None
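
Note on the change above: the removed code divided byte counts by `2**30` to log fractional GiB, while the added code right-shifts by 20 bits, i.e. integer-divides by `2**20`, so every memory key (CPU and device alike) is logged as a whole number of MiB under the single `skip_memory_metrics` guard. A minimal sketch of that conversion, using a made-up byte count in place of a real `core.device_memory_stat_*` reading:

```python
# Sketch of the unit change; allocated_bytes is a hypothetical stat value.
SHIFT_BITS_FOR_MB = 20  # 2**20 bytes per MiB

allocated_bytes = 3_452_816_845  # hypothetical "Allocated" reading, in bytes

old_style = allocated_bytes / 2**30               # fractional GiB (removed code)
new_style = allocated_bytes >> SHIFT_BITS_FOR_MB  # whole MiB (added code)

# For non-negative integers, >> 20 is exactly integer division by 2**20.
assert new_style == allocated_bytes // 2**20

print(f"{old_style:.2f} GiB vs {new_style} MiB")  # 3.22 GiB vs 3292 MiB
```

The shift truncates toward zero, so sub-MiB remainders are dropped from the logged value.
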
scripts/distribute/ci_case_dy.sh (2 changes: 1 addition & 1 deletion)
@@ -451,7 +451,7 @@ function llm_gpt_recompute_bs32_bf16_MP2-SD4-stage1() {
         >>${log_path}/$FUNCNAME 2>&1
     loss=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
     ips=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
-    mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'gpu_max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+    mem=`cat $log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
     echo "result: loss=$loss ips=$ips mem=$mem"
     loss_base=8.93362617
     ips_base=64.75564390065037
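
Since trainer.py now emits `max_memory_reserved` without the `gpu_` prefix, the CI extraction pattern is renamed to match. As a quick sanity check of the new field parsing, here is a Python equivalent of the grep/awk pipeline, run against a made-up `workerlog.0` line (the numbers are illustrative, not real CI output):

```python
import re

# Hypothetical log line in the renamed format emitted by trainer.py.
line = (
    "global_step: 30, loss: 8.93362617, "
    "interval_samples_per_second: 64.7556, max_memory_reserved: 3292"
)

# Equivalent of:
#   grep 'global_step: 30' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'
if "global_step: 30" in line:
    match = re.search(r"max_memory_reserved: ([^,]*)", line)
    if match:
        print(match.group(1))  # -> 3292
```
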