From 0b0bebbd713052e01cd73062dff8c9f2d556e77b Mon Sep 17 00:00:00 2001 From: xuexixi Date: Thu, 31 Jul 2025 19:58:40 +0800 Subject: [PATCH 01/15] init ernie dynamic auto code(no moe op) --- ...list_ernie45turbo_tk_m100k_250321.txt.1000 | 1000 ++++++ examples/pre-training/conf/ratio_eb45t_0321 | 1176 +++++++ examples/pre-training/ernie/pretrain_auto.py | 459 +++ .../ernie/src/callbacks/__init__.py | 15 +- .../callbacks/adaptivegradclip_callback.py | 122 + .../src/callbacks/data_trace_callback.py | 251 ++ .../progressive_batching_callback.py | 70 + .../ernie/src/callbacks/stopper_callback.py | 29 + .../pre-training/ernie/src/clip/__init__.py | 6 +- examples/pre-training/ernie/src/clip/clip.py | 316 ++ .../ernie/src/datasets/__init__.py | 19 + .../ernie/src/datasets/dist_data_loader.py | 598 ++++ .../ernie/src/datasets/pretrain_task.py | 788 +++++ .../ernie/src/lr_schedulers/__init__.py | 4 +- .../ernie/src/lr_schedulers/cosine_lr.py | 62 + .../ernie/src/trainers/__init__.py | 9 +- .../src/trainers/pretraining_trainer_auto.py | 1774 ++++++++++ .../pre-training/ernie/src/utils/__init__.py | 9 +- .../ernie/src/utils/data_utils.py | 218 ++ .../ernie/src/utils/ipc_server.py | 265 ++ .../model_configs_auto/model_config.json | 66 + .../pre-training/models/aadiff_decorator.py | 63 + .../pre-training/models/ernie/__init__.py | 7 +- .../models/ernie/modeling_auto.py | 2939 +++++++++++++++++ .../models/ernie/modeling_auto_pp.py | 620 ++++ .../models/ernie_moe/configuration.py | 740 +++++ .../pre-training/models/fp8_linear_auto.py | 603 ++++ .../pre-training/models/moe/moe_layer_auto.py | 851 +++++ .../models/moe/moe_layer_auto_utils.py | 2087 ++++++++++++ examples/pre-training/models/moe/moe_utils.py | 229 ++ .../pre-training/models/moe/moe_utils_auto.py | 229 ++ .../pre-training/models/moe/top2_gate_auto.py | 77 + .../models/moe/top2_gate_auto_auto.py | 1135 +++++++ .../models/sequence_parallel_utils_auto.py | 910 +++++ .../pre-training/scripts/train_96_auto.sh | 118 + 
.../pre-training/yamls/pretrain_96_auto.yaml | 162 + 36 files changed, 18014 insertions(+), 12 deletions(-) create mode 100644 examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 create mode 100644 examples/pre-training/conf/ratio_eb45t_0321 create mode 100644 examples/pre-training/ernie/pretrain_auto.py create mode 100644 examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py create mode 100644 examples/pre-training/ernie/src/callbacks/data_trace_callback.py create mode 100644 examples/pre-training/ernie/src/callbacks/progressive_batching_callback.py create mode 100644 examples/pre-training/ernie/src/callbacks/stopper_callback.py create mode 100644 examples/pre-training/ernie/src/clip/clip.py create mode 100644 examples/pre-training/ernie/src/datasets/__init__.py create mode 100644 examples/pre-training/ernie/src/datasets/dist_data_loader.py create mode 100644 examples/pre-training/ernie/src/datasets/pretrain_task.py create mode 100644 examples/pre-training/ernie/src/lr_schedulers/cosine_lr.py create mode 100644 examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py create mode 100644 examples/pre-training/ernie/src/utils/data_utils.py create mode 100644 examples/pre-training/ernie/src/utils/ipc_server.py create mode 100644 examples/pre-training/model_configs_auto/model_config.json create mode 100644 examples/pre-training/models/aadiff_decorator.py create mode 100644 examples/pre-training/models/ernie/modeling_auto.py create mode 100644 examples/pre-training/models/ernie/modeling_auto_pp.py create mode 100644 examples/pre-training/models/ernie_moe/configuration.py create mode 100644 examples/pre-training/models/fp8_linear_auto.py create mode 100644 examples/pre-training/models/moe/moe_layer_auto.py create mode 100644 examples/pre-training/models/moe/moe_layer_auto_utils.py create mode 100644 examples/pre-training/models/moe/moe_utils.py create mode 100644 examples/pre-training/models/moe/moe_utils_auto.py create 
mode 100644 examples/pre-training/models/moe/top2_gate_auto.py create mode 100644 examples/pre-training/models/moe/top2_gate_auto_auto.py create mode 100644 examples/pre-training/models/sequence_parallel_utils_auto.py create mode 100644 examples/pre-training/scripts/train_96_auto.sh create mode 100644 examples/pre-training/yamls/pretrain_96_auto.yaml diff --git a/examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 b/examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 new file mode 100644 index 00000000..d3868f63 --- /dev/null +++ b/examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 @@ -0,0 +1,1000 @@ +2 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000000_0_0.h5 +2 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000000_0_0.h5 +3 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000001_0_0.h5 +3 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000001_0_0.h5 +4 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000002_0_0.h5 +4 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000002_0_0.h5 +5 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000003_0_0.h5 +5 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000003_0_0.h5 +6 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000004_0_0.h5 +6 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000004_0_0.h5 +7 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000005_0_0.h5 +7 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000005_0_0.h5 +8 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000006_0_0.h5 +8 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000006_0_0.h5 +9 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000007_0_0.h5 +9 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000007_0_0.h5 +10 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000008_0_0.h5 +10 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000008_0_0.h5 +11 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000009_0_0.h5 +11 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000009_0_0.h5 +12 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000000_0_0.h5 +12 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000000_0_0.h5 +13 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000001_0_0.h5 +13 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000001_0_0.h5 +14 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000002_0_0.h5 +14 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000002_0_0.h5 +15 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000003_0_0.h5 +15 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000003_0_0.h5 +16 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000004_0_0.h5 +16 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000004_0_0.h5 +17 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000005_0_0.h5 +17 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000005_0_0.h5 +18 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000006_0_0.h5 +18 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000006_0_0.h5 +19 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000007_0_0.h5 +19 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000007_0_0.h5 +20 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000008_0_0.h5 +20 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000008_0_0.h5 +21 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000009_0_0.h5 +21 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000009_0_0.h5 +22 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000000_0_0.h5 +22 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000000_0_0.h5 +23 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000001_0_0.h5 +23 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000001_0_0.h5 +24 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000002_0_0.h5 +24 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000002_0_0.h5 +25 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000003_0_0.h5 +25 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000003_0_0.h5 +26 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000004_0_0.h5 +26 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000004_0_0.h5 +27 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000005_0_0.h5 +27 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000005_0_0.h5 +28 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000006_0_0.h5 +28 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000006_0_0.h5 +29 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000007_0_0.h5 +29 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000007_0_0.h5 +30 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000008_0_0.h5 +30 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000008_0_0.h5 +31 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000009_0_0.h5 +31 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000009_0_0.h5 +32 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000000_0_0.h5 +32 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000000_0_0.h5 +33 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000001_0_0.h5 +33 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000001_0_0.h5 +34 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000002_0_0.h5 +34 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000002_0_0.h5 +35 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000003_0_0.h5 +35 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000003_0_0.h5 +36 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000004_0_0.h5 +36 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000004_0_0.h5 +37 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000005_0_0.h5 +37 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000005_0_0.h5 +38 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000006_0_0.h5 +38 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000006_0_0.h5 +39 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000007_0_0.h5 +39 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000007_0_0.h5 +40 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000008_0_0.h5 +40 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000008_0_0.h5 +41 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000009_0_0.h5 +41 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000009_0_0.h5 +42 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000000_0_0.h5 +42 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000000_0_0.h5 +43 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000001_0_0.h5 +43 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000001_0_0.h5 +44 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000002_0_0.h5 +44 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000002_0_0.h5 +45 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000003_0_0.h5 +45 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000003_0_0.h5 +46 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000004_0_0.h5 +46 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000004_0_0.h5 +47 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000005_0_0.h5 +47 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000005_0_0.h5 +48 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000006_0_0.h5 +48 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000006_0_0.h5 +49 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000007_0_0.h5 +49 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000007_0_0.h5 +50 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000008_0_0.h5 +50 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000008_0_0.h5 +51 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000009_0_0.h5 +51 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000009_0_0.h5 +52 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000000_0_0.h5 +52 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000000_0_0.h5 +53 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000001_0_0.h5 +53 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000001_0_0.h5 +54 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000002_0_0.h5 +54 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000002_0_0.h5 +55 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000003_0_0.h5 +55 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000003_0_0.h5 +56 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000004_0_0.h5 +56 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000004_0_0.h5 +57 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000005_0_0.h5 +57 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000005_0_0.h5 +58 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000006_0_0.h5 +58 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000006_0_0.h5 +59 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000007_0_0.h5 +59 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000007_0_0.h5 +60 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000008_0_0.h5 +60 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000008_0_0.h5 +61 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000009_0_0.h5 +61 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000009_0_0.h5 +62 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000000_0_0.h5 +62 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000000_0_0.h5 +63 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000001_0_0.h5 +63 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000001_0_0.h5 +64 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000002_0_0.h5 +64 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000002_0_0.h5 +65 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000003_0_0.h5 +65 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000003_0_0.h5 +66 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000004_0_0.h5 +66 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000004_0_0.h5 +67 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000005_0_0.h5 +67 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000005_0_0.h5 +68 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000006_0_0.h5 +68 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000006_0_0.h5 +69 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000007_0_0.h5 +69 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000007_0_0.h5 +70 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000008_0_0.h5 +70 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000008_0_0.h5 +71 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000009_0_0.h5 +71 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000009_0_0.h5 +72 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000000_0_0.h5 +72 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000000_0_0.h5 +73 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000001_0_0.h5 +73 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000001_0_0.h5 +74 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000002_0_0.h5 +74 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000002_0_0.h5 +75 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000003_0_0.h5 +75 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000003_0_0.h5 +76 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000004_0_0.h5 +76 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000004_0_0.h5 +77 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000005_0_0.h5 +77 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000005_0_0.h5 +78 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000006_0_0.h5 +78 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000006_0_0.h5 +79 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000007_0_0.h5 +79 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000007_0_0.h5 +80 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000008_0_0.h5 +80 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000008_0_0.h5 +81 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000009_0_0.h5 +81 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000009_0_0.h5 +82 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000000_0_0.h5 +82 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000000_0_0.h5 +83 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000001_0_0.h5 +83 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000001_0_0.h5 +84 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000002_0_0.h5 +84 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000002_0_0.h5 +85 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000003_0_0.h5 +85 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000003_0_0.h5 +86 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000004_0_0.h5 +86 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000004_0_0.h5 +87 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000005_0_0.h5 +87 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000005_0_0.h5 +88 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000006_0_0.h5 +88 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000006_0_0.h5 +89 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000007_0_0.h5 +89 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000007_0_0.h5 +90 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000008_0_0.h5 +90 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000008_0_0.h5 +91 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000009_0_0.h5 +91 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000009_0_0.h5 +92 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000000_0_0.h5 +92 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000000_0_0.h5 +93 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000001_0_0.h5 +93 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000001_0_0.h5 +94 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000002_0_0.h5 +94 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000002_0_0.h5 +95 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000003_0_0.h5 +95 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000003_0_0.h5 +96 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000004_0_0.h5 +96 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000004_0_0.h5 +97 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000005_0_0.h5 +97 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000005_0_0.h5 +98 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000006_0_0.h5 +98 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000006_0_0.h5 +99 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000007_0_0.h5 +99 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000007_0_0.h5 +100 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000008_0_0.h5 +100 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000008_0_0.h5 +101 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000009_0_0.h5 +101 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000009_0_0.h5 +102 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000000_0_0.h5 +102 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000000_0_0.h5 +103 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000001_0_0.h5 +103 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000001_0_0.h5 +104 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000002_0_0.h5 +104 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000002_0_0.h5 +105 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000003_0_0.h5 +105 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000003_0_0.h5 +106 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000004_0_0.h5 +106 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000004_0_0.h5 +107 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000005_0_0.h5 +107 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000005_0_0.h5 +108 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000006_0_0.h5 +108 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000006_0_0.h5 +109 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000007_0_0.h5 +109 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000007_0_0.h5 +110 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000008_0_0.h5 +110 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000008_0_0.h5 +111 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000009_0_0.h5 +111 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000009_0_0.h5 +112 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000000_0_0.h5 +112 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000000_0_0.h5 +113 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000001_0_0.h5 +113 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000001_0_0.h5 +114 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000002_0_0.h5 +114 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000002_0_0.h5 +115 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000003_0_0.h5 +115 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000003_0_0.h5 +116 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000004_0_0.h5 +116 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000004_0_0.h5 +117 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000005_0_0.h5 +117 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000005_0_0.h5 +118 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000006_0_0.h5 +118 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000006_0_0.h5 +119 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000007_0_0.h5 +119 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000007_0_0.h5 +120 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000008_0_0.h5 +120 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000008_0_0.h5 +121 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000009_0_0.h5 +121 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000009_0_0.h5 +122 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000000_0_0.h5 +122 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000000_0_0.h5 +123 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000001_0_0.h5 +123 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000001_0_0.h5 +124 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000002_0_0.h5 +124 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000002_0_0.h5 +125 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000003_0_0.h5 +125 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000003_0_0.h5 +126 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000004_0_0.h5 +126 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000004_0_0.h5 +127 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000005_0_0.h5 +127 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000005_0_0.h5 +128 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000006_0_0.h5 +128 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000006_0_0.h5 +129 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000007_0_0.h5 +129 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000007_0_0.h5 +130 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000008_0_0.h5 +130 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000008_0_0.h5 +131 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000009_0_0.h5 +131 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000009_0_0.h5 +132 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000000_0_0.h5 +132 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000000_0_0.h5 +133 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000001_0_0.h5 +133 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000001_0_0.h5 +134 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000002_0_0.h5 +134 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000002_0_0.h5 +135 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000003_0_0.h5 +135 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000003_0_0.h5 +136 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000004_0_0.h5 +136 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000004_0_0.h5 +137 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000005_0_0.h5 +137 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000005_0_0.h5 +138 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000006_0_0.h5 +138 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000006_0_0.h5 +139 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000007_0_0.h5 +139 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000007_0_0.h5 +140 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000008_0_0.h5 +140 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000008_0_0.h5 +141 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000009_0_0.h5 +141 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000009_0_0.h5 +142 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000000_0_0.h5 +142 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000000_0_0.h5 +143 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000001_0_0.h5 +143 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000001_0_0.h5 +144 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000002_0_0.h5 +144 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000002_0_0.h5 +145 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000003_0_0.h5 +145 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000003_0_0.h5 +146 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000004_0_0.h5 +146 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000004_0_0.h5 +147 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000005_0_0.h5 +147 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000005_0_0.h5 +148 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000006_0_0.h5 +148 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000006_0_0.h5 +149 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000007_0_0.h5 +149 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000007_0_0.h5 +150 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000008_0_0.h5 +150 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000008_0_0.h5 +151 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000009_0_0.h5 +151 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000009_0_0.h5 +152 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000000_0_0.h5 +152 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000000_0_0.h5 +153 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000001_0_0.h5 +153 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000001_0_0.h5 +154 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000002_0_0.h5 +154 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000002_0_0.h5 +155 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000003_0_0.h5 +155 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000003_0_0.h5 +156 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000004_0_0.h5 +156 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000004_0_0.h5 +157 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000005_0_0.h5 +157 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000005_0_0.h5 +158 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000006_0_0.h5 +158 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000006_0_0.h5 +159 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000007_0_0.h5 +159 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000007_0_0.h5 +160 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000008_0_0.h5 +160 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000008_0_0.h5 +161 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000009_0_0.h5 +161 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000009_0_0.h5 +162 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000000_0_0.h5 +162 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000000_0_0.h5 +163 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000001_0_0.h5 +163 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000001_0_0.h5 +164 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000002_0_0.h5 +164 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000002_0_0.h5 +165 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000003_0_0.h5 +165 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000003_0_0.h5 +166 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000004_0_0.h5 +166 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000004_0_0.h5 +167 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000005_0_0.h5 +167 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000005_0_0.h5 +168 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000006_0_0.h5 +168 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000006_0_0.h5 +169 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000007_0_0.h5 +169 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000007_0_0.h5 +170 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000008_0_0.h5 +170 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000008_0_0.h5 +171 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000009_0_0.h5 +171 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000009_0_0.h5 +172 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000000_0_0.h5 +172 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000000_0_0.h5 +173 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000001_0_0.h5 +173 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000001_0_0.h5 +174 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000002_0_0.h5 +174 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000002_0_0.h5 +175 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000003_0_0.h5 +175 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000003_0_0.h5 +176 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000004_0_0.h5 +176 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000004_0_0.h5 +177 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000005_0_0.h5 +177 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000005_0_0.h5 +178 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000006_0_0.h5 +178 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000006_0_0.h5 +179 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000007_0_0.h5 +179 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000007_0_0.h5 +180 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000008_0_0.h5 +180 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000008_0_0.h5 +181 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000009_0_0.h5 +181 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000009_0_0.h5 +182 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000000_0_0.h5 +182 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000000_0_0.h5 +183 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000001_0_0.h5 +183 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000001_0_0.h5 +184 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000002_0_0.h5 +184 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000002_0_0.h5 +185 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000003_0_0.h5 +185 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000003_0_0.h5 +186 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000004_0_0.h5 +186 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000004_0_0.h5 +187 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000005_0_0.h5 +187 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000005_0_0.h5 +188 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000006_0_0.h5 +188 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000006_0_0.h5 +189 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000007_0_0.h5 +189 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000007_0_0.h5 +190 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000008_0_0.h5 +190 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000008_0_0.h5 +191 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000009_0_0.h5 +191 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000009_0_0.h5 +192 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000000_0_0.h5 +192 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000000_0_0.h5 +193 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000001_0_0.h5 +193 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000001_0_0.h5 +194 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000002_0_0.h5 +194 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000002_0_0.h5 +195 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000003_0_0.h5 +195 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000003_0_0.h5 +196 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000004_0_0.h5 +196 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000004_0_0.h5 +197 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000005_0_0.h5 +197 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000005_0_0.h5 +198 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000006_0_0.h5 +198 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000006_0_0.h5 +199 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000007_0_0.h5 +199 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000007_0_0.h5 +200 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000008_0_0.h5 +200 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000008_0_0.h5 +201 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000009_0_0.h5 +201 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000009_0_0.h5 +202 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000000_0_0.h5 +202 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000000_0_0.h5 +203 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000001_0_0.h5 +203 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000001_0_0.h5 +204 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000002_0_0.h5 +204 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000002_0_0.h5 +205 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000003_0_0.h5 +205 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000003_0_0.h5 +206 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000004_0_0.h5 +206 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000004_0_0.h5 +207 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000005_0_0.h5 +207 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000005_0_0.h5 +208 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000006_0_0.h5 +208 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000006_0_0.h5 +209 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000007_0_0.h5 +209 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000007_0_0.h5 +210 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000008_0_0.h5 +210 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000008_0_0.h5 +211 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000009_0_0.h5 +211 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000009_0_0.h5 +212 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000000_0_0.h5 +212 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000000_0_0.h5 +213 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000001_0_0.h5 +213 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000001_0_0.h5 +214 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000002_0_0.h5 +214 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000002_0_0.h5 +215 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000003_0_0.h5 +215 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000003_0_0.h5 +216 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000004_0_0.h5 +216 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000004_0_0.h5 +217 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000005_0_0.h5 +217 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000005_0_0.h5 +218 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000006_0_0.h5 +218 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000006_0_0.h5 +219 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000007_0_0.h5 +219 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000007_0_0.h5 +220 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000008_0_0.h5 +220 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000008_0_0.h5 +221 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000009_0_0.h5 +221 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000009_0_0.h5 +222 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000000_0_0.h5 +222 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000000_0_0.h5 +223 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000001_0_0.h5 +223 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000001_0_0.h5 +224 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000002_0_0.h5 +224 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000002_0_0.h5 +225 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000003_0_0.h5 +225 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000003_0_0.h5 +226 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000004_0_0.h5 +226 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000004_0_0.h5 +227 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000005_0_0.h5 +227 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000005_0_0.h5 +228 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000006_0_0.h5 +228 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000006_0_0.h5 +229 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000007_0_0.h5 +229 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000007_0_0.h5 +230 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000008_0_0.h5 +230 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000008_0_0.h5 +231 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000009_0_0.h5 +231 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000009_0_0.h5 +232 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000000_0_0.h5 +232 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000000_0_0.h5 +233 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000001_0_0.h5 +233 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000001_0_0.h5 +234 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000002_0_0.h5 +234 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000002_0_0.h5 +235 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000003_0_0.h5 +235 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000003_0_0.h5 +236 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000004_0_0.h5 +236 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000004_0_0.h5 +237 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000005_0_0.h5 +237 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000005_0_0.h5 +238 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000006_0_0.h5 +238 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000006_0_0.h5 +239 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000007_0_0.h5 +239 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000007_0_0.h5 +240 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000008_0_0.h5 +240 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000008_0_0.h5 +241 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000009_0_0.h5 +241 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000009_0_0.h5 +242 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000000_0_0.h5 +242 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000000_0_0.h5 +243 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000001_0_0.h5 +243 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000001_0_0.h5 +244 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000002_0_0.h5 +244 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000002_0_0.h5 +245 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000003_0_0.h5 +245 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000003_0_0.h5 +246 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000004_0_0.h5 +246 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000004_0_0.h5 +247 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000005_0_0.h5 +247 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000005_0_0.h5 +248 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000006_0_0.h5 +248 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000006_0_0.h5 +249 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000007_0_0.h5 +249 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000007_0_0.h5 +250 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000008_0_0.h5 +250 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000008_0_0.h5 +251 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000009_0_0.h5 +251 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000009_0_0.h5 +252 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000000_0_0.h5 +252 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000000_0_0.h5 +253 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000001_0_0.h5 +253 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000001_0_0.h5 +254 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000002_0_0.h5 +254 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000002_0_0.h5 +255 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000003_0_0.h5 +255 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000003_0_0.h5 +256 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000004_0_0.h5 +256 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000004_0_0.h5 +257 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000005_0_0.h5 +257 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000005_0_0.h5 +258 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000006_0_0.h5 +258 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000006_0_0.h5 +259 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000007_0_0.h5 +259 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000007_0_0.h5 +260 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000008_0_0.h5 +260 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000008_0_0.h5 +261 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000009_0_0.h5 +261 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000009_0_0.h5 +262 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000000_0_0.h5 +262 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000000_0_0.h5 +263 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000001_0_0.h5 +263 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000001_0_0.h5 +264 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000002_0_0.h5 +264 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000002_0_0.h5 +265 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000003_0_0.h5 +265 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000003_0_0.h5 +266 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000004_0_0.h5 +266 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000004_0_0.h5 +267 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000005_0_0.h5 +267 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000005_0_0.h5 +268 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000006_0_0.h5 +268 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000006_0_0.h5 +269 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000007_0_0.h5 +269 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000007_0_0.h5 +270 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000008_0_0.h5 +270 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000008_0_0.h5 +271 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000009_0_0.h5 +271 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000009_0_0.h5 +272 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000010_0_0.h5 +272 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000010_0_0.h5 +273 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000011_0_0.h5 +273 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000011_0_0.h5 +274 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000012_0_0.h5 +274 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000012_0_0.h5 +275 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000013_0_0.h5 +275 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000013_0_0.h5 +276 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000014_0_0.h5 +276 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000014_0_0.h5 +277 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000015_0_0.h5 +277 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000015_0_0.h5 +278 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000016_0_0.h5 +278 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000016_0_0.h5 +279 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000017_0_0.h5 +279 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000017_0_0.h5 +280 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000018_0_0.h5 +280 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000018_0_0.h5 +281 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000019_0_0.h5 +281 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000019_0_0.h5 +282 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000020_0_0.h5 +282 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000020_0_0.h5 +283 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000021_0_0.h5 +283 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000021_0_0.h5 +284 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000022_0_0.h5 +284 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000022_0_0.h5 +285 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000023_0_0.h5 +285 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000023_0_0.h5 +286 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000024_0_0.h5 +286 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000024_0_0.h5 +287 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000025_0_0.h5 +287 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000025_0_0.h5 +288 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000026_0_0.h5 +288 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000026_0_0.h5 +289 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000027_0_0.h5 +289 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000027_0_0.h5 +290 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000028_0_0.h5 +290 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000028_0_0.h5 +291 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000029_0_0.h5 +291 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000029_0_0.h5 +292 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000030_0_0.h5 +292 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000030_0_0.h5 +293 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000031_0_0.h5 +293 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000031_0_0.h5 +294 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000032_0_0.h5 +294 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000032_0_0.h5 +295 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000033_0_0.h5 +295 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000033_0_0.h5 +296 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000034_0_0.h5 +296 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000034_0_0.h5 +297 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000035_0_0.h5 +297 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000035_0_0.h5 +298 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000036_0_0.h5 +298 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000036_0_0.h5 +299 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000037_0_0.h5 +299 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000037_0_0.h5 +300 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000038_0_0.h5 +300 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000038_0_0.h5 +301 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000039_0_0.h5 +301 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000039_0_0.h5 +302 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000040_0_0.h5 +302 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000040_0_0.h5 +303 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000041_0_0.h5 +303 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000041_0_0.h5 +304 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000042_0_0.h5 +304 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000042_0_0.h5 +305 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000043_0_0.h5 +305 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000043_0_0.h5 +306 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000044_0_0.h5 +306 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000044_0_0.h5 +307 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000045_0_0.h5 +307 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000045_0_0.h5 +308 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000046_0_0.h5 +308 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000046_0_0.h5 +309 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000047_0_0.h5 +309 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000047_0_0.h5 +310 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000048_0_0.h5 +310 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000048_0_0.h5 +311 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000049_0_0.h5 +311 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000049_0_0.h5 +312 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000050_0_0.h5 +312 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000050_0_0.h5 +313 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000051_0_0.h5 +313 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000051_0_0.h5 +314 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000052_0_0.h5 +314 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000052_0_0.h5 +315 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000053_0_0.h5 +315 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000053_0_0.h5 +316 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000054_0_0.h5 +316 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000054_0_0.h5 +317 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000055_0_0.h5 +317 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000055_0_0.h5 +318 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000056_0_0.h5 +318 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000056_0_0.h5 +319 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000057_0_0.h5 +319 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000057_0_0.h5 +320 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000058_0_0.h5 +320 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000058_0_0.h5 +321 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000059_0_0.h5 +321 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000059_0_0.h5 +322 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000060_0_0.h5 +322 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000060_0_0.h5 +323 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000000_0_0.h5 +323 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000000_0_0.h5 +324 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000001_0_0.h5 +324 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000001_0_0.h5 +325 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000002_0_0.h5 +325 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000002_0_0.h5 +326 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000003_0_0.h5 +326 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000003_0_0.h5 +327 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000004_0_0.h5 +327 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000004_0_0.h5 +328 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000005_0_0.h5 +328 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000005_0_0.h5 +329 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000006_0_0.h5 +329 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000006_0_0.h5 +330 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000007_0_0.h5 +330 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000007_0_0.h5 +331 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000008_0_0.h5 +331 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000008_0_0.h5 +332 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000009_0_0.h5 +332 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000009_0_0.h5 +333 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000000_0_0.h5 +334 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000001_0_0.h5 +335 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000002_0_0.h5 +336 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000003_0_0.h5 +337 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000004_0_0.h5 +338 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000005_0_0.h5 +339 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000006_0_0.h5 +340 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000007_0_0.h5 +341 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000008_0_0.h5 +342 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000009_0_0.h5 +343 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000010_0_0.h5 +344 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000011_0_0.h5 +345 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000012_0_0.h5 +346 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000013_0_0.h5 +347 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000014_0_0.h5 +348 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000015_0_0.h5 +349 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000016_0_0.h5 +350 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000017_0_0.h5 +351 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000018_0_0.h5 +352 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000019_0_0.h5 +353 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000020_0_0.h5 +354 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000021_0_0.h5 +355 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000022_0_0.h5 +356 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000023_0_0.h5 +357 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000024_0_0.h5 +358 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000025_0_0.h5 +359 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000026_0_0.h5 +360 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000027_0_0.h5 +361 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000028_0_0.h5 +362 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000029_0_0.h5 +363 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000030_0_0.h5 +364 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000031_0_0.h5 +365 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000032_0_0.h5 +366 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000033_0_0.h5 +367 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000034_0_0.h5 +368 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000035_0_0.h5 +369 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000036_0_0.h5 +370 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000037_0_0.h5 +371 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000038_0_0.h5 +372 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000039_0_0.h5 +373 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000040_0_0.h5 +374 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000041_0_0.h5 +375 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000042_0_0.h5 +376 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000043_0_0.h5 +377 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000044_0_0.h5 +378 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000045_0_0.h5 +379 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000046_0_0.h5 +380 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000047_0_0.h5 +381 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000048_0_0.h5 +382 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000049_0_0.h5 +383 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000050_0_0.h5 +384 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000051_0_0.h5 +385 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000052_0_0.h5 +386 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000053_0_0.h5 +387 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000054_0_0.h5 +388 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000055_0_0.h5 +389 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000056_0_0.h5 +390 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000057_0_0.h5 +391 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000058_0_0.h5 +392 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000059_0_0.h5 +393 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000060_0_0.h5 +394 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000061_0_0.h5 +395 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000062_0_0.h5 +396 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000063_0_0.h5 +397 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000064_0_0.h5 +398 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000065_0_0.h5 +399 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000066_0_0.h5 +400 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000067_0_0.h5 +401 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000068_0_0.h5 +402 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000069_0_0.h5 +403 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000070_0_0.h5 +404 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000071_0_0.h5 +405 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000072_0_0.h5 +406 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000073_0_0.h5 +407 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000074_0_0.h5 +408 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000075_0_0.h5 +409 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000076_0_0.h5 +410 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000077_0_0.h5 +411 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000078_0_0.h5 +412 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000079_0_0.h5 +413 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000080_0_0.h5 +414 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000081_0_0.h5 +415 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000082_0_0.h5 +416 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000083_0_0.h5 +417 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000084_0_0.h5 +418 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000085_0_0.h5 +419 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000086_0_0.h5 +420 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000087_0_0.h5 +421 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000088_0_0.h5 +422 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000089_0_0.h5 +423 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000090_0_0.h5 +424 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000091_0_0.h5 +425 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000092_0_0.h5 +426 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000093_0_0.h5 +427 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000094_0_0.h5 +428 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000095_0_0.h5 +429 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000096_0_0.h5 +430 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000097_0_0.h5 +431 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000098_0_0.h5 +432 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000099_0_0.h5 +433 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000100_0_0.h5 +434 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000101_0_0.h5 +435 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000102_0_0.h5 +436 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000103_0_0.h5 +437 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000104_0_0.h5 +438 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000105_0_0.h5 +439 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000106_0_0.h5 +440 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000107_0_0.h5 +441 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000108_0_0.h5 +442 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000109_0_0.h5 +443 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000110_0_0.h5 +444 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000111_0_0.h5 +445 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000112_0_0.h5 +446 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000113_0_0.h5 +447 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000114_0_0.h5 +448 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000115_0_0.h5 +449 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000116_0_0.h5 +450 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000117_0_0.h5 +451 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000118_0_0.h5 +452 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000119_0_0.h5 +453 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000120_0_0.h5 +454 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000121_0_0.h5 +455 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000122_0_0.h5 +456 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000123_0_0.h5 +457 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000124_0_0.h5 +458 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000125_0_0.h5 +459 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000126_0_0.h5 +460 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000127_0_0.h5 +461 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000128_0_0.h5 +462 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000129_0_0.h5 +463 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000130_0_0.h5 +464 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000131_0_0.h5 +465 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000132_0_0.h5 +466 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000133_0_0.h5 +467 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000134_0_0.h5 +468 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000135_0_0.h5 +469 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000136_0_0.h5 +470 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000137_0_0.h5 +471 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000138_0_0.h5 +472 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000139_0_0.h5 +473 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000140_0_0.h5 +474 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000141_0_0.h5 +475 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000142_0_0.h5 +476 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000143_0_0.h5 +477 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000144_0_0.h5 +478 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000145_0_0.h5 +479 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000146_0_0.h5 +480 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000147_0_0.h5 +481 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000148_0_0.h5 +482 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000149_0_0.h5 +483 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000150_0_0.h5 +484 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000151_0_0.h5 +485 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000152_0_0.h5 +486 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000153_0_0.h5 +487 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000154_0_0.h5 +488 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000155_0_0.h5 +489 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000156_0_0.h5 +490 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000157_0_0.h5 +491 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000158_0_0.h5 +492 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000159_0_0.h5 +493 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000160_0_0.h5 +494 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000161_0_0.h5 +495 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000162_0_0.h5 +496 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000163_0_0.h5 +497 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000164_0_0.h5 +498 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000165_0_0.h5 +499 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000166_0_0.h5 +500 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000167_0_0.h5 +501 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000168_0_0.h5 +502 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000169_0_0.h5 +503 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000170_0_0.h5 +504 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000171_0_0.h5 +505 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000172_0_0.h5 +506 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000173_0_0.h5 +507 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000174_0_0.h5 +508 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000175_0_0.h5 +509 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000176_0_0.h5 +510 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000177_0_0.h5 +511 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000178_0_0.h5 +512 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000179_0_0.h5 +513 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000180_0_0.h5 +514 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000181_0_0.h5 +515 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000182_0_0.h5 +516 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000183_0_0.h5 +517 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000184_0_0.h5 +518 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000185_0_0.h5 +519 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000186_0_0.h5 +520 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000187_0_0.h5 +521 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000188_0_0.h5 +522 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000189_0_0.h5 +523 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000190_0_0.h5 +524 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000191_0_0.h5 +525 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000192_0_0.h5 +526 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000193_0_0.h5 +527 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000194_0_0.h5 +528 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000195_0_0.h5 +529 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000196_0_0.h5 +530 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000197_0_0.h5 +531 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000198_0_0.h5 +532 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000199_0_0.h5 +533 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000200_0_0.h5 +534 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000201_0_0.h5 +535 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000202_0_0.h5 +536 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000203_0_0.h5 +537 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000204_0_0.h5 +538 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000205_0_0.h5 +539 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000206_0_0.h5 +540 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000207_0_0.h5 +541 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000208_0_0.h5 +542 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000209_0_0.h5 +543 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000210_0_0.h5 +544 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000211_0_0.h5 +545 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000212_0_0.h5 +546 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000213_0_0.h5 +547 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000214_0_0.h5 +548 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000215_0_0.h5 +549 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000216_0_0.h5 +550 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000217_0_0.h5 +551 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000218_0_0.h5 +552 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000219_0_0.h5 +553 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000220_0_0.h5 +554 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000221_0_0.h5 +555 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000222_0_0.h5 +556 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000223_0_0.h5 +557 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000224_0_0.h5 +558 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000225_0_0.h5 +559 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000226_0_0.h5 +560 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000227_0_0.h5 +561 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000228_0_0.h5 +562 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000229_0_0.h5 +563 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000230_0_0.h5 +564 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000231_0_0.h5 +565 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000232_0_0.h5 +566 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000233_0_0.h5 +567 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000234_0_0.h5 +568 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000235_0_0.h5 +569 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000236_0_0.h5 +570 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000237_0_0.h5 +571 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000238_0_0.h5 +572 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000239_0_0.h5 +573 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000240_0_0.h5 +574 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000241_0_0.h5 +575 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000242_0_0.h5 +576 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000243_0_0.h5 +577 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000244_0_0.h5 +578 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000245_0_0.h5 +579 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000246_0_0.h5 +580 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000247_0_0.h5 +581 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000248_0_0.h5 +582 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000249_0_0.h5 +583 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000250_0_0.h5 +584 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000251_0_0.h5 +585 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000252_0_0.h5 +586 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000253_0_0.h5 +587 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000254_0_0.h5 +588 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000255_0_0.h5 +589 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000256_0_0.h5 +590 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000257_0_0.h5 +591 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000258_0_0.h5 +592 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000259_0_0.h5 +593 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000260_0_0.h5 +594 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000261_0_0.h5 +595 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000262_0_0.h5 +596 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000263_0_0.h5 +597 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000264_0_0.h5 +598 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000265_0_0.h5 +599 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000266_0_0.h5 +600 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000267_0_0.h5 +601 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000268_0_0.h5 +602 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000269_0_0.h5 +603 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000270_0_0.h5 +604 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000271_0_0.h5 +605 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000272_0_0.h5 +606 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000273_0_0.h5 +607 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000274_0_0.h5 +608 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000275_0_0.h5 +609 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000276_0_0.h5 +610 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000277_0_0.h5 +611 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000278_0_0.h5 +612 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000279_0_0.h5 +613 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000280_0_0.h5 +614 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000281_0_0.h5 +615 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000282_0_0.h5 +616 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000283_0_0.h5 +617 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000284_0_0.h5 +618 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000285_0_0.h5 +619 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000286_0_0.h5 +620 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000287_0_0.h5 +621 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000288_0_0.h5 +622 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000289_0_0.h5 +623 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000290_0_0.h5 +624 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000291_0_0.h5 +625 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000292_0_0.h5 +626 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000293_0_0.h5 +627 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000294_0_0.h5 +628 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000295_0_0.h5 +629 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000296_0_0.h5 +630 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000297_0_0.h5 +631 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000298_0_0.h5 +632 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000299_0_0.h5 +633 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000300_0_0.h5 +634 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000301_0_0.h5 +635 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000302_0_0.h5 +636 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000303_0_0.h5 +637 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000304_0_0.h5 +638 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000305_0_0.h5 +639 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000306_0_0.h5 +640 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000307_0_0.h5 +641 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000308_0_0.h5 +642 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000309_0_0.h5 +643 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000310_0_0.h5 +644 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000311_0_0.h5 +645 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000312_0_0.h5 +646 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000313_0_0.h5 +647 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000314_0_0.h5 +648 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000315_0_0.h5 +649 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000316_0_0.h5 +650 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000317_0_0.h5 +651 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000318_0_0.h5 +652 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000319_0_0.h5 +653 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000320_0_0.h5 +654 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000321_0_0.h5 +655 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000322_0_0.h5 +656 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000323_0_0.h5 +657 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000324_0_0.h5 +658 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000325_0_0.h5 +659 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000326_0_0.h5 +660 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000327_0_0.h5 +661 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000328_0_0.h5 +662 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000329_0_0.h5 +663 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000330_0_0.h5 +664 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000331_0_0.h5 +665 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000332_0_0.h5 +666 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000333_0_0.h5 +667 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000334_0_0.h5 +668 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000335_0_0.h5 +669 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000336_0_0.h5 +670 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000337_0_0.h5 diff --git a/examples/pre-training/conf/ratio_eb45t_0321 b/examples/pre-training/conf/ratio_eb45t_0321 new file mode 100644 index 00000000..e1f9c4bb --- /dev/null +++ b/examples/pre-training/conf/ratio_eb45t_0321 @@ -0,0 +1,1176 @@ +/300332/ 0.004202218 10 英文-网页-Chegg 数据集(一期、二期) -- 纯文部分 +/300018/ 0.000426423 10 英文-问答-Chain-of-thought数据集 +/300031/ 0.000672956 10 英文(推理)-其他-专项预训练数据(推理cot数据) +/300079/ 0.004154126 10 英文(推理)-试题-AMPS数学题 +/300268/ 0.011664832 10 英文-问答-mathpile问答数据 +/301010/ 0.000621846 10 英文(推理)-试题-AMPS数学题-COT QA数据-英文 +/400048/ 0.000495729 10 中文-书籍-中小学k12教材_纯文本_P0 +/400073/ 0.000051698 10 中文-书籍-15本逻辑学cot数据 +/400079/ 0.000741630 10 中文(推理)-问答-专项预训练数据(推理cot数据) +/400384/ 0.017233671 10 中文-书籍-十二五高等教育教科书(纯文) +/400500/ 0.001329830 10 中文-书籍-35本大学数学教材与习题-理科专项(文本) +/400501/ 0.001060006 10 中文-书籍-92本各学科大学教材-理科专项提供书单(文本) +/400636/ 0.001132464 10 中文-合成-推理基础能力-合成COT数据(问答) +/400637/ 0.010130296 10 中文-合成-推理相关试题-合成COT数据 +/400640/ 0.038171000 10 中文-合成-合成数据_V34(百度教育) +/400641/ 0.030401655 10 中文-合成-K12百度教育题库&职业考试(法律、医疗)new +/400643/ 0.001701378 10 中文-合成-裁判猜谜合成数据 +/401003/ 0.000070062 10 中文-试题-公务员(网页) +/401004/ 0.000177366 10 中文(推理)-试题-公务员(sft) +/401008/ 0.024912935 10 中文(推理)-试题-数学专项思维链 +/401018/ 0.004168075 10 中文-试题-全品题舟初中试题 +/401019/ 0.002761711 10 中文-试题-组卷网高中试题 +/401020/ 0.002670994 10 中文(推理)-试题-全品题舟小学试题+组卷网初中试题 +/401025/ 0.030762161 10 中文-试题-百度教育253W处理数据 +/401027/ 0.005162020 10 中文-试题-全品题舟高中试题batch1 +/401036/ 0.001720236 10 
中文-试题-考试酷-课后资料抓取 +/401041/ 0.010854240 10 中文-试题-学科竞赛资料试题 +/401042/ 0.005200987 10 中文-试题-火星搜题APP试题数据-纯文试题(全量) +/401047/ 0.000193406 10 中文-试题-33iq智力题第一批1万条-文本对 +/401049/ 0.022248625 10 中文-试题-华图公务员题库-推理专项-文本对 +/401137/ 0 10 中文(推理)-垂类-数学专项QA数据-中文 +/401138/ 0.004837967 10 中文(推理)-试题-小学数学-猿辅导train-COT QA数据-中文 +/300317/ 0.008617457 10 英文-书籍-【纯文】open textbook-各类书籍教材 +/300114/ 0.001816212 10 英文-网页-逻辑推理COT数据(英文-70w) +/400131/ 0.009835131 10 中文-网页-逻辑推理COT数据(中文-123w) +/301016/ 1.0998e-7 10 英文-试题-【纯文】mathsisfun网站-试题数据 +/300321/ 0.007123413 10 英文-网页-【数据集筛选】benchmark反查站点-英文数学数据 +/300037/ 1.084535795 50 英文(推理)-学术-open-web-math +/401037/ 0.042431255 10 中文-试题-数学线上 query 定向合成试题 +/401048/ 0.055261370 10 中文-试题-百度教育精品题库推理相关试题-推理专项-文本对 +/500009/ 0.000805158 10 任务数据-任务-MammoTH数据 +/300136/ 0.005368267 10 英文-百科-GoodWiki数据集 +/301027/ 0.000001196 10 英文-试题-electrical-engineering电气工程问答数据-能源开源Benchmark +/300328/ 0.000014418 10 英文-网页-【生物数据】koreabiomed +/301025/ 0.000045433 10 英文-试题-fingpt-fiqa-qa问答数据-金融开源Benchmark +/301032/ 0.002056826 10 英文-网页-开源数学数据-SciInstruct-文本对 +/401053/ 0 10 中文-问答-信息处理能力退火数据-信息处理专项(问答)-文本对 +/400389/ 0.000668421 10 中文-书籍-10类小说剧本-PDD-文本 +/400388/ 0.000878760 10 中文-垂类-能源行业1.8万标准文档-ACG-文本 +/300326/ 0.000745006 10 英文-书籍-【ACG】电力行业第一批1330本专业书籍-文本 +/300327/ 0.043608102 10 英文-网页-CK12数学-纯文(网页) +/301026/ 0.000326669 10 英文-试题-Sujet-Finance-Instruct-177k英文评估-金融开源Benchmark +/301034/ 0.038548893 10 英文-网页-开源数学数据-MMIQC-文本对 +/301035/ 0.041540716 10 英文-网页-代码知识点合成QA数据(codeInstruct)-第二批 +/400392/ 0.000055332 10 中文-垂类-【生物数据】东方财富-文档转纯文 +/401052/ 0.015406246 10 中文-试题-百度教育第二批K12试题-人工定产高质-文本对 +/401054/ 0.000475462 10 中文-网页-牛客网 +/301030/ 0.086946662 10 英文-网页-代码知识点合成QA数据(python教科书)-第三批 +/400723/ 0.000651345 10 中文-书籍-【网络小说】ppd采购-2024Q3-第二批(文本) +/301033/ 0.000003762 10 英文-试题-puzzleprime谜题类试题问答数据(纯文) +/401050/ 9.3184e-7 10 中文-试题-FinEval试题数据-金融开源Benchmark +/100161/ 0.006921342 10 代码-代码-洛谷题库数据 +/100159/ 0.000024031 10 代码-代码-hackerearth代码数据 +/100160/ 7.8628125e-7 10 代码-代码-DevProjects By codementor +/100162/ 
0.000511434 10 代码-代码-Lintcode题库数据 +/301017/ 0.000085270 10 英文-问答-investopedia-embedding-dataset问答数据-金融开源Benchmark +/301024/ 0.006414237 10 英文-问答-bartleby教育-书籍问答对-第一批 +/301022/ 0.000063948 10 英文-试题-finance-tasks AdaptLLM金融开源Benchmark +/301018/ 0.000004426 10 英文-试题-financial-qa-10K-金融开源Benchmark +/301020/ 0.000018547 10 英文-问答-BrainBashers谜题问答-纯文 +/301019/ 5.9093333e-8 10 英文-试题-NuclearQA核能数据-能源开源Benchmark +/301012/ 0.000056494 10 英文-试题-【纯文】khanacademy教学数据-试题问答 +/300023/ 0.031022331 23 英文(推理)-学术-DM Mathematics 数学问题数据(包括代数、算数、微积分、数论与概率论等数据) +/500007/ 0.000317181 10 任务数据-任务-flan_v2_math +/301006/ 0.556026240 52 英文-试题-AlgebraicStack数学数据 +/300085/ 0.034850577 13 英文(推理)-试题-WolframAlpha生成语料0629 +/500004/ 0.000085429 10 任务数据-任务-modelscope_v1.1 +/300019/ 0.156326843 110 英文(推理)-学术-PubMed Central 生物医学论文数据 +/301007/ 0.004619885 10 英文-试题-Quizlet-flashcard +/300316/ 0.008069836 10 英文-网页-bartleby教育数据-文档文献 +/300324/ 0.002509667 10 英文-网页-弱势学科相关网页抓取数据-第一批 +/300299/ 0.003595824 10 英文-网页-IXL-AANki web数据-纯文 +/500011/ 0.010269707 10 任务数据-任务-huggingface_v2.3&v3任务数据 +/301005/ 0.001998200 10 英文-试题-Quizlet试题 +/401145/ 0.000015125 10 中文-试题-FinanceIQ-度小满金融开源Benchmark +/401146/ 1.75973333e-7 10 中文-试题-mmlu-electrical电气领域数据-能源开源Benchmark +/301031/ 0.289928856 27 英文-网页-开源数学数据-OpenMathInstruct-2-文本对 +/400010/ 0.141365042 10 中文(推理)-其他-专项预训练数据(数学) +/401022/ 0.013196836 10 中文-试题-DM Mathematics 数学问题数据(中文版) +/401028/ 0.1614 20 中文-试题-刷刷题定向试题增强-中文 +/500003/ 0.040748335 15 任务数据-任务-flan_v2_no_math +/401147/ 0.003231284 10 中文-网页-【说服力】强说服力数据挖掘-文本对 +/400383/ 0.001394032 10 中文-书籍-历史数据-3600个知识点检索得到的书籍清单(mask策略捞回数据) +/300156/ 0.014032425 10 英文-学术-pubscholar-9w +/700010/ 0.036641681 13 英文-试题-os_sft_15m_wo_g4 +/800002/ 0.034635151 12 英文-试题-opc +/700008/ 0 11 英文-试题-long_text_sft_data +/100146/ 0.154701220 24 代码-代码-opc-annealing-corpus +/300073/ 0.046649456 10 英文-书籍-redpjama_Books +/300320/ 0.557234266 52 英文-网页-【HuggingFace开源数据集】FineMath-4plus(数学推理) +/300101/ 0.002820591 10 英文-学术-地理数据-GAKG数据集 +/301009/ 0 10 
英文(推理)-垂类-数学专项QA数据-英文 +/301004/ 0.005779125 10 英文-试题-生物分子数据Mol-Instructions +/300303/ 0.014933236 10 英文-网页-PPT文本数据(slideplayer +slideserve) +/400681/ 0.000239836 10 中文-垂类-交通行业书籍-ACG(纯文) +/500006/ 0.000024415 10 任务数据-任务-math + gsm8k +/401033/ 0.00514 10 中文-试题-百度教育-不挂科电力试题-知识点增强-中文【纯文】【内容】【中文】【合作】【自然】 +/200007/ 0.011311881 10 平行语料-平行语料-翻译-平行网页数据 +/300001/ 0.160349719 20 英文-百科-Wikipedia (en)英文维基百科 +/301011/ 0.000211221 10 英文-网页-researchgate纯文问答数据 +/100130/ 0.003521280 10 代码-代码-commitpackft 数据 +/100140/ 0.135185069 22 代码-代码-codeforces-v1.1 +/100157/ 0.006554617 10 代码-代码-代码知识点合成QA数据-第一批 +/400680/ 0.000256299 10 中文-垂类-电力行业标准文档(纯文) +/400667/ 0.032566909 11 中文-百科-百度百科2024年5月26日更新的全量数据(覆盖原先百科数据) +/401030/ 0.073292 10 中文-试题-刷刷题双向知识增强-单多选-中文 +/401029/ 0.055556 10 中文-试题-刷刷题双向知识增强-知识点-中文 +/300153/ 0.003525420 10 英文-对话-quora问答评论组成的多轮对话数据 +/301002/ 0.000804454 10 英文(推理)-试题-英文数学定理证明(NaturalProofs_proofwiki) +/401013/ 0.034229525 10 中文(推理)-试题-百度文库试题 +/300081/ 0.073932628 10 英文(推理)-网页-redpajama_StackExchange过滤数学站点 +/401031/ 0.02362 10 中文-试题-百度教育-不挂科电力试题-中文 +/401035/ 0.000006635 10 中文-问答-劳动纠纷领域精编问答对【纯文字】【问答】【中文】【抓取】【自然】 +/401040/ 0.096686604 12 中文-试题-K12领域理科试题-理科专项(文本对) +/300139/ 0.018957618 10 英文-垂类-digitalcorpora文档 +/400671/ 0.007065981 10 中文-学术-spider-论文-1700w-第二批-中文 +/400391/ 0.000170387 10 中文-书籍-【说服力】心理学&情商648本书籍-文本 +/401032/ 0.08354 10 中文-试题-百度教育-百度题库-电力相关试题-中文 +/400390/ 0.015422963 11 中文-书籍-【说服力】说服力中文书籍第一批-文档 +/300075/ 0.49133022 92 英文-问答-redpjama_StackExchange +/300278/ 0.408453110 288 英文-学术-ArXiv 论文数据(LaTeX格式,主要由数学、计算机和物理学组成) +/300095/ 0.003341633 10 英文-百科-维基百科数学网页 +/300008/ 0.428304576 101 英文-书籍-Books3 长篇书籍(包含科幻与非科幻小说) +/300297/ 0.006656111 10 英文-书籍-用户上传打分分析数据-14000-pdf +/300276/ 0.002535074 10 英文-书籍-复旦书籍-2077-pdf +/300273/ 0.000041790 10 英文-书籍-行业数据-82-pdf-简单版式 +/300275/ 0.000080798 10 英文-书籍-剧本文案-86-pdf +/300021/ 0.055823765 20 英文-学术-USPTO Backgrounds 美国专利数据(仅包含被美国专利商标局批准的专利的背景章节) +/300289/ 0.012064707 10 英文-书籍-英文 Springer出版图书 +/300151/ 0.216530092 40 英文-对话-reddit英文对话(第一批) 
+/300022/ 0.029885555 21 英文-学术-PubMed Abstracts 生物医学论文摘要数据 +/200005/ 0.000032112 10 平行语料-平行语料-篇章级多语言平行数据-chinadaily中英频道 +/300017/ 0.252459481 47 英文-问答-Stack Exchange问答数据 +/300024/ 0.009099030 10 英文-学术-EuroParl 欧洲议会期刊数据 +/300025/ 0.005437775 10 英文-学术-PhilPapers 哲学论文数据 +/401005/ 0.001047805 10 中文(推理)-试题-小学数学-猿辅导train +/100138/ 0.012531748 10 代码-代码-code-instruction-v1.1(清洗后重新入仓) +/100142/ 0.324497172 51 代码-代码-general-code-v1.1(清洗后重新入仓) +/400150/ 0.042563868 10 中文-合成-代数与分析能力 +/401021/ 0.001276874 10 中文-试题-LaWGPT法律竞赛试题 +/200621/ 0.004428583 10 平行语料-网页-中英向量合成数据(源400116 +/300026/ 0.008649082 10 英文-学术-NIH ExPorter 生物医学专利摘要数据 +/400145/ 0.028199290 21 中文-对话-B站评论第一批 +/400685/ 0.000019176 10 中文-对话-DuConv主动聊天任务数据集(纯文) +/300084/ 0.001032644 10 英文-试题-toolbench工具调用 +/300010/ 0.007351746 10 英文-书籍-BookCorpus2 书籍数据(由未正式发表书籍组成) +/100154/ 0.500997626 200 代码-代码-commitpack-v1 +/100155/ 0.063144033 25 代码-代码-github-issue代码数据 +/300072/ 0.608984059 131 英文-学术-redpjama_ArXiv +/400655/ 0.027240954 10 中文-对话-spider网页过滤多轮(高质量)更新 +/400373/ 0.010967084 10 中文-学术-spider-论文1700w-第一批-47w-pdf(中文) +/400105/ 0.174754822 16 中文(推理)-网页-数学网页vip库0608 +/300087/ 0.009795067 10 英文-网页-openreview论文摘要与评论 +/401034/ 0.13008 16 中文-试题-百度教育-百度题库-电力相关试题-知识点增强-中文【纯文】【内容】【中文】【合作】【自然】 +/300009/ 0.029606909 11 英文-书籍-Gutenberg (PG-19) 经典西方文学书籍数据 +/300143/ 0.471065105 110 英文-书籍-zlibary (epub解析) +/200012/ 0.239147163 29 平行语料-平行语料-翻译-中英文语料wmt +/300068/ 0.008538754 10 英文-书籍-亚马逊12w英文书单-第二批-约4000 亚马逊12w英文书单-理工科书籍-2.7w +/400120/ 0.165578080 59 中文-百科-百度百科2023年8月4日更新的全量数据(覆盖原先百科数据) +/300082/ 0.000029408 10 英文-试题-类BBH训练样本 +/400178/ 0.002457650 10 中文-垂类-中国证券报 +/300069/ 0.235950495 57 英文-数据库-PubChem,Reactome, NASA +/300113/ 0.010712575 10 英文-网页-readthedocs网页数据 +/300292/ 0.019369272 10 英文-合成-Algorithm Solver 1期&2期-英文 +/300126/ 0.007587147 10 英文-合成-低级数学运算 +/300291/ 0.035686748 13 英文-合成-Algorithm Solver 3.1期&3.2期 +/100139/ 0.003852292 10 代码-代码-code-problem-v1.1 +/100143/ 0.025920724 10 代码-代码-jupyter-v1.1(清洗后重新入仓) +/100144/ 0.023971742 10 
代码-代码-jupyter-v2.1(清洗后重新入仓) +/100147/ 0.005426379 10 代码-代码-jupyter-v3.1(清洗后重新入仓) +/100148/ 0.001321260 10 代码-代码-leetcode-v1 +/100149/ 0.036570082 10 代码-代码-leetcode-v2 +/100150/ 0.045218142 10 代码-代码-codebench-v1代码数据 +/300173/ 0.155629214 23 英文-问答-里屋社区数据集:数学领域有关的问答数据 +/401139/ 0.011541056 10 中文-试题-刷刷题数据改造-多选题改单选题 +/401140/ 0.016042149 10 中文-试题-刷刷题数据改造-选择题选项顺序打乱&选择题改判断题&选择题改简答题 +/300329/ 0.305080876 51 英文-网页-【数学推理文本】dolmino-mix-1124-math-HuggingFace-退火数据 +/400724/ 0.193480222 72 中文-百科-百度百科2025年1月12日全量更新-百科团队-文本 +/400101/ 0.037382915 28 中文-文库-文库-法律(替代400055) +/200004/ 0.000053225 10 平行语料-平行语料-篇章级多语言平行数据-ft中英频道 +/400370/ 0.006267959 10 中文-专利-维普专利-9w-pdf +/300140/ 0.003650648 10 英文-网页-K12相关网页(字幕解析) +/300308/ 0.000334133 10 英文-学术-历史数据-pubscholar-9w(mask策略捞回数据) +/300314/ 0.008328762 10 英文-学术-历史数据-aminer-128w-简单版式(mask策略捞回数据) +/300318/ 0.274455412 34 英文-百科-【审核专项】wiki百科数据-条目(英文全量) +/300305/ 0.010346531 10 英文-网页-Goodreads评论 +/400623/ 0.014226065 10 中文-数据库-中文优质 caption +/300158/ 0.004283739 10 英文-问答-里屋社区数据集:wikihow英文问答 +/400665/ 0.018959938 10 中文-合成-Algorithm Solver 1期&2期-中文 +/400161/ 0.001164170 10 中文-对话-京东-淘宝电商对话 +/300323/ 0.000526813 10 英文-网页-【说服力】开源数据集DailyPersuasion +/300313/ 0.062847912 47 英文-网页-知识点试题改造合成数据集 +/300319/ 2.050215290 192 英文-网页-【HuggingFace开源数据集】FineMath-3plus(数学推理) +/300322/ 0.012923087 10 英文-网页-【说服力】说服力英文书籍第一批-文档 +/400181/ 0.003513481 10 中文-对话-电商客服数据 +/400656/ 0.007287521 10 中文-对话-spider网页过滤多轮(中质量) +/400660/ 0.056956065 15 中文-对话-spider-电商直播视频ASR转文本数据 +/400129/ 0.002704659 10 中文-书籍-中图简单版式4000本 +/300300/ 0.11802262 81 英文-学术-spider-论文1700w-第一批-178w-pdf(英文) +/300167/ 0.016092177 12 英文-数据库-英文优质 caption +/400024/ 0.430057396 80 中文-试题-百度教育作文/诗词 +/400387/ 0.001297218 10 中文-垂类-智慧职教(课程)(纯文) +/400381/ 0.011926724 10 中文-学术-历史数据-维普学位论文-0619(mask策略捞回数据) +/400386/ 0.000045197 10 中文-垂类-【acg-行业数据】金融行业第二批41本专业书籍-ACG(文档/文本(content)/理解&生成/通用) +/400718/ 0.000087185 10 中文-对话-小说转对话-多轮专项-文本对 +/400719/ 0.000078533 10 中文-对话-selfplay对话数据-多轮专项-文本对 +/400378/ 0.000029977 10 
中文-垂类-电商直播文稿(纯文) +/400385/ 0.000011633 10 中文-垂类-【acg-行业数据】交通行业公路业务知识(纯文) +/100141/ 0.135998526 21 代码-代码-developer-community-v1.1(清洗后重新入仓) +/400646/ 0.005912905 10 中文-书籍-15w中文书籍-其他理工科书籍 +/200014/ 0.022128531 10 平行语料-平行语料-来自翻译团队的中英文平行语料 +/300096/ 0.000274601 10 英文-网页-从CC中筛选包含高等数学与科学相关公式的网页 +/300277/ 0.055427506 10 英文-网页-从commoncrawl中筛选包含高等数学与科学相关公式的网页 +/300290/ 0.049165793 10 英文-网页-数学网页专项九合一汇总 +/200620/ 0.009161019 13 平行语料-学术-中英向量合成数据(源400619 +/400239/ 0.014897824 10 中文-对话-微博-多轮数据 +/400712/ 0.000158719 10 中文-网页-【音频转纯文】得到-音视频流媒体(纯文) +/400707/ 0.013256509 10 中文-网页-问一问&播客对话数据-多轮专项 +/400716/ 0.022618291 16 中文-网页-【数据集筛选】【说服力】强说服力数据挖掘-文本 +/400710/ 0.000255895 10 中文-网页-【视频转纯文】【说服力】pdd-辩论赛比赛视频-文本(content) +/400717/ 0.001778207 10 中文-网页-【数据集筛选】【说服力】强说服力数据精加工-说服要素增强 +/400715/ 0.000123443 10 中文-网页-【音视频转纯文】【说服力】pdd-心理学音视频-文本(content) +/400713/ 0.000076532 10 中文-网页-【音频转纯文】电商直播文稿-视频数据(文本) +/400711/ 0.000028231 10 中文-网页-【视频转纯文】【说服力】pdd奇葩说1-7季-文本(content) +/400709/ 0.000004578 10 中文-网页-【视频转纯文】【说服力】pdd-辩论赛培训视频-文本(content) +/400714/ 0.000002955 10 中文-网页-【音频转纯文】央6电影纪录片音频(文本) +/300003/ 0.002559880 10 英文-对话-Ubuntu IRCUbuntu 系统相关对话日志 +/300162/ 0.080042447 32 英文-合成-icl-ref +/300127/ 0.666819573 250 英文-对话-reddit对话数据 +/400085/ 0.026819219 11 中文-学术-10w理工科论文 +/400207/ 0.028018129 21 中文-合成-需求标签tag增强 +/400205/ 0.008679003 10 中文-合成-小红书主题tag增强 +/400626/ 0.087133313 65 中文-合成-条件约束生成数据(collie工具) +/201008/ 0.009847289 10 平行语料-试题-源401008 +/200111/ 0.010215286 10 平行语料-网页-源400111 +/400208/ 0.003940606 10 中文-合成-写作风格增强 +/400206/ 0.003422340 10 中文-合成-知乎主题tag增强 +/200106/ 0.01629529 10 平行语料-网页-源400106 +/200091/ 0.029744786 10 平行语料-网页-源400091 +/200113/ 0.061492316 13 平行语料-网页-源400113 +/200033/ 0.053508506 12 平行语料-网页-源400033 +/200010/ 0.035356092 10 平行语料-其他-源400010 +/200023/ 0.030241943 10 平行语料-网页-源400023 +/300088/ 0.011886271 10 英文-网页-维基百科nature&science词条引文外链站点数据 +/200093/ 0.083837447 18 平行语料-网页-源400093 +/200092/ 0.101052883 22 平行语料-网页-源400092 +/300302/ 0.188836192 155 英文-学术-spider-论文-1700w-第二批-英文 +/400084/ 
0.003962362 10 中文-学术-文科论文解析(包含400083) +/200038/ 0 10 平行语料-问答-源400038 +/200045/ 0.018567822 10 平行语料-问答-源400045 +/200122/ 0.006794635 10 平行语料-专利-源400122 +/400382/ 0.004016988 10 中文-书籍-历史数据-中图简单版式4000本(mask策略捞回数据) +/200008/ 0.009533830 10 平行语料-试题-融合多个中文试题数据源 +/200116/ 0.002967857 10 平行语料-网页-源400116 +/200041/ 0.001845739 10 平行语料-问答-源400041 +/400222/ 0.058112990 23 中文-书籍-万话网络小说 +/400627/ 0.000476358 10 中文-合成-角色扮演数据 +/400191/ 0.000701442 10 中文-合成-角色对话 +/401026/ 0.196423801 24 中文-试题-百度教育文科题目 +/400036/ 0.213550949 81 中文-问答-百度知道 +/400151/ 0.181640498 70 中文-对话-小说&网文对话数据 +/400674/ 2.769272989 1106 中文-网页-微信公众号纯文2024.10.11存量数据 +/400675/ 0.000464246 10 中文-垂类-金融相关数据-研报&财报 +/300310/ 0.140917415 52 英文-网页-【new】benchmark网站反查-英文数据 +/300271/ 0.126941465 39 英文-网页-mathpile文本数据 +/300298/ 2.456243178 921 英文-网页-FineWeb-Edu 开源数据 +/400683/ 1.809877020 528 中文-网页-benchmark反查站点-综合主站(6个) +/300067/ 0.052336458 39 英文-数据库-UniProt, OEIS, LIPID +/300304/ 0.364691495 136 英文-网页-Huggingface 弱势学科数据集 +/400103/ 0.161841677 70 中文-文库-文库-学前教育(覆盖400057) +/400204/ 0.022169518 16 中文-书籍-豆瓣 +/400368/ 0.056001816 42 中文-书籍-spider-书籍-52w-pdf +/400356/ 0.026642813 13 中文-书籍-复旦书籍-34504-pdf +/400192/ 0.025978778 10 中文-书籍-3.5w学科专项-简单版式 +/400364/ 0.003851156 10 中文-书籍-用户上传打分分析数据-24000-pdf +/400352/ 0.000595459 10 中文-书籍-剧本文案-3180-pdf +/400351/ 0.001016385 10 中文-书籍-通用书籍-675-pdf & 地震震例-90-pdf +/400247/ 0.000527025 10 中文-书籍-通用书籍-675-pdf +/200077/ 0.000089357 10 平行语料-网页-源100077(leetcode) +/401143/ 0.000013073 10 中文-垂类-【acg-行业数据】交通行业FAQ数据(纯文) +/401141/ 0.000002629 10 中文-垂类-【acg-行业数据】交通行业数据(专业考试数据)-ACG(纯文) +/401142/ 0.000044407 10 中文-垂类-【acg-行业数据】交通行业数据(开源数据集)(纯文) +/400697/ 0.000496174 10 中文-垂类-交通行业数据-ACG(纯文) +/401144/ 0.005225356 10 中文-垂类-【acg-行业数据】电力能源相关数据-电力问答 +/400379/ 0.007165436 10 中文-垂类-【acg-行业数据】金融行业数据-ACG(纯文) +/400698/ 0.000506602 10 中文-垂类-交通行业数据(国行标)-ACG(纯文) +/400705/ 0.000010460 10 中文-垂类-【acg-行业数据】交通法律法规(纯文) +/400375/ 0.000055653 10 中文-垂类-【acg-行业数据】交通行业数据(轨交书籍和国行标)-ACG(纯文) +/400377/ 0.000033962 10 
中文-垂类-【acg-行业数据】交通行业文档(纯文) +/400376/ 0.000047017 10 中文-垂类-【acg-行业数据】交通行业数据(书籍)11.26新增(纯文) +/400699/ 0.000031349 10 中文-垂类-能源行业法规数据-国家能源局 +/400704/ 0.000730821 10 中文-对话-多领域中文多轮对话-多轮专项(文本对) +/400380/ 0.001839532 10 中文-学术-历史数据-spider-论文1700w-第一批-47w-pdf(mask策略捞回数据) +/400016/ 0.058636600 43 中文-书籍-法律 +/400678/ 1.490375738 408 中文-网页-benchmark网站反查-中文数据 +/400102/ 0.202163233 97 中文-文库-文库-行业资料(覆盖400056) +/300150/ 0.219513474 164 英文-网页-低速源核心数据第二期 +/401007/ 0.010259254 10 中文-试题-普通作文(非议论文)(web+百度文库) +/400673/ 0.012700600 10 中文-书籍-番茄小说数据 +/300098/ 0.000710771 10 英文-网页-paperswithcode站点所有methods及下面所有的方法和内容 +/100068/ 0.217081934 87 代码-starcoder-git-commits +/100069/ 0.222823771 89 代码-starcoder-github-issues +/100151/ 0.007411130 10 代码-代码-starcoder-jupyter-scripts +/100152/ 0.008740367 10 代码-代码-ee-code-v3 +/100153/ 0.000945145 10 代码-代码-ee-code-v2 +/300002/ 0.006581946 10 英文-对话-OpenSubtitles 电影与电视节目的字幕数据 +/400657/ 0.031458149 30 中文-对话-电商直播数据 +/400371/ 0.174836572 101 中文-书籍-第二批离线数据集-书籍-txt +/6003350001/ 0.000949284 10 多语言-维基百科-Minnan +/400248/ 0.000601401 10 中文-问答-北京帕依提提财税问答数据 +/400014/ 0.019422007 14 中文-书籍-图书出版物 +/6000500096/ 0.005268196 10 多语言-CommonCrawl-布列塔尼语 +/6000690096/ 0.016805797 10 多语言-CommonCrawl-宗喀语 +/6000490096/ 0.007834570 10 多语言-CommonCrawl-波斯尼亚语 +/6000590096/ 0.004257278 10 多语言-CommonCrawl-楚瓦什语 +/300135/ 0.182460476 68 英文-网页-Clueweb22 Category B(推理高浓度) +/6001180096/ 0.015998604 10 多语言-CommonCrawl-马耳他语 +/6001050096/ 0.016079103 10 多语言-CommonCrawl-库尔德语 +/6000770095/ 0.000122776 10 多语言-维基百科-Galician +/6000820096/ 0.003101589 10 多语言-CommonCrawl-海地克里奥尔语 +/6000760096/ 0.004383107 10 多语言-CommonCrawl-苏格兰盖尔语 +/6001690096/ 0.005609511 10 多语言-CommonCrawl-土库曼语 +/6001460096/ 0.012187497 10 多语言-CommonCrawl-梵语 +/6000880096/ 0.000938697 10 多语言-CommonCrawl-伊多语 +/6001640096/ 0.017000598 10 多语言-CommonCrawl-藏语 +/6000940096/ 0.019475851 10 多语言-CommonCrawl-爱尔兰语 +/300281/ 1.112133548 417 英文-网页-RefinedWeb英文网站(推理高浓度) +/6000560096/ 0.002267842 10 多语言-CommonCrawl-车臣语 +/6001130096/ 0.009125619 10 
多语言-CommonCrawl-卢森堡语 +/6001360096/ 0.001350078 10 多语言-CommonCrawl-奥塞梯语 +/6001780096/ 0.007382466 10 多语言-CommonCrawl-弗里斯兰语 +/6000450096/ 0.012277542 10 多语言-CommonCrawl-巴什基尔语 +/6001150096/ 0.006805956 10 多语言-CommonCrawl-马达加斯加语 +/6000170096/ 0.008275904 10 多语言-CommonCrawl-爪哇语 +/6001550096/ 0.011083386 10 多语言-CommonCrawl-索马里语 +/6001810096/ 0.007309711 10 多语言-CommonCrawl-意第绪语 +/6001710096/ 0.014157654 10 多语言-CommonCrawl-维吾尔语 +/6001510096/ 0.005953573 10 多语言-CommonCrawl-信德语 +/6001420096/ 0.001181368 10 多语言-CommonCrawl-罗曼什语 +/6001580096/ 0.003222900 10 多语言-CommonCrawl-巽他语 +/6001380096/ 0.017529166 10 多语言-CommonCrawl-普什图语 +/6000390096/ 0.010052251 10 多语言-CommonCrawl-阿萨姆语 +/6001340096/ 0.020935854 10 多语言-CommonCrawl-奥里亚语 +/6001750096/ 0.000748991 10 多语言-CommonCrawl-沃拉普克语 +/6001760096/ 0.000654264 10 多语言-CommonCrawl-瓦隆语 +/6001160095/ 0.000433849 10 多语言-维基百科-Malay +/6000370096/ 0.000548524 10 多语言-CommonCrawl-阿拉贡语 +/6000660096/ 0.010642623 10 多语言-CommonCrawl-迪维希语 +/300005/ 0.004963501 10 英文-对话-YoutubeSubtitlesYoutube 字幕数据(多语言平行预料,由教育内容、流行文化与对话等数据组成) +/6000700095/ 0.000107328 10 多语言-维基百科-Estonian +/6001530095/ 0.000229382 10 多语言-维基百科-Slovak +/6000800095/ 0.000267213 10 多语言-维基百科-Greek +/6000900096/ 0.000528172 10 多语言-CommonCrawl-国际语 +/6000830095/ 0.000405922 10 多语言-维基百科-Hebrew +/6001540095/ 0.000122431 10 多语言-维基百科-Slovenian +/6000790095/ 0.000101249 10 多语言-维基百科-Georgian +/300170/ 0.031127557 23 英文-合成-条件约束生成数据(collie工具) +/300159/ 0.123790280 46 英文-问答-里屋社区数据集:StackExchange问答数据 +/6000080095/ 0.000194590 10 多语言-维基百科-Thai +/6000200095/ 0.000097229 10 多语言-维基百科-Urdu +/6000510095/ 0.000192587 10 多语言-维基百科-Bulgarian +/6001770095/ 0.000095230 10 多语言-维基百科-Welsh +/6000470095/ 0.000149749 10 多语言-维基百科-Belarusian +/6000400096/ 0.000413514 10 多语言-CommonCrawl-阿瓦尔语 +/6000630095/ 0.000127107 10 多语言-维基百科-Croatian +/6001020096/ 0.000285866 10 多语言-CommonCrawl-科米语 +/6001400096/ 0.000203648 10 多语言-CommonCrawl-克丘亚语 +/6001940001/ 0.000123159 10 多语言-维基百科-Asturian +/400632/ 0.000673756 10 
中文-问答-里屋社区数据集:wikihow中文问答 +/6001090096/ 0.000299480 10 多语言-CommonCrawl-林堡语 +/6000980095/ 0.000088780 10 多语言-维基百科-Kazakh +/6000380095/ 0.000244474 10 多语言-维基百科-Armenian +/6000490095/ 0.000088780 10 多语言-维基百科-Bosnian +/6001730095/ 0.000121679 10 多语言-维基百科-Uzbek +/6001010096/ 0.021613542 10 多语言-CommonCrawl-柯尔克孜语 +/6001820096/ 0.000274791 10 多语言-CommonCrawl-约鲁巴语 +/6000810096/ 0.000289954 10 多语言-CommonCrawl-瓜拉尼语 +/6000250095/ 0.000119719 10 多语言-维基百科-Telugu +/6000230095/ 0.000084849 10 多语言-维基百科-Hindi +/400700/ 0.000071534 10 中文-书籍-小说剧本-pdd采购2024Q3 +/400374/ 0.000218060 10 中文-书籍-小说名著 +/6000690095/ 0.000153115 10 多语言-维基百科-Esperanto +/6000600096/ 0.000112903 10 多语言-CommonCrawl-康瓦尔语 +/400648/ 0.716732068 67 中文-网页-数学网页spider +/6002070001/ 0.000143369 10 多语言-维基百科-Bangla +/6001110095/ 0.000095701 10 多语言-维基百科-Lithuanian +/6001080095/ 0.000078525 10 多语言-维基百科-Latvian +/6000650095/ 0.000141432 10 多语言-维基百科-Danish +/6000430095/ 0.000107086 10 多语言-维基百科-Azerbaijani +/6000260095/ 0.000105641 10 多语言-维基百科-Tamil +/6001140095/ 0.000115618 10 多语言-维基百科-Macedonian +/6001170095/ 0.000068962 10 多语言-维基百科-Malayalam +/400041/ 0.201815565 151 中文-问答-新医疗问答数据 +/6003070001/ 0.000064261 10 多语言-维基百科-Simple English +/301001/ 0.000005174 10 英文-试题-英语四六级雅思(电子书) +/401012/ 0.000200636 10 中文-试题-议论文作文(web+百度教育+百度文库) +/400045/ 0.053851749 32 中文-问答-知乎 +/400702/ 0.073980175 10 中文-网页-【审核专项】wiki百科数据-条目(中文繁体全量) +/400703/ 0.062628071 10 中文-网页-【审核专项】wiki百科数据-条目(中文简体全量) +/6002030001/ 0.000054121 10 多语言-维基百科-Belarusian (Taraškievica orthography) +/6000330095/ 0.000053668 10 多语言-维基百科-Afrikaans +/6001310095/ 0.000049165 10 多语言-维基百科-Norwegian Nynorsk +/6001980001/ 0.000046495 10 多语言-维基百科-South Azerbaijani +/200003/ 0.01990948 10 平行语料-翻译-wmt / UN +/6000450095/ 0.000046053 10 多语言-维基百科-Bashkir +/6000290095/ 0.000044290 10 多语言-维基百科-Kannada +/300099/ 0.000020255 10 英文-网页-IUPAC Goldbook所有化学概念 +/400209/ 0.105112368 66 中文-合成-写作质量提升-precot +/400670/ 0.011606157 13 中文-垂类-智源纯文数据集-汽车 +/300070/ 0.009732606 10 英文-网页-reddit用户评论交流数据 +/100132/ 
12.277663946 2000 代码-代码-github-v2-0415 +/400049/ 0.001149245 10 中文-书籍-计算机书籍(中文) +/300004/ 0.011594312 10 英文-对话-HackerNews 热点评论数据(由针对热点话题的用户评论组成,话题大多与计算机与企业家精神相关) +/6000520095/ 0.000037359 10 多语言-维基百科-Burmese +/6000350095/ 0.000037359 10 多语言-维基百科-Albanian +/6003360001/ 0.000032713 10 多语言-维基百科-Cantonese +/400210/ 0.053394477 35 中文-垂类-电商内部数据 +/6000300096/ 0.000037359 10 多语言-CommonCrawl-比哈尔语 +/400104/ 0.283824680 190 中文-文库-文库-others(替代400058) +/6000270095/ 0.000030058 10 多语言-维基百科-Punjabi +/6001070095/ 0.000029894 10 多语言-维基百科-Latin +/400067/ 0.003617295 10 中文-书籍-科学文库-中文理工科书籍 +/6001770096/ 0.022690344 10 多语言-CommonCrawl-威尔士语 +/6002770001/ 0.000029114 10 多语言-维基百科-Low German +/6000240095/ 0.000026230 10 多语言-维基百科-Marathi +/6002670001/ 0.000026230 10 多语言-维基百科-Minangkabau +/6000910096/ 0.000029155 10 多语言-CommonCrawl-西方国际语 +/400037/ 0.017456813 12 中文-问答-问一问 +/6003000001/ 0.000025428 10 多语言-维基百科-Santali +/6001310096/ 0.022779158 10 多语言-CommonCrawl-新挪威语 +/6001780095/ 0.000025348 10 多语言-维基百科-Western Frisian +/6001320095/ 0.000024310 10 多语言-维基百科-Occitan +/6001150095/ 0.000024271 10 多语言-维基百科-Malagasy +/6000180095/ 0.000023993 10 多语言-维基百科-Tagalog +/6001190096/ 0.000025668 10 多语言-CommonCrawl-马恩岛语 +/6000360096/ 0.023071663 10 多语言-CommonCrawl-阿姆哈拉语 +/6002950001/ 0.000021050 10 多语言-维基百科-Western Punjabi +/6002590001/ 0.000021012 10 多语言-维基百科-Ladin +/6001620095/ 0.000020624 10 多语言-维基百科-Tajik +/6003050001/ 0.000020547 10 多语言-维基百科-Shan +/300100/ 0.000008498 10 英文-网页-NASA Exoplanet 新闻数据 +/6000390095/ 0.000020431 10 多语言-维基百科-Assamese +/400075/ 0.000779161 10 中文-书籍-3600个知识点检索得到的书籍清单 +/6000210095/ 0.000020161 10 多语言-维基百科-Hausa +/400047/ 0.015566506 11 中文-书籍-小说-行业top网文 +/6000370095/ 0.000018250 10 多语言-维基百科-Aragonese +/6000160096/ 0.022840342 10 多语言-CommonCrawl-斯瓦希里语 +/6001010095/ 0.000018213 10 多语言-维基百科-Kyrgyz +/6002150001/ 0.000017834 10 多语言-维基百科-Central Kurdish +/6001130095/ 0.000017082 10 多语言-维基百科-Luxembourgish +/6003030001/ 0.001420308 10 多语言-维基百科-Serbo-Croatian +/6001960001/ 0.000016633 10 
多语言-维基百科-Kotava +/6000500095/ 0.000016372 10 多语言-维基百科-Breton +/6001630096/ 0.022942882 11 多语言-CommonCrawl-鞑靼语 +/6000870095/ 0.000016335 10 多语言-维基百科-Icelandic +/6000890095/ 0.000016261 10 多语言-维基百科-Igbo +/6001860001/ 0.000016224 10 多语言-维基百科-Alemannic +/300142/ 0.900666666 337 英文-网页-MMLU-难样本-英文 +/6001380095/ 0.000015631 10 多语言-维基百科-Pashto +/6001050095/ 0.000015153 10 多语言-维基百科-Kurdish +/400013/ 0.086267911 122 中文-书籍-龙源期刊 +/6001640095/ 0.000014713 10 多语言-维基百科-Tibetan +/6002810001/ 0.000002898 10 多语言-维基百科-N’Ko +/6001260095/ 0.000014494 10 多语言-维基百科-Nepali +/6000170095/ 0.000013803 10 多语言-维基百科-Javanese +/6003060001/ 0.000013659 10 多语言-维基百科-Sinhala +/100070/ 0.051859991 10 代码-starcoder-jupyter-structured +/400176/ 0.017917055 17 中文-合成-抽象符号推理-ascii_art +/6000940095/ 0.000013191 10 多语言-维基百科-Irish +/6000280095/ 0.000013083 10 多语言-维基百科-Gujarati +/500008/ 0.000285964 10 任务数据-任务-汉语拆字&专有名词 +/6003250001/ 0.000012051 10 多语言-维基百科-Venetian +/6002850001/ 0.000011417 10 多语言-维基百科-Odia +/6000460095/ 0.001446829 10 多语言-维基百科-Basque +/6000590095/ 0.000011207 10 多语言-维基百科-Chuvash +/6002780001/ 0.000010720 10 多语言-维基百科-Newari +/6001070096/ 0.023282341 12 多语言-CommonCrawl-拉丁语 +/6000160095/ 0.000010065 10 多语言-维基百科-Swahili +/300145/ 0.000031086 10 英文-垂类-k12相关网页-pdf-简单版式 +/6001220095/ 0.000009723 10 多语言-维基百科-Mongolian +/300295/ 0.000133656 10 英文-对话-历史英文被过滤语料回捞数据-对话(reddit英文对话第一批) +/6002500001/ 0.000008977 10 多语言-维基百科-Khmer +/6001580095/ 0.000008943 10 多语言-维基百科-Sundanese +/6002410001/ 0.000008574 10 多语言-维基百科-Western Armenian +/6002000001/ 0.000008341 10 多语言-维基百科-Bavarian +/400155/ 0.036901387 13 中文-网页-百科优质词条外链站点数据(推理高浓度) +/300013/ 0.240575585 65 英文-网页-OpenWebText2 网页库数据(多语言) +/6003120001/ 0.000007713 10 多语言-维基百科-Silesian +/6002690001/ 0.000007680 10 多语言-维基百科-Mon +/6001060096/ 0.023799202 13 多语言-CommonCrawl-老挝语 +/6002060001/ 0.000002863 10 多语言-维基百科-Pa'O +/6003020001/ 0.000006835 10 多语言-维基百科-Scots +/6000820095/ 0.000006707 10 多语言-维基百科-Haitian Creole +/400188/ 0.054522614 40 中文-对话-贴吧v3-多轮对话-优质数据 
+/6000880095/ 0.000006515 10 多语言-维基百科-Ido +/6001180095/ 0.00000642 10 多语言-维基百科-Maltese +/6001420095/ 0.000002548 10 多语言-维基百科-Romansh +/400040/ 0.001365487 10 中文-问答-新浪 +/6002200001/ 0.000006103 10 多语言-维基百科-Zazaki +/6002600001/ 0.000005914 10 多语言-维基百科-Lombard +/400012/ 0.047220190 35 中文-书籍-小说百度阅读出版物 +/6001990001/ 0.000005694 10 多语言-维基百科-Balinese +/6000280096/ 0.023643323 13 多语言-CommonCrawl-古吉拉特语 +/6000060095/ 0.002383376 10 多语言-维基百科-Indonesian +/400100/ 0.029084601 21 中文-网页-VIP库-微信公众号文本 +/6000270096/ 0.023662608 14 多语言-CommonCrawl-旁遮普语 +/6002940001/ 0.000004500 10 多语言-维基百科-Piedmontese +/6001400095/ 0.000004500 10 多语言-维基百科-Quechua +/300349/ 0.708291032 265 英文-网页-RefinedWeb英文网站(弱势学科)_minhash0.7网页全局&局部去重 +/400676/ 0.043443129 32 中文-垂类-智源纯文数据集-体育 +/6001510095/ 0.000004231 10 多语言-维基百科-Sindhi +/6002020001/ 0.000004172 10 多语言-维基百科-Central Bikol +/6001460095/ 0.000004202 10 多语言-维基百科-Sanskrit +/6002990001/ 0.000004202 10 多语言-维基百科-Yakut +/6001920001/ 0.000004083 10 多语言-维基百科-Moroccan Arabic +/401014/ 0.000321003 10 中文-试题-BAAI Exam文科题 +/6002180001/ 0.000003790 10 多语言-维基百科-Dagbani +/6001820095/ 0.000003558 10 多语言-维基百科-Yoruba +/300311/ 0.474249004 355 英文-网页-英文知识分级退火数据 +/6001700095/ 0.000003443 10 多语言-维基百科-Twi +/6001090095/ 0.000003414 10 多语言-维基百科-Limburgish +/6003220001/ 0.000001334 10 多语言-维基百科-Tuvinian +/6003310001/ 0.000003328 10 多语言-维基百科-Mingrelian +/6001750095/ 0.000003214 10 多语言-维基百科-Volapük +/6002420001/ 0.000003271 10 多语言-维基百科-Iloko +/6000360095/ 0.000003158 10 多语言-维基百科-Amharic +/6000560095/ 0.001507632 10 多语言-维基百科-Chechen +/6001810095/ 0.000003073 10 多语言-维基百科-Yiddish +/6002400001/ 0.000002905 10 多语言-维基百科-Upper Sorbian +/6000900095/ 0.000002933 10 多语言-维基百科-Interlingua +/6002080001/ 0.000002905 10 多语言-维基百科-Bishnupriya +/6003280001/ 0.001523826 10 多语言-维基百科-Waray +/6000150095/ 0.002037323 10 多语言-维基百科-Turkish +/6001470096/ 0.00000285 10 多语言-CommonCrawl-萨丁尼亚语 +/400644/ 0.000856489 10 中文-合成-QA-style合成数据 +/6003330001/ 0.000001007 10 多语言-维基百科-Standard Moroccan Tamazight 
+/6003240001/ 0.000002492 10 多语言-维基百科-Uyghur +/6003150001/ 9.75e-7 10 多语言-维基百科-Tulu +/6001870001/ 9.72e-7 10 多语言-维基百科-Southern Altai +/400369/ 0.000127151 10 中文-期刊-维普期刊-1w-pdf +/400357/ 0.000013011 10 中文-对话-三联生活周刊、中国新闻周刊对话数据 +/400355/ 0.000098824 10 中文-对话-凤凰卫视-媒体逐字稿数据 +/400366/ 0.000008572 10 中文-对话-凤凰卫视媒体逐字稿-第二批 +/400367/ 0.000004555 10 中文-对话-南方人物周刊 +/400359/ 0.000002782 10 中文-对话-圆桌派视频字幕 +/6000610096/ 0.000002574 10 多语言-CommonCrawl-科西嘉语 +/6000330096/ 0.024599836 16 多语言-CommonCrawl-南非语 +/6001240095/ 0.000002249 10 多语言-维基百科-Navajo +/6002680001/ 0.000002329 10 多语言-维基百科-Manipuri +/6000660095/ 8.466e-7 10 多语言-维基百科-Divehi +/6002320001/ 0.000002061 10 多语言-维基百科-Scottish Gaelic +/6002630001/ 0.000002045 10 多语言-维基百科-Maithili +/6001730096/ 0.024639228 16 多语言-CommonCrawl-乌孜别克语 +/6002580001/ 0.000002005 10 多语言-维基百科-Ligurian +/6002100001/ 7.938e-7 10 多语言-维基百科-Russia Buriat +/6002860001/ 0.000001940 10 多语言-维基百科-Ossetic +/6001630095/ 0.001533951 10 多语言-维基百科-Tatar +/6003130001/ 7.728e-7 10 多语言-维基百科-Sakizaya +/300006/ 0.030760461 10 英文-对话-reddit +/6000720095/ 0.000001888 10 多语言-维基百科-Faroese +/6002720001/ 0.000001881 10 多语言-维基百科-Erzya +/6002710001/ 0.000001873 10 多语言-维基百科-Mirandese +/6002880001/ 0.000001798 10 多语言-维基百科-Pampanga +/6003010001/ 0.000001798 10 多语言-维基百科-Sicilian +/6002730001/ 0.000001756 10 多语言-维基百科-Mazanderani +/6002970001/ 0.000001716 10 多语言-维基百科-Tarantino +/6002160001/ 0.000001708 10 多语言-维基百科-Crimean Tatar +/6003320001/ 0.000001693 10 多语言-维基百科-Zeelandic +/6000310095/ 6.798e-7 10 多语言-维基百科-Abkhazian +/6003290001/ 0.000001685 10 多语言-维基百科-Wu +/6002660001/ 0.000001588 10 多语言-维基百科-Eastern Mari +/100134/ 5.251074464 2000 代码-代码-The Stack v2-train-full-ids(去除github-v2-0415已有repo) +/6002340001/ 0.000001571 10 多语言-维基百科-Goan Konkani +/6003080001/ 0.000001551 10 多语言-维基百科-Saraiki +/6001550095/ 0.000001538 10 多语言-维基百科-Somali +/6003260001/ 0.000001533 10 多语言-维基百科-Veps +/6002930001/ 6.192e-7 10 多语言-维基百科-Palatine German +/6003340001/ 0.000001513 10 多语言-维基百科-Literary Chinese 
+/6000970095/ 5.943e-7 10 多语言-维基百科-Kashmiri +/6003210001/ 0.000001473 10 多语言-维基百科-Tumbuka +/6003110001/ 5.598e-7 10 多语言-维基百科-Saterland Frisian +/6001760095/ 0.000001394 10 多语言-维基百科-Walloon +/6001320096/ 0.025684532 18 多语言-CommonCrawl-奥克语 +/6002270001/ 0.000001369 10 多语言-维基百科-Northern Frisian +/6000610095/ 0.000001332 10 多语言-维基百科-Corsican +/6003200001/ 5.352e-7 10 多语言-维基百科-Taroko +/6002040001/ 0.000001311 10 多语言-维基百科-Bhojpuri +/200623/ 0.065344722 48 平行语料-网页-中英向量合成数据(源400093 +/6001060095/ 0.000001296 10 多语言-维基百科-Lao +/6003270001/ 0.000001284 10 多语言-维基百科-West Flemish +/6001690095/ 0.000001264 10 多语言-维基百科-Turkmen +/6002760001/ 0.000001257 10 多语言-维基百科-Low Saxon +/6001900001/ 2.50537438e-7 10 多语言-维基百科-Angika +/6001000095/ 0.000001247 10 多语言-维基百科-Kinyarwanda +/6000810095/ 0.000001216 10 多语言-维基百科-Guarani +/6000780095/ 4.815e-7 10 多语言-维基百科-Ganda +/6001850001/ 0.000001197 10 多语言-维基百科-Achinese +/300325/ 0.356531958 263 英文-网页-fineweb推理数据(第二批)(模糊去重) +/6002010001/ 0.000001163 10 多语言-维基百科-Samogitian +/6002210001/ 4.719e-7 10 多语言-维基百科-Lower Sorbian +/6002110001/ 4.623e-7 10 多语言-维基百科-Chavacano +/200002/ 0.044962196 33 平行语料-翻译-OPUS +/6002280001/ 4.551e-7 10 多语言-维基百科-Friulian +/6001470095/ 0.000001129 10 多语言-维基百科-Sardinian +/6000180096/ 0.025123174 19 多语言-CommonCrawl-他加禄语 +/6001410095/ 0.001561550 10 多语言-维基百科-Romanian +/6002510001/ 4.245e-7 10 多语言-维基百科-Komi-Permyak +/300086/ 0.066175197 25 英文-网页-RoBERTa-stories数据集 +/6000750095/ 2.075551707e-7 10 多语言-维基百科-Fula +/6002650001/ 0.000001018 10 多语言-维基百科-Moksha +/6002840001/ 4.032e-7 10 多语言-维基百科-Livvi-Karelian +/6002980001/ 9.954e-7 10 多语言-维基百科-Rusyn +/6001840001/ 9.858e-7 10 多语言-维基百科-Zulu +/6000140095/ 0.002091422 10 多语言-维基百科-Persian +/6002560001/ 9.672e-7 10 多语言-维基百科-Lezghian +/6003040001/ 3.801e-7 10 多语言-维基百科-Tachelhit +/6001190095/ 9.444e-7 10 多语言-维基百科-Manx +/6002220001/ 3.801e-7 10 多语言-维基百科-Doteli +/6002450001/ 3.525e-7 10 多语言-维基百科-Lojban +/400706/ 0.240076382 173 中文-网页-中文知识分级退火数据 +/6001490095/ 8.274e-7 10 多语言-维基百科-Shona +/6003090001/ 
8.184e-7 10 多语言-维基百科-Inari Sami +/300312/ 0.180215637 135 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000540096/ 0.024792144 19 多语言-CommonCrawl-高棉语 +/6002750001/ 7.734e-7 10 多语言-维基百科-Neapolitan +/6002350001/ 7.602e-7 10 多语言-维基百科-Gorontalo +/6002830001/ 7.512e-7 10 多语言-维基百科-Northern Sotho +/6002790001/ 3.072e-7 10 多语言-维基百科-Nias +/6002460001/ 7.512e-7 10 多语言-维基百科-Kara-Kalpak +/300280/ 0.138689640 208 英文-专利-英文专利 +/6001020095/ 7.398e-7 10 多语言-维基百科-Komi +/6000290096/ 0.025091585 20 多语言-CommonCrawl-卡纳达语 +/6001680095/ 2.76269503e-7 10 多语言-维基百科-Tswana +/6002170001/ 2.784664743e-7 10 多语言-维基百科-Kashubian +/6001290095/ 0.00159324 10 多语言-维基百科-Norwegian +/6002050001/ 6.786e-7 10 多语言-维基百科-Banjar +/6001440095/ 1.34848023e-7 10 多语言-维基百科-Samoan +/6002120001/ 6.564e-7 10 多语言-维基百科-Mindong +/6002470001/ 6.432e-7 10 多语言-维基百科-Kabyle +/6001790095/ 2.544623304e-7 10 多语言-维基百科-Wolof +/6002900001/ 6.324e-7 10 多语言-维基百科-Picard +/6002620001/ 1.242053655e-7 10 多语言-维基百科-Madurese +/300012/ 0.407028972 171 英文-网页-Pile-CC 网页库数据 +/400093/ 0.050580450 37 中文-网页-中文高质量网页库第二批(0616过滤) +/6000580095/ 1.231276377e-7 10 多语言-维基百科-Church Slavic +/6002570001/ 6.048e-7 10 多语言-维基百科-Lingua Franca Nova +/6000400095/ 6.066e-7 10 多语言-维基百科-Avaric +/6000910095/ 5.961e-7 10 多语言-维基百科-Interlingue +/6001280095/ 5.769e-7 10 多语言-维基百科-Northern Sami +/6002480001/ 2.28701057e-7 10 多语言-维基百科-Kabardian +/6002530001/ 2.35098237e-7 10 多语言-维基百科-Colognian +/6002190001/ 2.28701129e-7 10 多语言-维基百科-Dagaare +/6002640001/ 5.577e-7 10 多语言-维基百科-Basa Banyumasan +/6000150096/ 0.034764535 29 多语言-CommonCrawl-土耳其语 +/6002870001/ 4.338e-7 10 多语言-维基百科-Pangasinan +/6002490001/ 2.09690186e-7 10 多语言-维基百科-Kabiye +/6000600095/ 5.178e-7 10 多语言-维基百科-Cornish +/6003230001/ 5.073e-7 10 多语言-维基百科-Udmurt +/6001200095/ 5.052e-7 10 多语言-维基百科-Māori +/6000240096/ 0.025417773 22 多语言-CommonCrawl-马拉地语 +/6002370001/ 4.845e-7 10 多语言-维基百科-Hakka Chinese +/6001350095/ 1.888999455e-7 10 多语言-维基百科-Oromo +/300014/ 0.000758372 10 英文-网页-Enron Emails 邮件数据 +/6002440001/ 1.868410965e-7 10 
多语言-维基百科-Jamaican Creole English +/400244/ 0.010880140 10 中文-网页-大搜—小红书数据 +/6000740095/ 0.001614875 10 多语言-维基百科-Finnish +/6002240001/ 4.392e-7 10 多语言-维基百科-Extremaduran +/6003300001/ 1.705085886e-7 10 多语言-维基百科-Kalmyk +/6002250001/ 4.191e-7 10 多语言-维基百科-Võro +/6002890001/ 4.212e-7 10 多语言-维基百科-Papiamento +/6001880001/ 1.66868598e-7 10 多语言-维基百科-Amis +/6000990095/ 8.291558e-8 10 多语言-维基百科-Kikuyu +/6000420095/ 4.068e-7 10 多语言-维基百科-Aymara +/6002960001/ 8.0616336e-8 10 多语言-维基百科-Aromanian +/6002380001/ 1.59828762e-7 10 多语言-维基百科-Hawaiian +/6002700001/ 3.969e-7 10 多语言-维基百科-Western Mari +/6002390001/ 3.828e-7 10 多语言-维基百科-Fiji Hindi +/6002260001/ 3.828e-7 10 多语言-维基百科-Arpitan +/6001620096/ 0.025905823 23 多语言-CommonCrawl-塔吉克语 +/6002330001/ 3.552e-7 10 多语言-维基百科-Gilaki +/400029/ 0.092360254 69 中文-网页-文案(作文、演讲稿等) +/6001520096/ 0.025990615 23 多语言-CommonCrawl-僧伽罗语 +/6002360001/ 1.33364553e-7 10 多语言-维基百科-Gun +/300342/ 0.129748617 41 英文-网页-Clueweb22CategoryB(弱势学科)_minhash0.7网页全局&局部去重 +/6002540001/ 3.3e-7 10 多语言-维基百科-Ladino +/6002610001/ 1.318104054e-7 10 多语言-维基百科-Latgalian +/6003180001/ 1.275523281e-7 10 多语言-维基百科-Tongan +/6002910001/ 1.265878506e-7 10 多语言-维基百科-Nigerian Pidgin +/6000110095/ 0.002167557 10 多语言-维基百科-Vietnamese +/6001890001/ 3.048e-7 10 多语言-维基百科-Old English +/6002800001/ 1.81772176e-7 10 多语言-维基百科-Novial +/6002520001/ 2.992002771e-7 10 多语言-维基百科-Karachay-Balkar +/6001220096/ 0.025902340 24 多语言-CommonCrawl-蒙古语 +/6000120095/ 0.002226276 10 多语言-维基百科-Korean +/6003140001/ 2.80205471e-7 10 多语言-维基百科-Tayal +/6002310001/ 1.105726713e-7 10 多语言-维基百科-Guianan Creole +/6001100095/ 2.59556788e-7 10 多语言-维基百科-Lingala +/400701/ 0.121971089 91 中文-垂类-智源纯文数据集-文学 +/6003160001/ 9.2918323e-8 10 多语言-维基百科-Tetum +/6001800095/ 2.355035583e-7 10 多语言-维基百科-Xhosa +/6002430001/ 2.24536354e-7 10 多语言-维基百科-Ingush +/6002230001/ 2.245363452e-7 10 多语言-维基百科-Emiliano-Romagnolo +/400035/ 0.007915126 10 中文-网页-作文 +/6001970001/ 8.87233128e-8 10 多语言-维基百科-Awadhi +/6002300001/ 2.17273911e-7 10 多语言-维基百科-Gan +/100019/ 
0.000063483 10 代码-网页-VBA编程相关网站+400054 emojiall +/100021/ 0.008660972 10 代码-markdown-各类带表格的markdown语料 +/301008/ 2.033326666 1524 英文(推理)-网页-fineweb英文推理数据 +/6002820001/ 2.06456002e-7 10 多语言-维基百科-Norman +/6002140001/ 8.29415673e-8 5 多语言-维基百科-Cherokee +/6002090001/ 2.028706863e-7 10 多语言-维基百科-Buginese +/6001480095/ 0.001647660 10 多语言-维基百科-Serbian +/400677/ 0.024186679 34 中文-垂类-金融相关数据-金融资讯 +/6003170001/ 1.92178418e-7 10 多语言-维基百科-Talysh +/6002290001/ 1.555048722e-7 10 多语言-维基百科-Gagauz +/200011/ 0.306725598 314 平行语料-平行语料-翻译-基于opus集翻译小语种数据集 +/6001910001/ 6.96567603e-8 10 多语言-维基百科-Aramaic +/400091/ 0.015889569 11 中文-网页-安全权威网页第三批并与第一批、第二批合并 +/6002920001/ 5.8982595e-8 10 多语言-维基百科-Pennsylvania German +/6001830001/ 5.8304299e-8 10 多语言-维基百科-Zhuang +/6001030095/ 2.90674728e-8 10 多语言-维基百科-Kongo +/6003100001/ 5.5772518e-8 10 多语言-维基百科-Sranan Tongo +/400654/ 0.334551889 946 中文-学术-维普论文 +/6002550001/ 5.19275805e-8 10 多语言-维基百科-Lak +/400722/ 0.323560728 121 中文-网页-头条号截止2024年12月全量数据-Spider-文本 +/6003190001/ 4.78015863e-8 10 多语言-维基百科-Tok Pisin +/6002740001/ 1.16070562e-7 10 多语言-维基百科-Nāhuatl +/6001950001/ 4.60044882e-8 10 多语言-维基百科-Atikamekw +/6001160096/ 0.026818105 29 多语言-CommonCrawl-马来语 +/6000480095/ 2.259573e-8 10 多语言-维基百科-Bislama +/400152/ 0.023280952 39 中文-网页-百科优质词条外链站点数据(弱势学科) +/400650/ 0.143103541 107 中文-网页-vip网页库(推理高浓度) +/6000770096/ 0.027044638 30 多语言-CommonCrawl-加利西亚语 +/6000520096/ 0.026830243 30 多语言-CommonCrawl-缅甸语 +/6000730095/ 3.7514555e-8 10 多语言-维基百科-Fijian +/400144/ 0.000162475 10 中文-网页-医学-药品名和商品名 && 科学能力-xmol期刊资讯 +/400030/ 0.021575597 16 中文-网页-科技百家号 +/400026/ 0.005048637 10 中文-网页-综合新闻 +/6000030095/ 0.002705957 10 多语言-维基百科-Arabic +/6000640095/ 0.001697821 10 多语言-维基百科-Czech +/400198/ 0.036545755 27 中文-网页-低速源核心数据第二期 +/6000860095/ 0.001705734 10 多语言-维基百科-Hungarian +/400113/ 0.002695897 10 中文-网页-大学计算机数据46w +/400034/ 0.007476953 10 中文-网页-新闻 +/6000070095/ 0.002732014 10 多语言-维基百科-Portuguese +/6000460096/ 0.027628220 33 多语言-CommonCrawl-巴斯克语 +/6001930001/ 0.001712297 10 多语言-维基百科-Egyptian 
Arabic +/6001610095/ 2.2241634e-8 10 多语言-维基百科-Tahitian +/6001140096/ 0.027383413 34 多语言-CommonCrawl-马其顿语 +/400664/ 0.155170136 116 中文-书籍-中图豆瓣淘宝京东epub +/400645/ 0.057696071 43 中文-对话-健康医疗-多轮问诊对话 +/400112/ 0.004705202 10 中文-网页-LaWGPT法律开源数据 +/6000530095/ 0.001745842 10 多语言-维基百科-Catalan +/6000470096/ 0.027883990 36 多语言-CommonCrawl-白俄罗斯语 +/400022/ 0.000200493 10 中文-网页-党政 +/400189/ 0.048884509 36 中文-对话-贴吧v3-多轮对话-一般优质数据 +/300269/ 0.000001023 5 英文-对话-onepocket对话访谈 +/400666/ 0.003927789 10 中文-网页-剧本创作 +/300154/ 0.741774062 278 英文-学术-dolma_peS2o +/400183/ 0.000846748 10 中文-网页-健康医疗-医院、医生 +/400068/ 0.000668563 10 中文-书籍-中文古籍-3.59w +/400027/ 0.007461071 10 中文-网页-医疗 +/300015/ 0.085395505 59 英文-网页-CC-NEWS 网页库新闻数据 +/400060/ 0.027824244 20 中文-对话-贴吧v2 +/400730/ 0.012030753 90 中文-网页-vip网页库(弱势学科)_minhash0.7网页全局&局部去重 +/400149/ 0.010336823 15 中文-合成-符号替换symbol_substitution +/6000250096/ 0.027730615 39 多语言-CommonCrawl-泰卢固语 +/400184/ 0.004399811 10 中文-网页-健康医疗-其他数据 +/400114/ 0.000069139 10 中文-网页-地理数据 +/400230/ 0.000112892 10 中文-网页-93歌词数据 +/400684/ 1.038841914 779 中文(推理)-网页-vip&大搜推理数据2023.11-2024-04 +/6000200096/ 0.028003724 40 多语言-CommonCrawl-乌尔都语 +/300274/ 0.078405061 58 英文-书籍-FreeLaw 法律数据 +/6000670095/ 0.001773496 10 多语言-维基百科-Dutch +/6001260096/ 0.027893926 41 多语言-CommonCrawl-尼泊尔语 +/400023/ 0.002560072 10 中文-网页-金融 +/6000870096/ 0.028706252 43 多语言-CommonCrawl-冰岛语 +/6001370095/ 1.63769256e-8 10 多语言-维基百科-Pali +/6001170096/ 0.028387901 44 多语言-CommonCrawl-马拉雅拉姆语 +/400033/ 0.003183366 10 中文-网页-科技 +/400233/ 0.000010543 10 中文-专业创作-保险产品条款-301-pdf +/400096/ 0.041686120 31 中文-网页-新浪及简书博客文章 +/400032/ 0.002755136 10 中文-网页-人民网 +/401002/ 1.28e-7 10 中文-试题-政治考研(web) +/300270/ 0.246624685 30 英文-网页-wiki-多语言-英语 +/6000030096/ 0.045033413 79 多语言-CommonCrawl-阿拉伯语 +/300301/ 3.306503875 2000 英文-网页-commoncrawl纯文推理数据2013-2023 +/6000380096/ 0.029266659 54 多语言-CommonCrawl-亚美尼亚语 +/401001/ 2.14666667e-7 10 中文-试题-中医考研(web) +/6000350096/ 0.029836300 56 多语言-CommonCrawl-阿尔巴尼亚语 +/400028/ 0.003871893 10 中文-网页-3c、旅游 +/200009/ 
0.618969248 1196 平行语料-平行语料-翻译-opus数据集 +/400245/ 0.018641081 13 中文-网页-大搜—旅游数据 +/300347/ 0.542201090 255 英文-网页-RefinedWeb英文网站(推理中浓度)_minhash0.7网页全局&局部去重 +/6000980096/ 0.030103785 61 多语言-CommonCrawl-哈萨克语 +/400154/ 0.009107741 28 中文-网页-百科优质词条外链站点数据(推理中浓度) +/400220/ 0.000009362 10 中文-问答-金融财报人工精标问答数据 +/6001390095/ 0.001896546 10 多语言-维基百科-Polish +/400121/ 0.143819858 107 中文-网页-中文创作spider网页库 +/6001600095/ 0.001926623 10 多语言-维基百科-Swedish +/6000790096/ 0.030429894 65 多语言-CommonCrawl-格鲁吉亚语 +/400629/ 0.000176079 10 中文-网页-93歌词(第二批) +/400173/ 0.006682723 10 中文-合成-抽象符号推理-字符串处理-第一批 +/400020/ 0.020552609 15 中文-网页-小红书 +/400737/ 0.025183065 71 中文-网页-SE网页库9月例行(弱势学科)_minhash0.7网页全局&局部去重 +/6000060096/ 0.048800635 108 多语言-CommonCrawl-印度尼西亚语 +/6000430096/ 0.030468871 68 多语言-CommonCrawl-阿塞拜疆语 +/400159/ 0.040556361 30 中文-网页-SE网页库9月例行(推理高浓度) +/6000040095/ 0.003087481 10 多语言-维基百科-Spanish +/400177/ 0.000012448 10 中文-网页-上海城市法规全书 +/6001720095/ 0.001927410 10 多语言-维基百科-Ukrainian +/100024/ 0.000308844 10 代码-代码-各语言编程网站 +/100027/ 0.000865627 10 代码-代码-github证明题代码 +/6000140096/ 0.042191683 111 多语言-CommonCrawl-波斯语 +/400133/ 0.056244973 228 中文-合成-中文棋类内容(围棋、中国象棋、国际象棋) +/400682/ 0.048176924 36 中文-垂类-党政政策解读、分析相关数据 +/6009990096/ 0.032153579 87 多语言-CommonCrawl-其他 +/6000050095/ 0.003165565 10 多语言-维基百科-Russian +/6000070096/ 0.051388692 142 多语言-CommonCrawl-葡萄牙语 +/6000190095/ 0.002014799 10 多语言-维基百科-Italian +/6000260096/ 0.031766319 91 多语言-CommonCrawl-泰米尔语 +/6000130095/ 0.002756493 10 多语言-维基百科-Japanese +/400215/ 0.001864484 10 中文-问答-里屋社区数据集:知乎问答 +/6000220096/ 0.032028172 96 多语言-CommonCrawl-孟加拉语 +/6000090095/ 0.002745445 10 多语言-维基百科-French +/300344/ 0.029037864 34 英文-网页-Clueweb22CategoryB(推理中浓度)_minhash0.7网页全局&局部去重 +/6001080096/ 0.032613506 110 多语言-CommonCrawl-拉脱维亚语 +/6000670096/ 0.033514756 115 多语言-CommonCrawl-荷兰语 +/6000700096/ 0.033573350 116 多语言-CommonCrawl-世界语 +/400164/ 0.000101043 10 中文-网页-电商-京东商品数据 +/300348/ 0.141539919 530 英文-网页-RefinedWeb英文网站(推理低浓度)_minhash0.7网页全局&局部去重 +/6000100095/ 0.00284676 10 
多语言-维基百科-German +/300294/ 0.116943048 87 英文-网页-历史英文被过滤语料回捞数据-网页(Dolma CC、BAAI-MTP) +/300296/ 0.000004768 10 英文-网页-OpenNewsArchive 新闻数据集 +/6002130001/ 0.002161981 10 多语言-维基百科-Cebuano +/400741/ 0.021644925 107 中文-网页-悟道_minhash0.7网页全局&局部去重 +/400635/ 3.6982e-7 5 中文-对话-似是故人来第一季唱词10篇 +/400025/ 0.000283245 10 中文-网页-财经 +/6000090096/ 0.047730460 210 多语言-CommonCrawl-法语 +/6001540096/ 0.035161323 155 多语言-CommonCrawl-斯洛文尼亚语 +/400017/ 0.003022108 10 中文-网页-百度经验 +/6000230096/ 0.035196511 173 多语言-CommonCrawl-印地语 +/100023/ 0.002430262 10 代码-pytorch-使用pytorch框架的python代码 +/6000740096/ 0.036302397 190 多语言-CommonCrawl-芬兰语 +/400630/ 0.000366207 10 中文-网页-10万条药品说明书 +/400238/ 0.000706079 10 中文-网页-3个作文网 +/6001290096/ 0.036735728 202 多语言-CommonCrawl-挪威语 +/6000650096/ 0.036725799 203 多语言-CommonCrawl-丹麦语 +/400019/ 0.003125926 10 中文-网页-Job Description +/6001110096/ 0.036752247 208 多语言-CommonCrawl-立陶宛语 +/400728/ 0.309798380 232 中文-网页-vip网页库(推理中浓度)_minhash0.7网页全局&局部去重 +/400099/ 0.002636629 10 中文-网页-歌词、笑话、菜谱数据集 +/6000080096/ 0.048579735 296 多语言-CommonCrawl-泰语 +/6001480096/ 0.037829073 246 多语言-CommonCrawl-塞尔维亚语 +/400237/ 0.000061661 10 中文-网页-健康-药品说明书 +/300336/ 1.307952901 2000 英文-网页-CC-MAIN-英语-合并-202405_minhash0.7网页全局&局部去重 +/400690/ 0.014820299 11 中文-垂类-智源纯文数据集-农业 +/400694/ 0.121218750 90 中文-垂类-智源纯文数据集-教育 +/400687/ 0.032787332 46 中文-垂类-智源纯文数据集-医疗 +/6000120096/ 0.051928047 359 多语言-CommonCrawl-朝鲜语 +/400696/ 0.000932791 10 中文-垂类-国家法律法规数据库 +/400691/ 0.001886995 10 中文-垂类-国家、行业、企业标准等相关数据 +/400688/ 0.148094313 111 中文-垂类-金融相关数据-ACG +/6000530096/ 0.038919130 278 多语言-CommonCrawl-加泰罗尼亚语 +/400689/ 0.000013803 10 中文-垂类-44本金融行业重点书籍-ACG(纯文) +/6000630096/ 0.038588745 279 多语言-CommonCrawl-克罗地亚语 +/300306/ 0.001757989 10 英文-网页-CNN-DailyMail-newspaper-新闻摘要 +/300337/ 1 2000 英文-网页-commoncrawl_minhash0.7网页全局&局部去重 +/6001530096/ 0.038953692 298 多语言-CommonCrawl-斯洛伐克语 +/6001600096/ 0.039239266 306 多语言-CommonCrawl-瑞典语 +/300343/ 0.010931487 35 英文-网页-Clueweb22CategoryB(推理低浓度)_minhash0.7网页全局&局部去重 +/400729/ 0.143491965 538 
中文-网页-vip网页库(推理低浓度)_minhash0.7网页全局&局部去重 +/6000830096/ 0.039081741 315 多语言-CommonCrawl-希伯来语 +/6000510096/ 0.038850066 314 多语言-CommonCrawl-保加利亚语 +/400692/ 0.000476538 10 中文-书籍-【网络小说】ppd采购-2024Q3 +/400731/ 0.490618665 1051 中文-网页-VIP库例行生产_minhash0.7网页全局&局部去重 +/400634/ 0.000326967 10 中文-网页-1954年到2023年全国各省直辖市地级市政府报告 +/400735/ 0.464511214 1741 中文-网页-ext数据_minhash0.7网页全局&局部去重 +/400734/ 0.321270648 481 中文-网页-se数据_minhash0.7网页全局&局部去重 +/400732/ 0.243574094 182 中文-网页-VIP例行生产2024.03-04_minhash0.7网页全局&局部去重 +/400742/ 0.010588006 130 中文-网页-CC94份中文合并数据(简体)_minhash0.7网页全局&局部去重 +/400658/ 0.093877701 70 中文-网页-低速源核心数据第一期&第二期 +/400647/ 0.006078744 79 中文-网页-中文高点击网页库/高质量网页库(0616过滤) +/400668/ 0.005336899 69 中文-网页-CC94份中文合并数据(繁体) +/400362/ 0.013446240 19 中文-网页-OpenNewsArchive 新闻数据集 +/400663/ 0.017843859 13 中文-网页-中文网页-党政官媒类高质量站点抓取 +/400661/ 0.000288970 10 中文-网页-93歌词数据 +/400365/ 0.000031484 10 中文-网页-教案库数据 +/400190/ 0.080211077 60 中文-对话-贴吧v3-多轮对话-中等质量数据 +/6000190096/ 0.039903901 350 多语言-CommonCrawl-意大利语 +/400721/ 0.121492969 911 中文-书籍-百度小说(全量) +/400649/ 0.953990031 715 中文-专利-中国专利 +/400153/ 0.002920912 44 中文-网页-百科优质词条外链站点数据(推理低浓度) +/400672/ 0.078886848 59 中文-网页-小红书纯文2024.10.10存量数据 +/400642/ 0.002630645 41 中文-合成-写作要求指令增强-中文 +/400240/ 4.3802708e-8 10 中文-网页-化妆品、三品一械相关法规条例 +/6001390096/ 0.041045574 436 多语言-CommonCrawl-波兰语 +/6001720096/ 0.041580866 484 多语言-CommonCrawl-乌克兰语 +/6000100096/ 0.056805958 672 多语言-CommonCrawl-德语 +/400740/ 0.541951715 2000 中文-网页-dadu库数据_minhash0.7网页全局&局部去重 +/6000860096/ 0.042586047 543 多语言-CommonCrawl-匈牙利语 +/300131/ 1.376364399 1028 英文-网页-Dolma CC( 2020–05~2023–06)(推理高浓度) +/400171/ 0.028050290 42 中文-合成-抽象符号推理-编解码(非COT) +/400199/ 0.016500530 352 中文-网页-裁判文书网全量数据(截止2021年) +/400065/ 0.000660962 10 中文-网页-cot-裁判文书(上海高院) +/300338/ 0.168918041 920 英文-网页-DolmaCC(2020–05~2023–06)(弱势学科)_minhash0.7网页全局&局部去重 +/400163/ 0.007696080 187 中文-合成-百度搜索行为数据 +/6001410096/ 0.044887275 716 多语言-CommonCrawl-罗马尼亚语 +/6000800096/ 0.044028267 726 多语言-CommonCrawl-希腊语(现代,1453–) +/400693/ 0.000461641 10 
中文-网页-播客音频洗出对话数据-汉语 +/400695/ 0.001879579 49 中文-网页-里屋社区数据集MNBVC-CommonCrawl中清洗出来的通用文本数据 +/6000110096/ 0.06285045 1411 多语言-CommonCrawl-越南语 +/6000640096/ 0.047711892 1073 多语言-CommonCrawl-捷克语 +/400659/ 0.018200173 13 中文-书籍-zlibary (pdf解析)-简单版式 +/400739/ 0.001365763 45 中文-网页-SE网页库9月例行(推理中浓度)_minhash0.7网页全局&局部去重 +/400241/ 0.103746891 77 中文-网页-大搜-微信数据 +/6000050096/ 0.078562287 2000 多语言-CommonCrawl-俄语 +/6000040096/ 0.081746977 2000 多语言-CommonCrawl-西班牙语 +/400119/ 0.015833751 846 中文-网页-爱企查判决文书 +/6000130096/ 0.072749946 2000 多语言-CommonCrawl-日语 +/400242/ 0.140827638 105 中文-网页-大搜-知乎专栏数据 +/400132/ 0.011133470 727 中文-合成-中文牌类内容(斗地主、麻将、UNO) +/300288/ 0.212246929 79 英文-论文-mag-简单版式 +/400174/ 0.078907165 118 中文-合成-抽象符号推理-古典密码(非COT) +/400172/ 0.065238524 97 中文-合成-抽象符号推理-编解码(COT) +/400243/ 0.180635069 135 中文-网页-大搜—法律文书数据 +/400175/ 0.105579055 158 中文-合成-抽象符号推理-古典密码(COT) +/400738/ 0.000714383 71 中文-网页-SE网页库9月例行(推理低浓度)_minhash0.7网页全局&局部去重 +/400160/ 0.000155801 49 中文-网页-电商-ugc数据 +/300340/ 0.016854611 741 英文-网页-DolmaCC(2020–05~2023–06)(推理中浓度)_minhash0.7网页全局&局部去重 +/300286/ 0.105916101 149 英文-书籍-zlibary (pdf解析)-简单版式 +/300287/ 0.200661735 150 英文-书籍-archive-pdf-简单版式 +/300163/ 0.212951405 79 英文-论文-aminer-128w-简单版式 +/300339/ 0.005421938 990 英文-网页-DolmaCC(2020–05~2023–06)(推理低浓度)_minhash0.7网页全局&局部去重 +/400733/ 0.720390371 1080 中文-网页-大搜—vip2.0数据_minhash0.7网页全局&局部去重 +/300124/ 0.000005793 571 英文-网页-BAAI-MTP英文语义向量模型 BGE-1.7亿条 +/400148/ 5.68345553e-7 216 中文-网页-BAAI-MTP中文语义向量模型 BGE-1.1亿条 +/301044/ 0.000322066 10 代码-代码-code-problem-v2-codecademy +/301045/ 0.000659759 10 代码-代码-code-problem-v2-programiz +/301040/ 0.001326477 10 代码-代码-benchmark-instruction-evo-241212-合成 +/301046/ 0.002573786 10 代码-代码-代码知识点合成QA数据(CodeBench)-第四批 +/code-1/ 0 42 代码-代码-code-log-synthetic-250106 +/code-2/ 0 12 代码-代码-code-log-synthetic-250207 +/301028/ 0.036818530 10 代码-代码-code-problem-v2-exercism +/301043/ 0.000911915 10 代码-代码-code-problem-v2-coderbyte +/100166/ 0.001392763 10 代码-代码-SVG代码理解数据集 +/100167/ 0.000290147 10 
代码-代码-SVG代码HF合集-tu-berlin-svgs-纯文 +/300331/ 0.000180343 10 中文-网页-CK12数学-纯文(文档解析) +/400396/ 0.002094046 10 中文-网页-大学教材习题册171本-数学专项-文本 +/401056/ 0.004770410 10 中文-网页-华律网问答数据【文本对】【问答】【中文】【合作】【自然】 +/300334/ 0.000002737 10 中文-网页-CK12数学-纯文(视频数据纯文本改造) +/300335/ 0.009313413 10 中文-网页-CommonCrawl数学站点筛选【纯文】 +/401059/ 0.000778927 10 中文-网页-E考试网试题每日一练(资格考试为主)-文本对 +/301048/ 0.053228242 10 中文-试题-百度教育第一批精品试题格式优化-english-文本对 +/401060/ 0.003291972 10 中文-试题-百度教育第一批精品试题格式优化-politics-文本对 +/401061/ 0.005437544 10 中文-试题-百度教育第一批精品试题格式优化-geography-文本对 +/401062/ 0.082348446 10 中文-试题-百度教育第一批精品试题格式优化-chinese-文本对 +/401063/ 0.036789083 10 中文-试题-百度教育第一批精品试题格式优化-chemistry-文本对 +/401065/ 0.201537322 10 中文-试题-百度教育第一批精品试题格式优化-math-文本对 +/401066/ 0.017092634 10 中文-试题-百度教育第一批精品试题格式优化-history-文本对 +/401067/ 0.117661457 14 中文-试题-百度教育第一批精品试题格式优化- other-文本对 +/401068/ 0.015326292 10 中文-试题-百度教育第一批精品试题格式优化-politics2-文本对 +/401151/ 0.000659135 10 中文-试题-百度教育第一批精品试题格式优化-大学-文本对 +/401064/ 0.028558718 10 中文-试题-百度教育第一批精品试题格式优化-physics-文本对 +/301029/ 0.011923442 10 中文-试题-math.stackexchange试题-文本对 +/301037/ 0.091007017 11 中文-试题-brainly教育问答&试题数据第一批 +/301038/ 0.180398182 22 中文-试题-brainly教育问答&试题数据第二批 +/401051/ 0.142497603 17 中文-试题-K12理科试题202412(20250107试题更新)-数学专项-文本对 +/2025021800000001/ 0.291133333 250 中文-网页-创作类CPT-part-1 +/2025021800000002/ 2.217066666 2000 中文-网页-创作类CPT-part-2 +/2025021800000003/ 0.825066666 960 中文-网页-创作类CPT-part-3 +/200046/ 0.000204372 10 多语言-CommonCrawl-中翻混合语种句对-汉语+印尼-1453544(zhongwen_yini) +/200047/ 0.000943932 10 多语言-CommonCrawl-中翻混合语种句对-汉语+ja-348w(zh_ja) +/200048/ 0.000022979 10 多语言-CommonCrawl-中翻混合语种句对-汉语+han-10w(zh_han) +/200049/ 0.000051249 10 多语言-CommonCrawl-中翻混合语种句对-jazh+jiongrong-9w +/200050/ 0.000445868 10 多语言-CommonCrawl-中翻混合语种句对-en2th-114905 +/200051/ 0.000254730 10 多语言-CommonCrawl-中翻混合语种句对-波斯语+汉语(bosi_zh) +/200052/ 0.000223686 10 多语言-CommonCrawl-中翻混合语种句对-波斯语+汉语-1500000(bosi_zh) +/200053/ 0.001152494 10 多语言-CommonCrawl-中翻混合语种句对-越南语+汉语-400w(ViZh_400w) +/200054/ 0.000158281 10 
多语言-CommonCrawl-中翻混合语种句对-汉语+hi-61w(zh_hi) +/200055/ 0.001010838 10 多语言-CommonCrawl-中翻混合语种句对-汉语+韩语-507.4w(zhko) +/200056/ 0.005276874 10 多语言-CommonCrawl-中翻混合语种句对-汉语+meng-250w(han_meng) +/200057/ 0.000377929 10 多语言-CommonCrawl-中翻混合语种句对-缅甸语+汉语-120w(miandian_zhongwen) +/200058/ 0.000180456 10 多语言-CommonCrawl-中翻混合语种句对-汉语+高棉-50w(zhongwen_gaomian) +/200059/ 0.000575149 10 多语言-CommonCrawl-中翻混合语种句对-汉语+蒙-20w(zhong_meng) +/200060/ 0.000218231 10 多语言-CommonCrawl-中翻混合语种句对-汉语+老挝-61w(zhongwen_laowo) +/200061/ 0.001685358 10 多语言-CommonCrawl-中翻混合语种句对-汉语+日语(zh_jp) +/200062/ 0.000348243 10 多语言-CommonCrawl-中翻混合语种句对-汉语+蒙语(zh_waimeng) +/200063/ 0.000766032 10 多语言-CommonCrawl-中翻混合语种句对-汉语+韩语(zh_ko) +/200064/ 0.000110161 10 多语言-CommonCrawl-中翻混合语种句对-汉语+尼泊尔语(zh_nepal) +/200065/ 0.000260237 10 多语言-CommonCrawl-中翻混合语种句对-汉语+缅甸语(zh_mya) +/200067/ 0.000028358 10 多语言-CommonCrawl-中翻混合语种句对-汉语+印度尼西亚语(zh_ind) +/200068/ 0.000763076 10 多语言-CommonCrawl-中翻混合语种句对-汉语+德语(zh_deu) +/200069/ 0.000656297 10 多语言-CommonCrawl-中翻混合语种句对-汉语+越南语(zh_vie) +/200070/ 0.000081524 10 多语言-CommonCrawl-中翻混合语种句对-汉语+朝鲜语(zh_kor) +/200066/ 0.000304197 10 多语言-CommonCrawl-中翻混合语种句对-汉语+英语(zh_eng) +/6000040097/ 0.004690451 35 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000100097/ 0.004524982 33 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/300307/ 9.617887232 1999 英文-网页-FineWeb全量数据集 +/6000070097/ 0.000816024 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000090097/ 0.001257192 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000190097/ 0.001938434 14 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000510097/ 0.000010060 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000640097/ 0.007286504 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000670097/ 0.000075349 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000700097/ 0.000463792 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000740097/ 0.000264480 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000800097/ 0.000007447 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000860097/ 0.001060364 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6001110097/ 0.000036506 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6001180097/ 0.000015436 10 
英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6001390097/ 0.008755500 13 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6001540097/ 0.000402673 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6001600097/ 0.001056753 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6001410097/ 0.001735976 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6000650098/ 0.000006720 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/6001530097/ 0.000255724 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 +/200624/ 0.000002979 10 中文-网页-时政党史专业翻译语料-天津外国语大学-文本对 +/400725/ 0.014728966 11 中文-对话-【音频转纯文】喜马拉雅ASR数据(纯文) +/400393/ 0.025928793 19 中文-垂类-电力行业采购的4万+专业书籍-ACG-文本 +/401055/ 0.000050154 10 中文-试题-AnHuiDianxinZhidao(纯文-5.6万条安徽电信问答数据) +/400394/ 0.008479758 12 中文-专利-维普期刊12月增量50万+期刊论文-维普(文本) +/400395/ 0.000202317 10 中文-网页-剧本段子文案数据-pdd-纯文 +/400726/ 0.011834500 10 中文-对话-【音频转纯文】爱奇艺ASR数据(文本) +/300330/ 0.081723645 61 英文-网页-【音频转纯文】Youtube ASR数据(文本) +/300333/ 0.000570876 10 英文-网页-STEAM游戏信息数据-纯文 +/401149/ 0.003821260 10 中文-试题-百度教育第二批K12试题-教辅拆解、合作采买-文本对-有答案解析 +/401150/ 0.001438031 10 中文-试题-百度教育第二批K12试题-教辅拆解、合作采买-文本对-无答案解析 +/301041/ 0.000030120 10 英文-垂类-MedQuad-MedicalQnADataset医疗问答-医疗开源Benchmark +/301042/ 0.002053802 10 英文-问答-Math ematics数学问答数据 +/301047/ 0.078382249 10 英文-试题-K12理科专项合成试题20250210-英文-文本对 +/401057/ 0.532893024 66 中文-试题-K12理科专项合成试题20250210-中文-文本对 +/400744/ 0.020501561 10 中文-问答-数学书籍知识点合成数据-中文 +/10000100010011/ 0.005288875 10 英文-试题-chegg数学试题20250216第1.3批220万题-文本对 +/10000100010012/ 0.008434328 10 英文-试题-chegg数学试题20250216第1.2批350万题-文本对 +/10000100010013/ 0.004758764 10 英文-试题-chegg数学试题20250216第1.1批200万题-文本对 +/10000100010015/ 0.021962204 10 英文-试题-chegg数学试题20250221第二批1000万题-文本对 +/10000100010016/ 0.020773026 10 英文-试题-【多模转纯文】 chegg图文试题转纯文 +/10000100010021/ 0.049392972 10 英文-试题-chegg数学试题20250224第三批1000万题-文本对 +/10000100010025/ 0 10 英文-试题-chegg图文数学试题转纯文20250216、20250221、20250224共三批2800万题-文本对 +/10000100010020/ 0.031240055 10 英文-问答-Quora 1亿问答20250219第二批-文本对 +/10000100010017/ 0.015085636 10 英文-问答-Quora 1亿问答20250219-文本对 +/301050/ 0.001224881 10 英文-问答-medical-qa-datasets医疗问答-医疗开源Benchmark +/400746/ 0.000098480 10 
中文-网页-【生物数据】prnasia 美通社 +/400750/ 0.000004216 10 中文-网页-【说服力】【视频转纯文】主持人大赛 +/401152/ 0.000002475 10 中文-问答-弱智吧-HuggingFace-文本对 +/10000100010032/ 0.211483136 79 英文-问答-Quora 1亿问答20250219第三批-文本对 +/301051/ 0.002882570 10 英文-问答-CUAD法律合同数据-法律开源Benchmark +/10000100000009/ 0.000216301 10 英文-网页-【生物数据】biospace +/300350/ 0.000080930 10 英文-网页-【生物数据】ascopub +/10000100000004/ 0.084811013 63 英文-问答-红石REDSTONE-Open Question Aswering -文本 +/10000100000002/ 0.000038184 10 英文-网页-【音频转纯文】TED纯文数据 +/10000200000003/ 5.76293333333333e-7 10 英文-问答-红石REDSTONE-Open Question Aswering -文本 +/301056/ 0.001646013 10 中文-问答-baike_qa2019(问答150 万个)数据 +/400743/ 0.000017539 10 中文-网页-【说服力】沟通课程与文案-文档 +/400748/ 0.000066221 10 中文-网页-金融政策数据20250214-ACG-文本 +/10000200000004/ 0.005043936 10 中文-对话-MEG电话客服数据转纯文角色对话数据 +/10000200000006/ 0.007739220 10 中文-对话 -MEG-CRM电话销售数据转纯文角色对话数据 +/10000200000008/ 0.019111735 12 中文-网页-大搜金融站点数据20250214-ACG-wenda +/10000200000010/ 0.497842698 1066 中文-网页-百家号2025.02存量数据-文本 +/400234/ 0.000450864 10 中文-问答-NL2SQL_开源数据 +/400639/ 0.000307161 10 中文-合成-NL2SQL_合成数据 +/400745/ 0.079470070 119 中文-书籍-113万图书-中文在线(文本)-epub +/400747/ 0.010073886 15 中文-书籍-【音频转纯文】懒人听书数据第一批(ASR数据)(文本) +/400749/ 0.017416038 26 中文-书籍-113万图书-中文在线(文本)-txt +/401148/ 0.000324319 10 中文-问答-token字数训练语料-信息处理专项-文本 +/10000200000005/ 0.019837646 29 中文(推理)-网页-vip&大搜数学数据2023.11-2024-04【Loop0】 +/300351/ 0.002377099 10 英文-书籍-【说服力】说服力书籍第二批-pdf文档 +/400399/ 0.006603937 10 中文-书籍-【说服力】说服力书籍第二批-pdf文档 +/10000100010006/ 0.003533202 10 英文-问答-BBH专项提升数据-dyck_languages-文本对 +/10000100010028/ 0.001203027 10 英文(推理)-试题-BBH专项提升数据-word_sorting-文本对 +/10000200000007/ 0.002033263 10 英文-书籍-【说服力】说服力书籍第二批-epub文档 +/10000100000001/ 0.045815276 17 英文-网页-红石REDSTONE-MATH-文本 +/10000100010014/ 0.000665171 10 英文-试题-REDSTONE-MultiChoiceQuestion-文本对 +/10000200000001/ 0.000003050 10 英文(推理)-网页-红石REDSTONE-MATH-文本 +/10000100010024/ 0.034897821 13 英文(推理)-试题-英文数学问答题答案解析生成第一批 +/10000200020016/ 0.020399428 10 中文(推理)-试题-中文数学问答题答案解析生成第一批 +/10000100000005/ 0.007139503 10 
英文(推理)-网页-fineweb数学网页数据【Loop0】 +/10000100010034/ 0.062262077 23 英文-试题-【benchmark反查】huggingface开源学科数据集 +/10000100010008/ 0 10 英文-问答-BBH专项提升数据-salient_translation_error_detection-文本对 +/10000200020010/ 0 10 英文-合成-BBH专项提升数据-logic_deduction-文本对 +/10000100010027/ 0 10 英文(推理)-试题-BBH专项提升数据-ruin_names-文本对 +/10000100010029/ 0.000125882 10 英文(推理)-试题-BBH专项提升数据-geometric_shapes-文本对 +/10000100020007/ 0 10 英文(推理)-试题-BBH专项提升数据-snarks-文本对 +/400398/ 0.002114105 10 英文-问答-K12&特殊教育文档-深圳教育云-纯文 +/401153/ 0.003500047 10 中文-试题-鑫创职业资格试题-文本对 +/401155/ 0.040426706 10 中文-试题-百度教育精品试题2月份例行更新-文本对 +/10000200020015/ 0.053691484 10 中文(推理)-试题-鑫创K12中文试题-文本对 +/10000100010022/ 0.00032586 10 英文(推理)-试题-aopsonline数学竞赛-文本对 +/10000100010023/ 0.000095751 10 英文(推理)-网页-benchmark反查网站覆盖:web2.0calc.com/questions/ +/10000100000013/ 0.000883960 10 英文-问答-gauthmath所有学科knowledge-文本 +/10000100010033/ 0.045247112 10 英文-试题-凤凰智媒quizlet英文试题-文本对 +/10000100000014/ 0.005132933 10 英文-网页-原始Common Crawl数据数学站点筛选网页【2023第一批】 +/301049/ 0.000032657 10 英文-试题-AMC/AIME/BMO/IMO 试题-文本对 +/401058/ 0.000010356 10 中文-问答-脑筋急转弯1万题0212-推理专项-文本对 +/10000100010018/ 0 36 英文(推理)-试题-K12理科专项合成试题20250227-文本对(英文) +/10000100010019/ 0.017926272 10 英文(推理)-试题-brainly试题20250222-20250226期间增量-文本对 +/10000200020012/ 0.000032824 10 中文(推理)-试题-33iq智力题第二批-本文对 +/10000200020013/ 0.000030966 10 中文-试题-2025年考研数学真题-试题 +/numiamath_query/ 0.003930693 10 英文-试题-Numiamath query 改写试题数据第一批 +/10000100000006/ 0.003247139 10 英文-试题-弱势学科textbook增强合成数据-anatomy +/10000100000007/ 0.013114467 10 英文-试题-弱势学科textbook增强合成数据-professional_accounting +/10000100000008/ 0.009572875 10 英文-试题-弱势学科textbook增强合成数据-formal_logic +/10000100010030/ 0.042406978 10 英文-试题-brainly试题20250227-20250302期间增量-文本对 +/10000100010031/ 0.029642469 10 英文-试题-澳鹏homework.study英文试题-文本对 +/10000100000015/ 0.000394343 10 英文-书籍-【生物数据】business wire +/10000100000016/ 0.011167630 10 英文-书籍-【说服力】说服力书籍第三批-epub文档 +/10000100000017/ 0.036646001 10 英文-书籍-【说服力】说服力书籍第三批-pdf文档 +/10000100000018/ 0.85233422 213 英文-网页-【benchmark反查】huggingface.co 
math-ai/AutoMathText数据集 +/10000100010035/ 0.001839838 10 英文-问答-MedicalQA-医疗开源Benchmark +/10000100010036/ 0.011245594 10 英文-问答-【benchmark反查】huggingface.co math-ai/StackMathQA数据集 +/10000100010037/ 0.000048483 10 英文-问答-MedQA-USMLE-4-options医疗考试选择题-医疗开源Benchmark +/10000100010038/ 0.000573528 10 英文-问答-MedMCQA选择题-医疗开源Benchmark +/10000100010039/ 0.005300162 10 英文-问答-medical-question-answering-datasets医疗问答-医疗开源Benchmark +/10000100010040/ 0.000385748 10 英文-问答-PubMedQA-医疗开源Benchmark +/10000100010041/ 0.277890436 34 英文-试题-K12理科专项合成英文试题20250227(数据迭代更新)-文本对 +/10000200000011/ 0.000055442 10 中文-网页-【说服力】【音视频转纯文】销售培训课程 +/10000200000012/ 0.000007219 10 中文-网页-中国科普博览文物数据-纯文 +/10000200000013/ 0.000300687 10 中文-网页-【说服力】【音视频转纯文】沟通课程 +/10000200000014/ 0.000119291 10 中文-网页-咨询课程文档 +/10000200000015/ 0.004665145 10 中文-网页-【acg-行业数据】金融研报数据20250214-ACG-文档 +/10000200000016/ 0.007079626 10 中文-网页-【说服力】说服力书籍第三批-epub文档 +/10000200000017/ 0.009367401 10 中文-网页-【说服力】说服力书籍第三批-pdf文档 +/10000100010043/ 0.015168132 10 英文-试题-brainly试题20250303-20250310期间美国站增量-文本对 +/10000100010044/ 0.028362376 10 英文-试题-chegg20250303第4.1批STEM试题706w题-文本对 +/10000100010045/ 0.047063267 10 英文-试题-chegg图文数学试题转纯文第二批图转文144w-文本对 +/10000100010046/ 0.016908574 10 英文-试题-chegg图文数学试题转纯文第一批50w数据-fix +/10000100010048/ 0.020887173 10 英文-试题-开源数学QA数据_英文_不带模板 +/10000100020008/ 0.000001064 10 英文-试题-【补】AMC/AIME/BMO/IMO 试题-文本对 +/10000200000020/ 0.001733574 10 中文-书籍-计算机书籍(中文)(源400049) +/10000200020019/ 0.018817498 10 中文-试题-开源数学QA数据_中文_不带模板 +/10000100000020/ 0.089288913 22 英文-网页-CommonCrawl数学站点筛选网页【纯文】 +/10000100000021/ 0.002020114 10 英文-网页-【说服力】kialo英文辩论网站 +/10000100000023/ 2.873354966 431 英文-网页-【benchmark反查】simpleQA反查长尾站点召回CC数据-文本 +/10000100000022/ 0.001690478 10 英文-网页-【benchmark反查】chem.libretexts.org-文档文本 +/10000100000024/ 0.311246095 77 英文-网页-InfiMM-WebMath-40B-文本 +/10000100000025/ 0.149771971 449 英文-网页-【benchmark反查】pmc.ncbi.nlm.nih.gov网站覆盖—纯文文本 +/10000100010049/ 0.125617411 10 英文-试题-chegg20250303第4.2批STEM试题-文本对 +/10000100010050/ 0.090095504 12 
代码-代码-code-instruction-v2.1 +/10000100010051/ 0.018196497 10 英文-试题-brainly试题20250305-20250311期间印度站增量-文本对 +/10000100010052/ 0.069538147 10 英文-试题-20250312-g4o图转文第三批(包含多图)-文本对 +/10000200020020/ 0.013792681 10 中文-试题-【百度教育】多模转纯文(答案图转文)-数学(第一批fix2) +/10000200020021/ 0.000655053 10 中文-试题-【百度教育】多模转纯文(答案图转文)-生物(第一批fix) +/10000200020023/ 0.003159375 10 中文-试题-【百度教育】多模转纯文(答案图转文)-化学(第一批fix) +/10000200020024/ 0.011188260 10 中文-试题-【百度教育】多模转纯文(答案图转文)-物理(第一批fix) +/10000200020025/ 0 10 代码-代码-code-log-synthetic-250207-update-250313 +/10000200020026/ 0 21 代码-代码-code-log-synthetic-250106-update-250313 +/10000100000026/ 0.178253971 26 英文-网页-Finemath-e5模型筛选CommonCrawl数学数据【2023-50】 +/10000100000027/ 0.004469794 10 英文-网页-【benchmark反查】simpleQA评测集anwser来源url网页覆盖 +/10000100010053/ 0.153438946 10 英文-试题-chegg20250303第4.3批STEM试题-文本对 +/10000100010054/ 0.006074558 10 英文-网页-【benchmark反查】socratic.org网站覆盖 +/10000100010055/ 0.036150889 10 英文-试题-20250312-g4o图转文第四批(包含多图)-文本对 +/10000200000023/ 0.000021881 10 中文-网页-【说服力】【音视频转纯文】pdd演讲与口才课程音视频 +/10000200000024/ 0.001989468 10 中文-网页-B站优质UP主视频字幕转纯文数据【第一批】 +/10000200000025/ 0.102165515 30 中文-论文-维普50w论文-2024Q3 +/10000200020027/ 0.102444928 10 代码-代码-code-log-synthetic-250207-update-250315 +/10000200020028/ 0.363919886 38 代码-代码-code-log-synthetic-250106-update-250315 +/10000200000021/ 0.043793891 10 中文-百科-抖音百科第一批存量词条-spider-文本 +/10000200000022/ 0.001558578 10 中文-网页-【说服力】【音视频转纯文】【电商】抖音强说服力口播文案-说服要素增强 +/2025031210001/ 0.002210112 10 英文-试题-AMPS query 改写试题数据第一批 +/10000100010056/ 0.000319968 10 英文-试题-crackap网页抓取数据-第一批 +/2025031190001/ 0.000335608 10 英文-试题-crackap网页抓取数据-第二批 +/bbh-fewshot-79/ 0.000003251 10 英文-试题-bbh-fewshot-79条 +/math-related-7/ 0.666666666 25 英文-网页-MATH相关7个开源数据集 +/chegg-g4o-part5/ 0.1 15 英文-试题-20250312-g4o图转文第五批(包含多图)-文本对 +/10000100000028/ 0.001087514 10 英文-网页-【benchmark反查】simpleQA评测集g4o+RAG来源url网页覆盖 diff --git a/examples/pre-training/ernie/pretrain_auto.py b/examples/pre-training/ernie/pretrain_auto.py new file mode 100644 index 00000000..1772797b 
# Patch header that preceded this file in the enclosing diff (kept verbatim):
# --- /dev/null +++ b/examples/pre-training/ernie/pretrain_auto.py @@ -0,0 +1,459 @@

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
import json
import numpy as np
from functools import partial
import random
import paddle
import paddle.distributed.fleet as fleet
from src.utils import logger
from paddleformers.datasets import MapDataset
from paddleformers.trainer import (
    PdArgumentParser,
    get_last_checkpoint,
)
from src.tokenizers.tokenization_eb_v2 import ErnieBotTokenizer
from omegaconf.listconfig import ListConfig
from omegaconf.dictconfig import DictConfig
from src.callbacks import (
    ProgreesiveBatchingCallback,
    DataTraceCallbackAuto,
    GlobalRNGCallback,
)
from models.ernie import (
    ErnieForCausalLMAuto,
    ErnieForCausalLMAutoPP,
)
from models.ernie_moe.configuration import (
    ErnieConfig,
    ErnieMoEConfig,
)
from src.datasets import PretrainTask
from src.datasets.pretrain_task import parse_data_weight
from src.trainers import AutoPretrainingTrainer, AutoPreTrainingArguments
from src.utils import (
    setup_logger_output_file,
)
from src.utils.data_utils import merge_fn_group_batch
from src.utils.misc import global_training_logs

from config import get_config

try:
    from paddleformers.trainer.trainer_utils import log_trainer_start
except ImportError:

    def log_trainer_start():
        """Log that the training main process started.

        Fallback used when ``paddleformers`` does not provide
        ``log_trainer_start``. The ``MAIN_PROCESS_STARTED`` environment
        variable is set after the first call so the message is logged once.
        """
        if "MAIN_PROCESS_STARTED" not in os.environ:
            start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            logger.info(
                f"The Training Main Process Started Successfully. time: {start_time}, pid: {os.getpid()}"
            )
            os.environ["MAIN_PROCESS_STARTED"] = "1"


log_trainer_start()


try:
    from paddle.distributed.fleet import monitor_perf as collective_perf
except ImportError:
    from paddle.distributed.fleet import collective_perf


assert paddle.version.mkl() == "OFF", (
    "MKL is not supported"
    " in this version. Please set -DWITH_MKL=OFF when compiling PaddlePaddle."
)


def update_model_config_from_args(config: ErnieConfig, model_args: dict):
    """Override attributes of ``config`` with matching entries of ``model_args``.

    Only keys that already exist as attributes on ``config`` are applied;
    every other key is silently ignored.

    Args:
        config (ErnieConfig): Model configuration; modified in place.
        model_args (dict): Candidate ``{attribute_name: value}`` overrides.

    Returns:
        ErnieConfig: The same ``config`` object that was passed in.
    """
    for k, v in model_args.items():
        if hasattr(config, k):
            logger.info(f"update model config: {k} = {v}")
            setattr(config, k, v)
    return config


def init_parameter(model):
    """Call ``initialize()`` on every parameter of ``model``.

    NOTE(review): presumably this materializes parameters that were created
    lazily under ``paddle.LazyGuard`` in ``main`` -- confirm.
    """
    for param in model.parameters():
        param.initialize()


def main():
    """Entry point: parse the config, build model and datasets, train/evaluate."""
    config = get_config(verbose=True)
    os.makedirs(config.model_args.output_dir, exist_ok=True)
    parser = PdArgumentParser(AutoPreTrainingArguments)
    if not hasattr(config.trainer_args, "pipeline_parallel_config"):
        config.trainer_args.pipeline_parallel_config = ""

    if "enable_dp_comm_overlap" in config.trainer_args.pipeline_parallel_config:
        logger.warning(
            "Pipeline dp_comm_overlap and FusedLinearWithGradAdd can not be used at "
            "the same time."
        )

    if "enable_timer" in config.trainer_args.pipeline_parallel_config:
        from paddle.distributed.fleet.meta_parallel.pipeline_parallel import (
            PipelineParallel,
        )

        # Replace the pipeline timer printer with a no-op.
        PipelineParallel.timer_printer = lambda _: None

    def formatv(v):
        """Convert omegaconf containers to plain ``list`` / ``dict``."""
        if isinstance(v, ListConfig):
            return list(v)
        elif isinstance(v, DictConfig):
            return dict(v)
        return v

    model_args = {k: formatv(v) for k, v in dict(config.model_args).items()}
    trainer_args = {k: formatv(v) for k, v in dict(config.trainer_args).items()}
    (args,) = parser.parse_dict(dict(**model_args, **trainer_args))

    if args.strategy.pipeline.enable and args.virtual_pp_degree > 1:
        pipeline = args.strategy.pipeline
        pipeline.vpp_degree = args.virtual_pp_degree
        pipeline.vpp_seg_method = args.virtual_pipeline_seg_method

    if args.modality_ratio is not None:
        # Compute the total once; it is needed for both derived values.
        ratio_total = sum(args.modality_ratio)
        args.modality_interleave = (
            ratio_total
            if args.modality_interleave == "acc"
            else ratio_total * args.gradient_accumulation_steps
        )
        args.modality_ratio = [i / ratio_total for i in args.modality_ratio]

    args.use_moe = dict(**dict(config.model_args), **dict(config.trainer_args)).get(
        "use_moe", False
    )
    model_config = dict(getattr(config.model_args, "model_config", {}))
    model_config = {k: formatv(v) for k, v in model_config.items()}
    logger.info(f"model_config_from_yaml: {json.dumps(model_config, indent=4)}")
    setup_logger_output_file(config.model_args.output_dir, args.local_rank)
    paddle.set_device(args.device)

    np.random.seed(args.seed)
    random.seed(args.seed)
    paddle.seed(args.seed)

    prop = paddle.device.cuda.get_device_properties()
    if prop.total_memory < args.pre_alloc_memory * 1024 * 1024 * 1024:
        logger.warning(
            "Invalid value for `pre_alloc_memory`, so pre-allocating just failed."
        )
    elif args.pre_alloc_memory > 0:
        logger.warning(
            f"pre-allocating a tensor whose memory capacity is {args.pre_alloc_memory} GB "
            "and then release it."
        )
        memory_size = int(args.pre_alloc_memory * 1024 * 1024 * 1024)
        x = paddle.empty([memory_size], dtype=paddle.uint8)
        del x

    # Communication smoke test; deliberately best-effort, failures only warn.
    try:
        collective_perf(
            "allgather",
            round=50,
            size_and_time={67108864: 0.00625, 234881024: 0.02, 637534208: 0.057},
        )
        logger.info("======monitor allgather done!=======\n")
        collective_perf(
            "allreduce",
            round=50,
            size_and_time={67108864: 0.02, 134217728: 0.038, 268435456: 0.075},
        )
        logger.info("======monitor allreduce done!=======\n")
    except Exception as e:
        logger.warning("fleet test unexcepted error! skip exception[{}]...".format(e))

    # Detecting last checkpoint.
    last_checkpoint = None
    if (
        os.path.isdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(args.output_dir)
        if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Define the metrics of tasks.
    def compute_metrics(p):
        """Return ``nll_loss``, ``ppl`` and ``num_token`` for an eval prediction.

        ``tokenizer`` is resolved from the enclosing scope at call time; it is
        assigned further down in ``main`` before the trainer is built.
        """
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

        output = paddle.to_tensor(preds)
        labels = paddle.to_tensor(p.label_ids)
        output = [t.astype("float32").cuda() for t in output]
        labels = [t[t != tokenizer.ignored_index] for t in labels]
        labels = [t.cuda() for t in labels]
        flat_labels = paddle.concat(labels, 0)
        all_numel = (flat_labels != tokenizer.ignored_index).astype("int64").sum()
        # -100 is excluded in addition to `tokenizer.ignored_index`.
        ignored = (flat_labels == -100).astype("int64").sum()
        labels = all_numel - ignored
        output = sum(output)
        logger.info(f"output : {output.item()}, labels : {labels.item()}")
        nll_loss = output / (labels + 1.0e-6)  # nll_loss is global loss
        ppl = paddle.exp(nll_loss)

        return {
            "nll_loss": nll_loss.item(),
            "ppl": ppl.item(),
            "num_token": labels.item(),
        }

    # model
    dtype = "float32"
    if args.fp16 and args.fp16_opt_level == "O2":
        paddle.set_default_dtype("float16")
        dtype = "float16"
    elif args.bf16:
        paddle.set_default_dtype("bfloat16")
        dtype = "bfloat16"

    if args.use_moe:
        # Rebinds the module-level name so the MoE config class is used below.
        global ErnieConfig, ErnieForCausalLMAuto
        ErnieConfig = ErnieMoEConfig

    # NOTE(review): nesting inferred from the whitespace-collapsed source;
    # confirm this check is not meant to live inside `if args.use_moe:`.
    if args.moe_group.lower() in {"mp", "tp", "model", "dummy"}:
        logger.info(f"disable moe flag when using moe-group={args.moe_group}")
        args.use_moe = False

    cfg = ErnieConfig.from_pretrained(args.model_name_or_path)
    cfg = update_model_config_from_args(cfg, model_config)
    cfg.seqlen = args.max_seq_length
    cfg.fp16_opt_level = args.fp16_opt_level
    cfg.moe_group = args.moe_group
    cfg.dtype = dtype
    cfg.pipeline_parallel_degree = args.pipeline_parallel_degree
    cfg.virtual_pp_degree = args.virtual_pp_degree
    if args.tensor_parallel_degree > 1:
        cfg.sequence_parallel = args.sequence_parallel
        cfg.tensor_parallel_degree = max(
            fleet.get_hybrid_communicate_group().get_model_parallel_world_size(), 1
        )
        cfg.tensor_parallel_rank = max(
            fleet.get_hybrid_communicate_group().get_model_parallel_rank(), 0
        )
    else:
        cfg.sequence_parallel = False
        cfg.tensor_parallel_degree = 1
        cfg.tensor_parallel_rank = 0

    tokenizer = ErnieBotTokenizer.from_pretrained(args.tokenizer_name)
    tokenizer.ignored_index = cfg.ignored_index
    logger.info(
        f"using tokenizer={type(tokenizer)}, bos:{tokenizer.bos_token_id} "
        f"eos:{tokenizer.eos_token_id} pad:{tokenizer.pad_token_id} "
    )
    image_preprocess = None  # set if `vision_model_name_or_path is not None`

    if args.model_type == "ernie":
        model_class = ErnieForCausalLMAuto
    elif args.model_type == "ernie_pp":
        model_class = ErnieForCausalLMAutoPP
    else:
        raise ValueError(f"not support model_type: {args.model_type}")

    if args.from_scratch:
        with paddle.LazyGuard():
            model = model_class(cfg)
    else:
        with paddle.LazyGuard():
            model = model_class.from_pretrained(
                args.model_name_or_path,
                config=cfg,
            )

    if image_preprocess is not None:
        model.add_image_preprocess(image_preprocess)

    cfg = model.config
    logger.info(f"using model type:{type(model)}")
    paddle.set_default_dtype("float32")

    logger.info(f"using model={type(model)}, cfg={cfg}")

    freeze_config = set(args.freeze_config.split(" "))
    if "freeze_vision" in freeze_config and hasattr(model, "freeze_vision"):
        logger.info("Freeze model vision module")
        model.freeze_vision()

    # data
    logger.info("loading data...")
    train_file_list, data_weights = parse_data_weight(
        args.data_weights, args.data_filelist
    )

    max_seq_length = args.max_seq_length

    # Bound up front so the trainer construction below never hits an unbound
    # name when `do_train` is disabled.
    train_dataset = None
    if args.do_train:
        assert (
            args.max_seq_length // args.base_seq_length >= 1
            and args.max_seq_length % args.base_seq_length == 0
        )
        if args.combine_batch > 1:
            logger.info(
                f"max seq length is larger than base_seq_length, use combine batch: {args.combine_batch}"
            )
            assert (
                args.use_train_part_sharding
            ), "not `use_train_part_sharding` is not supported when using `combine_batch`"
            assert (
                args.num_consecutive // args.combine_batch >= 1
                and args.num_consecutive % args.combine_batch == 0
            ), "num_consecutive must be a multiple of max_seq_length / base_seq_length"
            assert (
                args.data_weights
            ), "no `data_weights` is not supported when using `combine_batch`"
            # NOTE(review): placement inside the `combine_batch > 1` branch is
            # inferred from the whitespace-collapsed source -- confirm.
            max_seq_length = args.base_seq_length
        # NOTE(review): nesting under `do_train` is inferred -- confirm.
        if args.need_data:
            if args.multimodal:
                # Explicit raise: an `assert False` would vanish under `-O`.
                raise NotImplementedError("Do not support multimodal!")
            pretrain_task = PretrainTask(train_file_list, tokenizer)
            train_dataset = pretrain_task.train_data(
                max_seq_length + 1,
                stride=max_seq_length,
                rng=random.Random(args.seed),
                weights=data_weights,
                evaluate=False,
                seed=args.seed,
                num_consecutive=args.num_consecutive,
                shuffle=not args.no_part_shuffle,
                combine_batch=args.combine_batch,
                load_process_num=args.data_load_process_num,
            )
            train_dataset.load(
                use_shard=args.use_train_part_sharding,
                dp_rank=args.reeao_dataset_rank,
                dp_size=args.reeao_dataset_world_size,
            )
            train_dataset = MapDataset(train_dataset)
        else:
            # "mp" is the tensor(model)-parallel rank, "pp" the pipeline rank.
            logger.info(
                f"mp_{args.tensor_parallel_rank}_pp{args.pipeline_parallel_rank} no data needed, "
                "skip init train_dataset"
            )
            train_dataset = None

    if args.do_eval:
        eval_dataset = PretrainTask(
            [[args.dev_data]],
            tokenizer,
            max_seq_len=max_seq_length,
        ).train_data(
            max_seq_length + 1,
            stride=max_seq_length,
            overlap_len=32,
            rng=random.Random(0),
            evaluate=True,
            shuffle=False,
        )
        eval_dataset.load(False, dp_rank=0, dp_size=1)
        eval_dataset = MapDataset(eval_dataset)
    else:
        eval_dataset = None

    data_collator = partial(
        merge_fn_group_batch,
        tokenizer,
        pad_to_max_seqlen=args.max_seq_length,
        combine_batch=args.combine_batch,
        image_dtype="uint8",
    )
    callbacks = [DataTraceCallbackAuto()] if not args.use_dummy_dataset else []
    callbacks += [GlobalRNGCallback()]

    if args.batch_size_warmup_steps:
        progreesive_batcing_callback = ProgreesiveBatchingCallback(
            args.gradient_accumulation_steps,
            args.max_gradient_accumulation_steps,
            args.batch_size_warmup_steps,
            args.batch_size_warmup_increment,
        )
        callbacks.append(progreesive_batcing_callback)

    init_parameter(model)
    model.apply(model.init_weights)
    trainer = AutoPretrainingTrainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )
    global_training_logs.accumulate = args.gradient_accumulation_steps
    # An explicit `resume_from_checkpoint` wins over an auto-detected one.
    checkpoint = None
    if args.resume_from_checkpoint is not None:
        checkpoint = args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    if args.do_train:
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model(args.output_dir)
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluate and tests model
    if args.do_eval:
        eval_metrics = trainer.evaluate()
        trainer.log_metrics("eval", eval_metrics)


if __name__ == "__main__":
    main()

# Patch header for the next file in the enclosing diff (kept verbatim):
# diff --git a/examples/pre-training/ernie/src/callbacks/__init__.py b/examples/pre-training/ernie/src/callbacks/__init__.py
# index 15b0cb9f..51a31a22 100644
# --- a/examples/pre-training/ernie/src/callbacks/__init__.py
# +++ b/examples/pre-training/ernie/src/callbacks/__init__.py
# @@ -12,14 +12,22 @@
# # See the License for the specific language governing permissions and
# # limitations under the License.
+import logging + +from .tensorboard_callback import TensorBoardCallback + from .gc_callback import GCCallback +from .progressive_batching_callback import ProgreesiveBatchingCallback from .logging_callback import LoggingCallback +from .stopper_callback import StopperCallback +from .adaptivegradclip_callback import ClipGradByAdaptiveNormCallback + from .moe_correction_bias_adjust_callback import MoECorrectionBiasAdjustCallback from .moe_logging_callback import GlobalRNGCallback, MoeLoggingCallback from .sp_grad_sync_callback import SPGradSyncCallback -from .tensorboard_callback import TensorBoardCallback from .fp8_quant_weight_callback import FP8QuantWeightCallback from .ortho_loss_callback import OrthogonalCallback +from .data_trace_callback import DataTraceCallback, DataTraceCallbackAuto __all__ = [ "TensorBoardCallback", @@ -31,4 +39,9 @@ "MoECorrectionBiasAdjustCallback", "FP8QuantWeightCallback", "OrthogonalCallback", + "ClipGradByAdaptiveNormCallback", + "StopperCallback", + "ProgreesiveBatchingCallback", + "DataTraceCallbackAuto", + "DataTraceCallback", ] diff --git a/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py b/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py new file mode 100644 index 00000000..f05e4500 --- /dev/null +++ b/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py @@ -0,0 +1,122 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class ClipGradByAdaptiveNormCallback(TrainerCallback):
    """Restore and persist the state of the optimizer's adaptive-norm gradient clipper.

    The clipper is read from ``optimizer._grad_clip``. Both hooks are no-ops
    when the optimizer has no clipper or when the clipper exposes no
    ``state_dict`` method. The state is stored next to the checkpoint as
    ``adaptivenorm_clip_state[_<process_index>].pth``.
    """

    @staticmethod
    def _state_file_name(args):
        """Return the state file name: per-process when ``world_size > 1``, shared otherwise."""
        if args.world_size > 1:
            return f"adaptivenorm_clip_state_{args.process_index}.pth"
        return "adaptivenorm_clip_state.pth"

    def on_train_begin(self, args, state, control, **kwargs):
        """Load the clipper state from the checkpoint being resumed, if one exists.

        Raises:
            ValueError: ``args.resume_from_checkpoint`` is ``True`` but no
                checkpoint can be found in ``args.output_dir``.
        """
        optimizer = kwargs.get("optimizer", None)
        assert optimizer is not None
        if optimizer._grad_clip is None:
            logger.info("grad_clip is None.")
            return
        if not hasattr(optimizer._grad_clip, "state_dict"):
            # The original message lacked the f-prefix and printed the
            # placeholder text literally.
            logger.info(
                f"grad_clip {optimizer._grad_clip} has not state_dict method."
            )
            return

        if args.adaptive_norm_force_clear_state:
            # Deliberately start from a fresh clipper state.
            logger.info("force clear ClipGradByAdaptiveNorm state dict.")
            return

        # Normalise every falsy value (None, "", False) to None.
        resume_from_checkpoint = args.resume_from_checkpoint or None
        # A literal True means "use the most recent checkpoint in output_dir".
        if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
            resume_from_checkpoint = get_last_checkpoint(args.output_dir)
            if resume_from_checkpoint is None:
                raise ValueError(
                    f"No valid checkpoint found in output directory ({args.output_dir})"
                )

        if resume_from_checkpoint is None:
            return

        path = os.path.join(resume_from_checkpoint, self._state_file_name(args))
        if not os.path.isfile(path):
            # A missing file is not fatal: training continues with a fresh state.
            if args.world_size > 1:
                logger.info(
                    f"Didn't find an adaptivenorm clip state file for process {args.process_index}, if you are resuming "
                    "a training that wasn't launched in a distributed fashion, reproducibility is not guaranteed."
                )
            else:
                logger.info(
                    "Didn't find an adaptivenorm clip state file, if you are resuming a training that was "
                    "launched in a distributed fashion, reproducibility is not guaranteed."
                )
            return

        logger.info(f"Loading adaptivenorm clip state from {path}")
        state_dict = paddle.load(path)

        optimizer._grad_clip.set_state_dict(state_dict)
        logger.info("load ClipGradByAdaptiveNorm state dict success.")

    def on_save(self, args, state, control, **kwargs):
        """Write the clipper state into the checkpoint folder of the current global step."""
        optimizer = kwargs.get("optimizer", None)
        assert optimizer is not None

        if optimizer._grad_clip is None or not hasattr(
            optimizer._grad_clip, "state_dict"
        ):
            return

        # Same folder layout as the trainer's own checkpoints.
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        output_dir = os.path.join(args.output_dir, checkpoint_folder)
        os.makedirs(output_dir, exist_ok=True)

        # In distributed runs each process writes its own file, keyed by the
        # global process index.
        path = os.path.join(output_dir, self._state_file_name(args))
        logger.info(f"Saving adaptivenorm clip state to {path}")
        paddle.save(optimizer._grad_clip.state_dict(), path)
class DataTraceCallback(TrainerCallback):
    """Trainer callback that records per-data-part consumption ("data status").

    All bookkeeping lives in ``state.trial_params`` under the keys
    ``data_status``, ``saved_data_status``, ``last_start_data_status``,
    ``consumed_samples`` and ``global_shuffle_seed``.
    """

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Initialise the data-status bookkeeping and position the batch sampler.

        Ranks with ``args.need_data`` unset only take part in the broadcast of
        ``self.data_status_shape`` and return.
        """

        if args.custom_data_status:
            # Take trial_params from an externally supplied TrainerState JSON file.
            custom_trainer_state = TrainerState.load_from_json(args.custom_data_status)
            logger.info(f"load custom data status from {args.custom_data_status}")
            state.trial_params = custom_trainer_state.trial_params

        if not args.need_data:
            # Placeholder tensor that receives the value broadcast from rank 0.
            self.data_status_shape = paddle.zeros([1], dtype="int32")
            if dist.is_initialized():
                logger.info("broadcast data trace callback hook")
                # Pairs with the broadcast issued at the end of this method
                # by ranks that do load data.
                dist.broadcast(self.data_status_shape, 0)
            return
        batch_sampler = kwargs["train_dataloader"].batch_sampler

        if state.trial_params is None:
            state.trial_params = {}

        # Fill in any bookkeeping entry that is missing, with one counter
        # per data part.
        if "saved_data_status" not in state.trial_params:
            state.trial_params["saved_data_status"] = [
                0 for _ in range(batch_sampler.max_part_id + 1)
            ]

        if "last_start_data_status" not in state.trial_params:
            state.trial_params["last_start_data_status"] = [
                0 for _ in state.trial_params["saved_data_status"]
            ]

        if "consumed_samples" not in state.trial_params:
            state.trial_params["consumed_samples"] = sum(
                state.trial_params["saved_data_status"]
            )
        if "global_shuffle_seed" not in state.trial_params:
            state.trial_params["global_shuffle_seed"] = 0

        if not args.same_data:
            # The data changed: restart from the saved status, reset the
            # consumed-sample count and bump the shuffle seed.
            state.trial_params["last_start_data_status"] = state.trial_params[
                "saved_data_status"
            ]
            state.trial_params["consumed_samples"] = 0
            state.trial_params["global_shuffle_seed"] = (
                state.trial_params["global_shuffle_seed"] + 1
            )

            logger.debug(
                f"Update global_shuffle_seed to {state.trial_params['global_shuffle_seed']}"
            )
            logger.debug(
                "Due to changes in the underlying data (ratio, number of files, number of dp), \
                the index needs to be rebuilt by resetting the consumed_samplers to 0."
            )

        if not args.ignore_data_skip:
            # Skip already-consumed data: the sampler loads the data status
            # and the consumed-sample count.
            batch_sampler.load_data_status(
                state.trial_params["last_start_data_status"],
                state.trial_params["global_shuffle_seed"],
            )
            batch_sampler.set_epoch(0, state.trial_params["consumed_samples"])
        else:
            # Discard all recorded progress and start from zeroed counters.
            state.trial_params["consumed_samples"] = 0
            state.trial_params["saved_data_status"] = [
                0 for _ in range(batch_sampler.max_part_id + 1)
            ]
            state.trial_params["last_start_data_status"] = [
                0 for _ in range(batch_sampler.max_part_id + 1)
            ]
            batch_sampler.load_data_status(
                state.trial_params["last_start_data_status"],
                state.trial_params["global_shuffle_seed"],
            )
            batch_sampler.set_epoch(0, state.trial_params["consumed_samples"])
            logger.info("Ignore data skipping and data status")

        # Per-part counters accumulated since the last save. Sized to cover
        # both the sampler's parts and any previously saved status.
        state.trial_params["data_status"] = [
            0
            for _ in range(
                max(
                    batch_sampler.max_part_id + 1,
                    len(state.trial_params["saved_data_status"]),
                )
            )
        ]
        self.data_status_shape = paddle.to_tensor(
            len(state.trial_params["data_status"]), dtype="int32"
        )
        if dist.is_initialized():
            logger.info("broadcast data trace callback hook")
            dist.broadcast(self.data_status_shape, 0)

    def on_load_data_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        inputs,
        **kwargs,
    ):
        """Increment the counter of every part id listed in ``inputs["src_id"]``."""

        if not args.need_data:
            return
        for part_id in inputs["src_id"]:
            state.trial_params["data_status"][part_id] += 1

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """When a save is due, reduce the counters across ranks and fold them into the saved totals."""

        if not args.need_data:
            if (
                args.use_hybrid_parallel
                and control.should_save
                and dist.is_initialized()
                and args.pp_need_data_degree
                and args.pipeline_parallel_degree > 1
            ):
                _hcg = fleet.get_hybrid_communicate_group()
                # Contribute zeros so this rank joins the pipeline-group
                # all_reduce issued below by ranks that do load data.
                data_status = paddle.zeros(
                    [self.data_status_shape.item()], dtype="int64"
                )
                dist.all_reduce(data_status, group=_hcg.get_pipe_parallel_group())
                return
            return

        if control.should_save:
            data_status = paddle.to_tensor(
                state.trial_params["data_status"], dtype="int64"
            )
            if dist.is_initialized():
                if args.use_hybrid_parallel:
                    _hcg = fleet.get_hybrid_communicate_group()
                    # all_reduce across the data-parallel ranks
                    if args.data_parallel_degree > 1:
                        dist.all_reduce(
                            data_status, group=_hcg.get_data_parallel_group()
                        )
                    if args.sharding_parallel_degree > 1:
                        dist.all_reduce(
                            data_status, group=_hcg.get_sharding_parallel_group()
                        )
                    if args.pp_need_data_degree and args.pipeline_parallel_degree > 1:
                        dist.all_reduce(
                            data_status, group=_hcg.get_pipe_parallel_group()
                        )
                else:
                    dist.all_reduce(data_status)  # + group
            logger.debug("All reduced `data_status`")

            _saved_data_status = np.array(state.trial_params["saved_data_status"])
            if len(data_status) > len(_saved_data_status):
                # There are more data parts now than when the status was
                # saved (max_part_id grew): zero-pad the saved status.
                _saved_data_status = np.append(
                    _saved_data_status,
                    np.zeros(
                        [
                            len(data_status) - len(_saved_data_status),
                        ],
                        dtype="int64",
                    ),
                )

            state.trial_params["saved_data_status"] = (
                data_status.numpy() + _saved_data_status
            ).tolist()
            state.trial_params["consumed_samples"] += sum(data_status.tolist())

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Zero the since-last-save counters once a checkpoint has been written."""

        if not args.need_data:
            return
        state.trial_params["data_status"] = [
            0 for _ in range(len(state.trial_params["data_status"]))
        ]


class DataTraceCallbackAuto(DataTraceCallback):
    """Variant of ``DataTraceCallback`` that reads part ids from ``inputs["input_ids"][3]``."""

    def on_load_data_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        inputs,
        **kwargs,
    ):
        """Increment the counter of every part id found at index 3 of ``inputs["input_ids"]``."""

        if not args.need_data:
            return
        for part_id in inputs["input_ids"][3]:  # src_id
            state.trial_params["data_status"][part_id] += 1
def progressive_accumulate_steps(
    acc_step_begin, acc_step_end, warmup_global_steps, increment, step
):
    """Return the gradient-accumulation step count to use at global step ``step``.

    Before ``warmup_global_steps`` the count is interpolated linearly from
    ``acc_step_begin`` towards ``acc_step_end``, truncated to an integer and
    then rounded up to a multiple of ``increment``. From
    ``warmup_global_steps`` onwards it is ``acc_step_end``.
    """
    assert step >= 0, step
    if step < warmup_global_steps:
        growth_per_step = (acc_step_end - acc_step_begin) / warmup_global_steps
        interpolated = int(growth_per_step * step + acc_step_begin)
        # Round up to the next multiple of `increment`.
        return int(np.ceil(interpolated / increment) * increment)
    return acc_step_end


class ProgreesiveBatchingCallback(TrainerCallback):
    """Adjust ``args.gradient_accumulation_steps`` along a warm-up schedule."""

    def __init__(self, acc_step_bigin, acc_step_end, warmup_global_steps, increment):
        self.acc_step_bigin = acc_step_bigin
        self.acc_step_end = acc_step_end
        self.warmup_global_steps = warmup_global_steps
        self.increment = increment

    def _sync_accumulation_steps(self, args, state):
        """Set ``args.gradient_accumulation_steps`` to the scheduled value for the current step."""
        scheduled = progressive_accumulate_steps(
            self.acc_step_bigin,
            self.acc_step_end,
            self.warmup_global_steps,
            self.increment,
            state.global_step,
        )
        if scheduled == args.gradient_accumulation_steps:
            return
        logger.info(
            f"updating acc_step{args.gradient_accumulation_steps}->{scheduled}, global_step={state.global_step}"
        )
        args.gradient_accumulation_steps = scheduled

    def on_train_begin(self, args, state, control, **kwargs):
        """Apply the schedule once before the first step."""
        self._sync_accumulation_steps(args, state)

    def on_step_end(self, args, state, control, **kwargs):
        """Re-apply the schedule after every optimisation step."""
        self._sync_accumulation_steps(args, state)
class StopperCallback(TrainerCallback):
    """Request a training stop as soon as a sentinel file exists on disk.

    Args:
        stop_file: Path polled after every sub-step. Defaults to
            ``/root/stop``, the location that was previously hard-coded.
    """

    def __init__(self, stop_file="/root/stop"):
        super().__init__()
        self.stop_file = stop_file

    def on_substep_end(self, args, state, control, **kwargs):
        """Set ``control.should_training_stop`` when the sentinel file is present."""
        if os.path.exists(self.stop_file):
            control.should_training_stop = True
class ClipGradByAdaptiveNorm(ClipGradBase):
    """Per-parameter gradient clipping against a running average of each gradient norm.

    While ``steps <= start_clip_steps`` (warm-up), gradients are scaled by
    ``1 / (global_norm + epsilon)`` whenever that factor is below 1, and the
    smallest norm seen for each parameter is kept as its reference.
    Afterwards a gradient whose norm exceeds ``clip_ratio`` times the
    reference is scaled down to that bound, and the reference is updated as
    ``beta * reference + (1 - beta) * norm``.

    Args:
        clip_ratio: Allowed ratio between the current norm and the reference.
        start_clip_steps: Number of warm-up steps; must be >= 0.
        beta: Decay factor of the running reference norm.
        epsilon: Added to denominators to avoid division by zero.
        shard_clip: When True, norms are not synchronised across ranks.
        enable_record: Record ``[norm, reference]`` per step in the state.
        enable_record_clip_history: Record ``[before, after]`` for each clip.
        verbose: Emit additional log lines.

    NOTE(review): ``_dygraph_clip`` reads ``self.sharding_stage1_v2``,
    ``self.num_params`` and ``self.pname_to_paramindex``, none of which are
    set in this class. They are presumably attached by the trainer or
    optimizer wrapper; confirm against the caller.
    """

    def __init__(
        self,
        clip_ratio=1.03,
        start_clip_steps=100,
        beta=0.98,
        epsilon=1e-8,
        shard_clip=False,
        enable_record=False,
        enable_record_clip_history=False,
        verbose=False,
    ):
        super().__init__()
        self.clip_ratio = clip_ratio
        self.beta = beta
        self.epsilon = epsilon
        # Per-parameter state, keyed by parameter name.
        self.state = defaultdict(dict)
        self.start_clip_steps = start_clip_steps
        self.shard_clip = shard_clip
        self.enable_record = enable_record
        self.steps = 0
        self.enable_record_clip_history = enable_record_clip_history
        self.verbose = verbose
        # Attributes stored in and restored from the state dict.
        self.keys = [
            "clip_ratio",
            "beta",
            "epsilon",
            "start_clip_steps",
            "shard_clip",
            "enable_record",
            "steps",
            "enable_record_clip_history",
        ]

        if start_clip_steps < 0:
            raise ValueError(
                "start_clip_steps {}, please start_clip_steps >= 0.".format(
                    start_clip_steps
                )
            )

    def __str__(self):
        # Adjacent literals instead of a backslash continuation, which
        # embedded the source indentation in the returned text.
        return (
            "ClipGradByAdaptiveNorm, clip_ratio={}, beta={}, start_clip_steps={}, "
            "shard_clip={}, enable_record={}"
        ).format(
            self.clip_ratio,
            self.beta,
            self.start_clip_steps,
            self.shard_clip,
            self.enable_record,
        )

    def clip_by_norm(self, param, grad, norm_value, global_norm):
        """Clip ``grad`` in place and update the running state for ``param``.

        Args:
            param: Parameter owning the gradient; ``param.name`` keys the state.
            grad: Gradient tensor, scaled in place when clipping applies.
            norm_value: L2 norm of ``grad``.
            global_norm: Global gradient norm. Only read during warm-up.

        Returns:
            The (possibly scaled) ``grad``.
        """
        state = self.state[param.name]

        if "norm_value" not in state:
            state["norm_value"] = norm_value

        if "clip_times" not in state:
            state["clip_times"] = 0

        if self.enable_record_clip_history:
            if "clip_history" not in state:
                state["clip_history"] = {}

        avg_norm_value = state["norm_value"]

        if self.enable_record:
            if "norm_history" not in state:
                state["norm_history"] = {}
            state["norm_history"][self.steps] = [
                float(norm_value),
                float(avg_norm_value),
            ]

        if self.steps <= self.start_clip_steps:
            # Warm-up: clip to a global norm of 1 and keep the smallest
            # per-parameter norm seen so far as the reference.
            clip_coeff = 1.0 / (global_norm + self.epsilon)
            if clip_coeff < 1.0:
                grad.multiply_(clip_coeff)
                param._reset_grad_inplace_version(True)

            if norm_value < state["norm_value"]:
                state["norm_value"] = norm_value
        else:
            if norm_value > self.clip_ratio * avg_norm_value:
                # Scale the gradient down to clip_ratio * reference.
                coef = (self.clip_ratio * avg_norm_value) / (norm_value + self.epsilon)
                grad.multiply_(coef)
                param._reset_grad_inplace_version(True)
                norm_value_old = norm_value
                norm_value = self.clip_ratio * avg_norm_value
                state["clip_times"] = state["clip_times"] + 1
                if self.enable_record_clip_history:
                    state["clip_history"][self.steps] = [
                        float(norm_value_old),
                        float(norm_value),
                    ]
                if self.verbose:
                    logger.info(
                        "{} gradclip {} times, clip from {} to {}".format(
                            param.name,
                            state["clip_times"],
                            float(norm_value_old),
                            float(norm_value),
                        )
                    )

                logger.info(
                    "{} steps {}, gradclip {} times, clip_ratio {}, clip from {} to {}".format(
                        param.name,
                        self.steps,
                        state["clip_times"],
                        self.clip_ratio,
                        float(norm_value_old),
                        float(norm_value),
                    )
                )
            # Update the reference with the norm after any clipping.
            state["norm_value"] = avg_norm_value * self.beta + norm_value * (
                1.0 - self.beta
            )

        return grad

    @paddle.no_grad()
    def _dygraph_clip(self, params_grads):
        """Clip every gradient in ``params_grads`` and advance the step counter.

        Args:
            params_grads: Iterable of ``(param, grad)`` pairs. Pairs whose
                grad is ``None`` are dropped. Pairs whose param has
                ``need_clip`` set to False are returned unchanged.

        Returns:
            List of ``(param, grad)`` pairs after clipping.
        """
        global_norm_tensor = None
        if self.steps <= self.start_clip_steps:
            # Warm-up needs the global norm across the model-parallel,
            # pipeline and sharding groups.
            hcg = fleet.get_hybrid_communicate_group()
            mp_size = hcg.get_model_parallel_world_size()
            mp_group = hcg.get_model_parallel_group()
            pp_size = hcg.get_pipe_parallel_world_size()
            pp_group = hcg.get_pipe_parallel_group()
            sharding_size = hcg.get_sharding_parallel_world_size()
            sharding_group = hcg.get_sharding_parallel_group()

            norm_squared_values = []
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, "need_clip", True) is False:
                    continue
                norm_squared_value = _squared_l2_norm(g)
                if not p.is_distributed and mp_size > 1:
                    # Pre-divide so that the sum over the model-parallel
                    # group counts this parameter once.
                    norm_squared_value = norm_squared_value / mp_size
                norm_squared_values.append(norm_squared_value)

            global_norm_squared_tensor = paddle.stack(norm_squared_values).sum()

            if mp_size > 1:
                dist.all_reduce(global_norm_squared_tensor, group=mp_group)
            if pp_size > 1:
                dist.all_reduce(global_norm_squared_tensor, group=pp_group)
            if sharding_size > 1:
                dist.all_reduce(global_norm_squared_tensor, group=sharding_group)
            global_norm_tensor = paddle.sqrt(global_norm_squared_tensor)

        if self.verbose and global_norm_tensor is not None:
            logger.info(
                "step: {}, global norm: {}".format(
                    self.steps, float(global_norm_tensor)
                )
            )

        if hasattr(self, "sharding_stage1_v2") and self.sharding_stage1_v2:
            need_sync = False
            if not self.shard_clip:
                hcg = fleet.get_hybrid_communicate_group()
                mp_size = hcg.get_model_parallel_world_size()
                mp_group = hcg.get_model_parallel_group()
                sharding_size = hcg.get_sharding_parallel_world_size()
                sharding_group = hcg.get_sharding_parallel_group()
                if mp_size > 1 or sharding_size > 1:
                    need_sync = True

            # One slot per parameter index. Slots never written stay zero.
            norm_squared_values = [
                paddle.zeros([1], dtype=params_grads[0][1].dtype)
                for _ in range(self.num_params)
            ]

            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, "need_clip", True) is False:
                    continue
                norm_squared_value = _squared_l2_norm(g)
                if need_sync and not p.is_distributed:
                    norm_squared_values[self.pname_to_paramindex[p.name]] = (
                        1 / mp_size
                    ) * norm_squared_value
                else:
                    norm_squared_values[self.pname_to_paramindex[p.name]] = (
                        norm_squared_value
                    )

            num_has_grad = len(norm_squared_values)
            norm_squared_tensor = paddle.concat(norm_squared_values, axis=0)
            if need_sync:
                if mp_size > 1:
                    dist.all_reduce(norm_squared_tensor, group=mp_group)
                if sharding_size > 1:
                    dist.all_reduce(norm_squared_tensor, group=sharding_group)

            norm_tensor = paddle.sqrt(norm_squared_tensor)
            norm_values = paddle.split(norm_tensor, num_has_grad, axis=0)

            params_and_grads = []
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, "need_clip", True) is False:
                    params_and_grads.append((p, g))
                    continue
                new_grad = self.clip_by_norm(
                    p,
                    g,
                    norm_values[self.pname_to_paramindex[p.name]],
                    global_norm_tensor,
                )
                params_and_grads.append((p, new_grad))
        else:
            need_sync = False
            if not self.shard_clip:
                hcg = fleet.get_hybrid_communicate_group()
                mp_size = hcg.get_model_parallel_world_size()
                mp_group = hcg.get_model_parallel_group()
                if mp_size > 1:
                    need_sync = True

            norm_squared_values = []
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, "need_clip", True) is False:
                    continue
                norm_squared_value = _squared_l2_norm(g)
                if need_sync and not p.is_distributed:
                    norm_squared_values.append((1 / mp_size) * norm_squared_value)
                else:
                    norm_squared_values.append(norm_squared_value)

            num_has_grad = len(norm_squared_values)
            norm_squared_tensor = paddle.concat(norm_squared_values, axis=0)
            if need_sync:
                dist.all_reduce(norm_squared_tensor, group=mp_group)

            norm_tensor = paddle.sqrt(norm_squared_tensor)
            norm_values = paddle.split(norm_tensor, num_has_grad, axis=0)

            params_and_grads = []
            # idx follows the order in which the norms were appended above.
            idx = 0
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, "need_clip", True) is False:
                    params_and_grads.append((p, g))
                    continue
                new_grad = self.clip_by_norm(p, g, norm_values[idx], global_norm_tensor)
                params_and_grads.append((p, new_grad))
                idx += 1

        self.steps += 1
        return params_and_grads

    @framework.dygraph_only
    def state_dict(self):
        """Return the per-parameter state merged with the attributes listed in ``self.keys``."""
        state_dict = dict(self.state)
        for key in self.keys:
            state_dict[key] = self.__dict__[key]
        return state_dict

    @framework.dygraph_only
    def set_state_dict(self, state_dict):
        """Restore the attributes and per-parameter state written by ``state_dict``.

        ``None`` or an empty mapping is logged and otherwise ignored.
        Previously ``len(state_dict)`` ran before the ``None`` check, so
        passing ``None`` raised ``TypeError``.
        """
        if not state_dict:
            logger.info("state_dict is empty, please check if it is right.")
            return

        for key in self.keys:
            if key in state_dict:
                self.__dict__[key] = state_dict[key]
            else:
                logger.info("Can't find [ {} ] in state_dict".format(key))

        # Every remaining key is per-parameter state.
        for k in state_dict:
            if k in self.keys:
                continue
            self.state[k] = copy.deepcopy(state_dict[k])
+ +"""pretraining task +""" + +from .dist_data_loader import DistDataLoader, DistDataLoaderAuto +from .pretrain_task import ExampleSet, ExampleSetSingleDataSource, PretrainTask diff --git a/examples/pre-training/ernie/src/datasets/dist_data_loader.py b/examples/pre-training/ernie/src/datasets/dist_data_loader.py new file mode 100644 index 00000000..846ad593 --- /dev/null +++ b/examples/pre-training/ernie/src/datasets/dist_data_loader.py @@ -0,0 +1,598 @@ +# -*- coding: utf-8 -*- +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +DistDataLoader is a wrapper of paddle.io.DataLoader. +It is used to support hybrid parallelism. +It can replace paddle.io.DataLoader in most cases. 
class DistDataLoader(paddle.io.DataLoader):
    """Data loader that reads a batch on data-owning ranks and broadcasts it.

    Ranks created with ``need_data=True`` pull batches from an inner
    ``paddle.io.DataLoader``. The batch is then broadcast inside the
    model-parallel group and, when enabled, inside a pipeline data group, so
    ranks that do not read data still receive the same fields.
    """

    # Fields read with ``data[key]``; they must be present and int64.
    _REQUIRED_KEYS = ("input_ids", "labels", "data_id", "src_id")
    # Fields read with ``data.get(key)``; they may be absent.
    _OPTIONAL_KEYS = (
        "data_type",
        "images",
        "token_type_ids",
        "image_type_ids",
        "audio_input_ids",
        "audio_labels",
        "grid_thw",
        "inbatch_pack_offset",
        "position_ids",
        "log_prob",
    )
    # NOTE(review): ``log_prob`` is read and broadcast but was never part of the
    # returned batch in the original code. That behavior is preserved here;
    # confirm whether it should be returned.
    _RETURNED_KEYS = _REQUIRED_KEYS + tuple(
        k for k in _OPTIONAL_KEYS if k != "log_prob"
    )

    def __init__(
        self,
        dataset,
        feed_list=None,
        places=None,
        return_list=True,
        batch_sampler=None,
        batch_size=1,
        shuffle=False,
        drop_last=False,
        collate_fn=None,
        num_workers=0,
        use_buffer_reader=True,
        prefetch_factor=2,
        use_shared_memory=True,
        timeout=0,
        worker_init_fn=None,
        persistent_workers=False,
        need_data=True,
        pp_broadcast=True,
        need_magic_trans=False,
    ):
        """Build the loader and the communication groups used for broadcasting.

        Args:
            dataset: Dataset to read from; ``None`` substitutes an empty
                ``DummyDataset`` with a matching batch sampler.
            feed_list .. persistent_workers: Forwarded positionally to the
                inner ``paddle.io.DataLoader`` when ``need_data`` is true.
            need_data: Whether this rank reads batches itself.
            pp_broadcast: Whether to create the pipeline data group when the
                pipeline-parallel world size is greater than one.
            need_magic_trans: When true, the pipeline data group also includes
                the second-to-last rank of each pipe group.
        """
        if dataset is None:
            dataset = DummyDataset()
            batch_sampler = DistributedBatchSampler(dataset, 1)
            log.info("rank has no data, use Dummpy dataset")
        super().__init__(
            dataset=dataset,
            batch_sampler=batch_sampler,
            collate_fn=collate_fn,
            num_workers=num_workers,
        )
        # Must be set before _init_dataloader_comm_group(), which reads it.
        self.need_magic_trans = need_magic_trans
        self._hcg = fleet.get_hybrid_communicate_group()

        # init pp data comm group
        if self._hcg.get_pipe_parallel_world_size() > 1 and pp_broadcast:
            self._pp_data_group = self._init_dataloader_comm_group()
        else:
            log.info("skip pp broadcast")
            self._pp_data_group = None

        # tensor parallel message
        self.mp_rank = self._hcg.get_model_parallel_rank()
        self.mp_group = self._hcg.get_model_parallel_group()
        self.mp_src_rank = self._hcg.get_model_parallel_group_src_rank()

        self.pp_rank = self._hcg.get_stage_id()
        self.dp_rank = self._hcg.get_data_parallel_rank()
        sharding_rank = self._hcg.get_sharding_parallel_rank()
        self._need_data = need_data
        if self._need_data:
            self._dataloder = paddle.io.DataLoader(
                dataset,
                feed_list,
                places,
                return_list,
                batch_sampler,
                batch_size,
                shuffle,
                drop_last,
                collate_fn,
                num_workers,
                use_buffer_reader,
                prefetch_factor,
                use_shared_memory,
                timeout,
                worker_init_fn,
                persistent_workers,
            )
            # The iterator is created on first use (see _dataloder_iter).
            self._lazy_dataloader_iter = None
        else:
            log.info(
                "mp{}_pp{}_sharding{}_dp{} no data needed, "
                "skip init dataloader.".format(
                    self.mp_rank, self.pp_rank, sharding_rank, self.dp_rank
                )
            )

    @property
    def _dataloder_iter(self):
        """Return the inner loader's iterator, creating it on first access."""
        if self._lazy_dataloader_iter is None:
            self._lazy_dataloader_iter = iter(self._dataloder)
        return self._lazy_dataloader_iter

    def __len__(self):
        """Return the loader length; raise ``ValueError`` on data-less ranks."""
        if self._need_data:
            return super().__len__()
        else:
            raise ValueError(
                "raise error for `paddlenlp.trainer.trainer_utils.has_length`"
            )

    def _init_dataloader_comm_group(self):
        """Create one group per pipe group and return the one holding this rank.

        Each group holds the first and last rank of a pipe group, plus the
        second-to-last rank when ``need_magic_trans`` is set. Returns ``None``
        if the current rank is in none of them.
        """
        topo = self._hcg._topo
        parallel_comm_group = None
        parallel_groups = topo.get_comm_list("pipe")

        for group in parallel_groups:
            # only first rank and last rank
            if self.need_magic_trans:
                assert (
                    len(group) > 2
                ), f"magic_trans need ranks in group greater than 2, but get {len(group)}"
                ranks = [group[0], group[-2], group[-1]]
            else:
                ranks = [group[0], group[-1]]
            comm_group = paddle.distributed.new_group(ranks=ranks)
            if paddle.distributed.get_rank() in ranks:
                parallel_comm_group = comm_group
        return parallel_comm_group

    def __iter__(self):
        return self

    def __next__(self):
        """Read (or receive) one batch and return it as an ``OrderedDict``.

        Required keys are always present in the result. Optional keys are
        dropped when their value is ``None``.
        """
        all_keys = self._REQUIRED_KEYS + self._OPTIONAL_KEYS

        timers = get_timers()
        if timers:
            timers("read-raw-data").start()
        if self._need_data:
            # {'input_ids': int64, 'labels': int64, 'data_id': int64}
            data = next(self._dataloder_iter)
            if "data_not_valid" in data:
                global_training_logs.update(
                    data_not_valid=data["data_not_valid"].astype("float32").mean()
                )
            fields = [data[k] for k in self._REQUIRED_KEYS]
            fields += [data.get(k, None) for k in self._OPTIONAL_KEYS]
            id_dtypes = [t.dtype for t in fields[: len(self._REQUIRED_KEYS)]]
            assert set(id_dtypes) == {paddle.int64}, (
                f"Distloader requires dtype == `int64`, " f"got:{id_dtypes}"
            )
        else:
            fields = [None] * len(all_keys)
        timers = get_timers()
        if timers:
            timers("read-raw-data").stop()

        # broadcast data
        pp_broadcast = (self._pp_data_group is None) or self.pp_rank == 0
        if self.mp_group is not None and self.mp_group.nranks > 1 and pp_broadcast:
            fields = broadcast_data_obj(fields, self.mp_src_rank, self.mp_group)

        if self._pp_data_group is not None and self._pp_data_group.nranks > 1:
            # NOTE(shenliang03): in last stage in pp, we don't need input_ids and data_id.
            # But it's only for paddle-new_model_7 compatible upgrade. It will remove in future.
            fields = broadcast_data_obj(
                fields, self._pp_data_group.ranks[0], self._pp_data_group
            )

        batch = dict(zip(all_keys, fields))
        if VOCAB_SIZE is not None:
            vocab_size = int(VOCAB_SIZE)
            for key in ("input_ids", "labels"):
                if batch[key] is not None:
                    batch[key] %= vocab_size

        to_return = OrderedDict()
        for key in self._RETURNED_KEYS:
            value = batch[key]
            if value is None and key in self._OPTIONAL_KEYS:
                continue
            to_return[key] = value
        return to_return
# Tea.chen congmin broadcast
def broadcast_data_obj(data, src_rank, group):
    """Broadcast a nested structure of tensors (or ``None``) from ``src_rank``.

    The source rank first broadcasts a template describing the dtype and shape
    of every leaf. Leaves are then grouped by dtype, concatenated into flat
    buffers in chunks, broadcast, and split back on the receiving ranks.

    Args:
        data: Nested structure whose leaves are tensors or ``None``. Only the
            source rank's content is used; other ranks just need something
            that ``flatten`` accepts.
        src_rank: Rank whose data is broadcast.
        group: Communication group used for every broadcast.

    Returns:
        On the source rank, ``data`` unchanged. On other ranks, a structure
        packed like the source's template holding the received tensors, with
        ``None`` where the source had ``None``.
    """
    this_rank = dist.get_rank()
    if this_rank == src_rank:
        template = [
            map_structure(
                lambda x: (
                    _DtypeSndShape(dtype=x.dtype, shape=x.shape)
                    if x is not None
                    # An empty dtype string marks a leaf that is None.
                    else _DtypeSndShape(dtype="", shape=[0])
                ),
                data,
            )
        ]
    else:
        template = [None]
    dist.broadcast_object_list(template, src_rank, group)
    template = template[0]

    temp_flat = flatten(template)
    data_flat = flatten(data)

    def keyfn(i):
        return str(i[1].dtype)

    # A unique sentinel for slots not yet filled. Identity comparison against
    # it is always well defined, unlike `is -1` on an int literal.
    unfilled = object()
    ret_flat = [unfilled for _ in range(len(temp_flat))]
    for dtype, grouped in groupby(sorted(enumerate(temp_flat), key=keyfn), keyfn):
        grouped = list(grouped)
        for grouped_chunk in split_group(grouped, 2**18):
            idxs = [g[0] for g in grouped_chunk]
            if not dtype:
                # None leaves carry no payload, so nothing is broadcast.
                for leaf_idx in idxs:
                    ret_flat[leaf_idx] = None
                continue

            data_buf_shapes = [
                reduce(lambda x, y: x * y, g[1].shape) for g in grouped_chunk
            ]
            if this_rank == src_rank:
                data_buf = paddle.concat([data_flat[i].reshape([-1]) for i in idxs], 0)
            else:
                data_buf = paddle.empty(
                    [sum(data_buf_shapes)], dtype=grouped_chunk[0][1].dtype
                )
            dist.broadcast(data_buf, src_rank, group)

            if this_rank != src_rank:
                if len(data_buf_shapes) == 1:
                    data_buf = [data_buf]
                else:
                    data_buf = data_buf.split(data_buf_shapes, axis=0)
                for g, data_chunk in zip(grouped_chunk, data_buf):
                    ret_flat[g[0]] = data_chunk.reshape(g[1].shape)

    if this_rank != src_rank:
        assert not [r for r in ret_flat if r is unfilled], ret_flat
        data = pack_sequence_as(template, ret_flat)
    return data
data_id, + src_id, + data_type, + images, + token_type_ids, + image_type_ids, + grid_thw, + ) = ( + data_dict["input_ids"], + data_dict["labels"], + data_dict["data_id"], + data_dict["src_id"], + data_dict["data_type"], + data_dict.get("images", None), + data_dict["token_type_ids"], + data_dict.get("image_type_ids", None), + data_dict.get("grid_thw", None), + ) + + data_world_size = max(self._hcg.get_data_parallel_rank(), 1) * max( + self._hcg.get_sharding_parallel_rank(), 1 + ) + if images is None: + images = paddle.zeros([1, 64, 64], dtype="uint8") + has_images = paddle.full([data_world_size, 1], False, dtype="bool") + else: + raise NotImplementedError + has_images = paddle.full([data_world_size, 1], True, dtype="bool") + if image_type_ids is None: + image_type_ids = paddle.zeros_like(token_type_ids) # padding for dy2st + input_list = [ + input_ids, + labels, + data_id, + src_id, + data_type, + images, + token_type_ids, + image_type_ids, + has_images, + grid_thw, + ] + else: + for key, data in data_dict.items(): + input_list.append(data) + return OrderedDict([("input_ids", input_list), ("labels", [])]) diff --git a/examples/pre-training/ernie/src/datasets/pretrain_task.py b/examples/pre-training/ernie/src/datasets/pretrain_task.py new file mode 100644 index 00000000..31572361 --- /dev/null +++ b/examples/pre-training/ernie/src/datasets/pretrain_task.py @@ -0,0 +1,788 @@ +# !/usr/bin/env python3 +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class IPCH5Resource:
    """Proxy for one named entry of an H5 file that is served by an IPC server.

    Item access is forwarded to the server on every call. The length and the
    truth value are fetched once and then cached on the proxy.
    """

    def __init__(self, path, name, server):
        self.server = server
        self.path, self.name = path, name
        # Cached results of the remote "len" / "to_bool" calls.
        self._length = None
        self._to_bool = None

    def _remote(self, method, *extra):
        """Invoke ``method`` on the server, routed by this resource's path."""
        return self.server.call(self.path, method, (self.path, self.name) + extra)

    def __getitem__(self, key):
        return self._remote("get", key)

    def __len__(self):
        if self._length is not None:
            return self._length
        self._length = self._remote("len")
        return self._length

    def __bool__(self):
        if self._to_bool is not None:
            return self._to_bool
        self._to_bool = self._remote("to_bool")
        return self._to_bool
class DatasetHolderIniter:
    """Deferred constructor for a ``DatasetHolder`` over a fixed list of paths."""

    def __init__(self, paths):
        """Remember the H5 paths the holder will be built with."""
        self.paths = paths

    def __call__(self, server_idx, server_num):
        """Build the ``DatasetHolder`` for server ``server_idx`` of ``server_num``."""
        holder = DatasetHolder(self.paths, server_idx, server_num)
        return holder
def parse_filelist(filelist):
    """Parse one or more file-list files into part ids and H5 paths.

    Each list file has either one column per line (a path; part ids are then
    assigned by line order) or two tab-separated columns (``part_id<TAB>path``).
    The format is decided by the first non-blank line of each file. Part ids
    of later files are offset so they come after those of earlier files.

    Blank lines are ignored and a file with no entries is skipped.

    Args:
        filelist: Path of a list file, or a list of such paths.

    Raises:
        ValueError: If the first entry of a file has more than two columns.
        AssertionError: If the same H5 path appears more than once.

    Returns:
        ``(partids, h5)``: two parallel lists of part ids and H5 paths.
    """
    if isinstance(filelist, str):
        filelist = [filelist]
    part_id_offset = 0
    h5, partids = [], []
    for f in filelist:
        # Close the handle promptly; skip blank lines so they do not become
        # empty paths.
        with open(f) as fin:
            lines = [row.strip().split("\t") for row in fin if row.strip()]
        if not lines:
            continue
        if len(lines[0]) == 1:
            h5.extend([i[0] for i in lines])
            partids.extend([i + part_id_offset for i in range(len(lines))])
        elif len(lines[0]) == 2:
            _ids, _flst = zip(*lines)
            h5.extend(_flst)
            partids.extend([int(i) + part_id_offset for i in _ids])
        else:
            raise ValueError("part format error")
        part_id_offset = max(partids) + 1
    assert len(h5) == len(set(h5)), "duplicated filelist"
    return partids, h5
def equal_shard(datasets, rank, world_size):
    """Return the subset of ``datasets`` assigned to ``rank``.

    Without weights (``datasets[0].weights is None``) the list is cut into
    ``world_size`` contiguous slices. With weights, the weights are normalised
    in place and datasets are placed, heaviest first, into whichever bucket
    currently has the smallest total weight.

    Args:
        datasets: List of objects exposing a ``weights`` attribute.
        rank: Index of the shard to return.
        world_size: Number of shards; must not exceed ``len(datasets)``.

    Returns:
        ``datasets`` itself when ``world_size`` is 1, otherwise the list of
        datasets for ``rank``.
    """
    n_total = len(datasets)
    assert n_total >= world_size, f"#filelist={len(datasets)} < world_size{world_size}"
    if world_size == 1:
        return datasets

    if datasets[0].weights is None:
        # Contiguous split by position.
        positions = np.array_split(np.arange(n_total), world_size)[rank]
        first, last = positions[0], positions[-1]
        return datasets[first : last + 1]

    buckets = []
    for _ in range(world_size):
        buckets.append([])
    bucketsize = np.zeros(len(buckets), dtype="float64")

    # Normalise the weights in place so they sum to one.
    total_w = sum(d.weights for d in datasets)
    for d in datasets:
        d.weights = d.weights / total_w

    # Greedy balancing: heaviest dataset first, always into the lightest bucket.
    for d in sorted(datasets, key=lambda item: item.weights, reverse=True):
        lightest = np.argmin(bucketsize)
        buckets[lightest].append(d)
        bucketsize[lightest] += d.weights
    datasets = sorted(datasets, key=lambda item: item.weights, reverse=True)

    log.info(
        f"sharding dataset according to prob, group vs probs={[sum([rr.weights for rr in r])for r in buckets]}"
    )
    bucketsize = bucketsize[rank]
    diff = bucketsize - (1 / world_size)
    log.info(
        f"unable to perfect shard. prob sum of this bucket:{bucketsize}, diff to perfect portion:{diff}"
    )
    assert (
        len(buckets) == world_size
    ), f"#ret={len(buckets)} prob not normalized:{[d.weights for d in datasets]}"
    return buckets[rank]
+ if "ds16_off" in ds: + log.info("using ds w/ offset") + off = ds["ds16_off"] + else: + off = None + + if "log_prob" in ds: + log.info("using ds with log_prob") + log_prob = ds["log_prob"] + else: + log_prob = None + shape = fp.shape + meta = {"shape": shape} + if ( + shape[0] <= 0 or shape[0] >= 1000000000000 + ): # 1000000000000 for max tokens of h5 + raise OSError + self.fps.append( + { + "fp": fp, + "lossmask": fp_lossmask, + "meta": meta, + "off": off, + "log_prob": log_prob, + } + ) + log.info( + f"done loading {path}, shape:{shape}: int16:{self.int16_ds} " + f"seqlen:{self.seqlen} stride:{self.stride}" + ) + log.info(f"done loading part-{self.part}, file count: {len(self.fps)}") + + def __getitem__(self, idx): + assert ( + len(idx) == 2 + ), f"idx format must be (`epoch, data_idx`), but got {idx} instead" + epoch, idx = idx + if idx == -1: + return Example( + ids=[], + sids=None, + task="lm", + src=self.part, + lossmask=None, + log_prob=None, + ) + assert self._load + fp = self.fps[epoch % len(self.fps)] + off = fp["off"] + if off: + s = off[idx] + e = off[idx + 1] + else: + s = max(idx * self.stride, 0) + e = idx * self.stride + self.seqlen + + ids = fp["fp"][s:e].astype(np.int32) + if fp["lossmask"]: + lossmask = fp["lossmask"][s:e].astype(np.int32) + else: + lossmask = None + if fp["log_prob"]: + log_prob = fp["log_prob"][s:e].astype(np.float32) + else: + log_prob = None + ret = Example( + ids=ids, + sids=None, + task="lm", + src=self.part, + lossmask=lossmask, + log_prob=log_prob, + ) + return ret + + def __len__(self): + assert self._load + fp = self.fps[self.epoch % len(self.fps)] + if fp["off"]: + return len(fp["off"]) + return int(np.ceil((fp["meta"]["shape"][0]) / self.stride)) + + def __iter__(self): + for i in range(len(self)): + yield self[(0, i)] + + @property + def example_id(self): + example_id = range(0, len(self), self.num_consecutive) + example_id = [ + (ii, min(ii + self.num_consecutive, len(self))) for ii in example_id + ] + if 
class ExampleSet:
    """Manage a collection of per-part example sources backed by H5 files.

    Items are addressed either by a flat integer index across all parts, or by
    a ``(part_id, part_epoch, part_data_idx)`` triple.
    """

    def __init__(self, exs, fn, load_process_num=0):
        """Store the sources and the feature function.

        Args:
            exs: Non-empty list of sources, each exposing a ``part`` attribute.
            fn: Callable ``fn(example, idx)`` returning a dict-like feature.
            load_process_num: When greater than zero, ``load`` serves the H5
                files through that many IPC server processes.
        """
        self.exs = exs
        self.fn = fn
        self._load = False
        self.global_max_part_id = max([ex.part for ex in exs])
        self.partid2ex = {ex.part: ex for ex in exs}
        self.load_process_num = load_process_num

    def append(self, new_exs):
        """Append ``new_exs`` to the sources and log each source's share."""
        log.info(f"updating exs, #new example: {len(new_exs)}")
        self.exs.append(new_exs)
        lens = [len(e) for e in self.exs]
        len_sum = sum(lens)
        log.info("multi task data portion")
        log.info(
            "\n".join([f"{e.path}={left/len_sum}" for left, e in zip(lens, self.exs)])
        )

    def load(self, use_shard, dp_rank, dp_size):
        """Open the underlying H5 files, optionally sharding the sources first.

        Args:
            use_shard: When true, keep only the sources that ``equal_shard``
                assigns to ``dp_rank``.
            dp_rank: Rank passed to ``equal_shard``.
            dp_size: World size passed to ``equal_shard``.

        Raises:
            ValueError: In the in-process path, if any source failed to load.
        """
        self._load = True
        log.info(f"loading h5... use_shard={use_shard}, {self._load} {id(self)}")

        log.info(f"loading h5 in dp_env:{dp_rank}/{dp_size}")
        if use_shard:
            log.info("#shard train file, before load")
            path_per_dp = equal_shard(self.exs, dp_rank, dp_size)
            log.debug(
                f"using source shard, # files before shard={len(self.exs)}, after shard={len(path_per_dp)}"
            )
            self.exs = path_per_dp

        if self.load_process_num > 0:
            # Flatten all paths, remembering which slice belongs to which source.
            paths = []
            ranges = []
            start_idx = 0
            for ex in self.exs:
                assert isinstance(ex, ExampleSetSingleDataSource), type(ex)
                cur_len = len(ex.path)
                paths.extend(ex.path)
                ranges.append((ex, start_idx, start_idx + cur_len))
                start_idx += cur_len

            fps = create_ipc_h5_resources(paths, self.load_process_num)
            for ex, start, end in ranges:
                ex.set_loaded(fps[start:end])
        else:
            loaded_exs, err_cnt = [], 0
            for ex in self.exs:
                try:
                    if isinstance(ex, ExampleSetSingleDataSource):
                        ex.load()
                except OSError as e:
                    log.warning(f"loading {ex.path} error:{e}, skip...")
                    err_cnt += 1
                    continue
                loaded_exs.append(ex)
            assert (
                loaded_exs
            ), f"data_dir {[e.path for e in self.exs]} empty, #err:{err_cnt}"
            self.exs = loaded_exs
            if err_cnt > 0:
                raise ValueError(
                    f"some data load failed, #parts={len(self.exs)}, #err={err_cnt}"
                )
            log.info(f"done loading h5 #parts={len(self.exs)}, #err={err_cnt}")

    def __getitem__(self, idx):
        """Return the feature for ``idx`` with ``data_id`` set.

        Args:
            idx: A flat ``int`` index across all sources (always read from
                epoch 0), or a ``(part_id, part_epoch, part_data_idx)`` triple.

        Raises:
            IndexError: If a flat index is past the end of the last source.
        """
        # The index is a 3-D coordinate (partid, part_epoch, part_data_idx).
        if isinstance(idx, int):
            # dev data
            s = 0
            for ex in self.exs:
                # Source `ex` owns flat indices [s, s + len(ex)).
                if idx < s + len(ex):
                    ret = ex[(0, idx - s)]
                    break
                s += len(ex)
            else:
                raise IndexError(f"index {idx} out of range for {s} examples")
        else:
            assert (
                len(idx) == 3
            ), f"idx format must be (`part_id`, `part_epoch`, `part_data_idx`), but got {idx} instead"
            part_id, epoch, idx = idx
            ret = self.partid2ex[part_id][(epoch, idx)]
        ret = self.fn(ret, idx)
        ret.update(data_id=idx)
        return ret

    def __len__(self):
        """Return the total number of examples; requires ``load`` to have run."""
        assert self._load
        return sum(map(len, self.exs))

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]
class PretrainDummyDataset:
    """Fixed-size placeholder dataset whose every item is a sequence of ones."""

    def __init__(self, max_seq_len):
        self.max_seq_len = max_seq_len

    def _ones(self):
        """Build a fresh array of ``max_seq_len`` ones."""
        return np.array([1] * self.max_seq_len)

    def __getitem__(self, idx):
        # The index is ignored: all items have the same content.
        sample = {}
        sample["input_ids"] = self._ones()
        sample["labels"] = self._ones()
        sample["src_id"] = 0
        sample["data_id"] = 0
        return sample

    def __len__(self):
        return 10000

    def __iter__(self):
        position = 0
        while position < len(self):
            yield self[position]
            position += 1
def get_cosine_schedule_with_warmup(
    learning_rate: float,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
    min_lr: float = 0.0,
):
    """
    Create a schedule with linear warmup followed by cosine decay.

    During the first ``num_warmup_steps`` steps the multiplier grows linearly
    from 0 to 1. Afterwards it follows a cosine curve that is clamped at 0 and
    then rescaled, so the multiplier bottoms out at ``min_lr / learning_rate``
    instead of 0.

    Args:
        learning_rate (float)
            The initial learning rate. It is a python float number.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        num_cycles (`float`, *optional*, defaults to 0.5):
            The number of waves in the cosine schedule (the default is to just
            decrease from the max value following a half-cosine).
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.
        min_lr (`float`, *optional*, defaults to 0.0):
            The learning rate the cosine part decays towards.
    Return:
        `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        # Warmup: linear ramp.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))

        decay_steps = max(1, num_training_steps - num_warmup_steps)
        progress = float(current_step - num_warmup_steps) / float(decay_steps)
        cosine = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
        ratio = max(0.0, cosine)
        # Rescale [0, 1] onto [min_lr / learning_rate, 1].
        floor = min_lr / learning_rate
        return ratio * (1 - floor) + floor

    return LambdaDecay(learning_rate, lr_lambda, last_epoch)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AutoPretrainingTrainer""" + +__all__ = [ + "AutoPretrainingTrainer", +] + + +import sys +import re +import os +import json +import pickle +import contextlib +from typing import Optional, List +from collections import OrderedDict, defaultdict +from dataclasses import dataclass, field +import random +import time +import math +import logging +from functools import partial + +import numpy as np + +import paddle +import paddle.nn as nn +from paddle.io import DataLoader +import paddle.amp.auto_cast as autocast +from paddle.distributed.communication.group import _get_global_group + +from paddleformers.trainer import ( + speed_metrics, +) + +from paddleformers.trainer.auto_trainer import AutoTrainer + +try: + from paddleformers.utils.env import ( + PADDLE_OPTIMIZER_NAME, + ) +except ImportError: + from paddleformers.trainer.trainer import ( + OPTIMIZER_NAME, + ) + + PADDLE_OPTIMIZER_NAME = OPTIMIZER_NAME +from paddleformers.utils.batch_sampler import ( + DistributedBatchSampler as PaddleNLPDistributedBatchSampler, +) + +try: + from paddleformers.trainer.trainer import ( + PADDLE_WEIGHT_FILE_NAME as PADDLE_WEIGHTS_NAME, + ) +except ImportError: + from paddleformers.utils.env import PADDLE_WEIGHTS_NAME +from paddleformers.trainer.utils import add_start_docstrings +from paddleformers.trainer.trainer_callback import PrinterCallback +from paddle.distributed import fleet +import paddle.distributed as dist +from 
def distributed_optimizer_maybe_hack(
    optimizer,
    use_moe,
):
    """
    Wrap `optimizer` for distributed training.

    Without MoE the optimizer is handed to `fleet.distributed_optimizer`.
    With MoE it is wrapped in the project's own `HybridParallelOptimizer`;
    that wrapper's `_dp_enable` / `_sharding_enable` flags are switched off
    when the pipeline config enables the matching communication overlap.

    Args:
        optimizer: The optimizer to wrap.
        use_moe: Truthy to use the MoE-specific hybrid-parallel wrapper.

    Returns:
        The wrapped optimizer.
    """
    if not use_moe:
        return fleet.distributed_optimizer(optimizer)

    # Imported lazily: only needed on the MoE path.
    from src.trainers.dygraph_optimizer.hybrid_parallel_optimizer import (
        HybridParallelOptimizer as MoEHybridParallelOptimizer,
    )

    env = fleet.fleet
    env.user_defined_optimizer = optimizer
    strategy = env._user_defined_strategy
    wrapped = MoEHybridParallelOptimizer(optimizer, env._hcg, strategy)

    pp_configs = strategy.hybrid_configs["pp_configs"]
    if pp_configs.dp_comm_overlap:
        wrapped._dp_enable = False
    if pp_configs.sharding_comm_overlap:
        wrapped._sharding_enable = False
    return wrapped
metadata={ + "help": "标准线上明文数据流", + }, + ) + dataset: str = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + data_load_process_num: int = field( + default=10, + metadata={ + "help": "是否使用多进程加速原始数据读取,与DataLoader的num_workers意义不同" + }, + ) + + data_dir: str = field(default=None, metadata={"help": "数据路径(指向一个目录)"}) + + data_filelist: str = field( + default=None, metadata={"help": "数据文件列表,与`args.data_dir`互斥"} + ) + data_weights: str = field(default=None, metadata={"help": "数据配比权重"}) + + dev_data: str = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + + max_seq_length: int = field( + default=512, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + global_batch_size: int = field( + default=-1, + metadata={ + "help": "if `global_batch_size` and `per_device_train_batch_size` is provied, " + "`gradient_accumulation_steps` will be ignored" + }, + ) + init_global_batch_size: int = field( + default=-1, + metadata={ + "help": "开启动态Batching。必须提供`global_batch_size`, " + "global_batch_size 会在 `batch_size_warumup_steps` 步内从 " + "`init_global_batch_size` 提升到 `global_batch_size`, " + "每次 `batchsize` 的提升量为`batch_size_warmup_increment`" + }, + ) + batch_size_warmup_steps: int = field( + default=-1, + metadata={ + "help": "开启动态Batching。必须提供`global_batch_size`, " + "global_batch_size 会在 `batch_size_warumup_steps` 步内从 " + "`init_global_batch_size` 提升到 `global_batch_size`, " + "每次 `batchsize` 的提升量为`batch_size_warmup_increment`" + }, + ) + batch_size_warmup_increment: int = field( + default=1, + metadata={ + "help": "开启动态Batching。必须提供`global_batch_size`, " + "global_batch_size 会在 `batch_size_warumup_steps` 步内从 " + "`init_global_batch_size` 提升到 `global_batch_size`, " + "每次 `batchsize` 的提升量为`batch_size_warmup_increment`" + }, + ) + 
preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + init_ckpt: Optional[str] = field( + default=None, + metadata={}, + ) + sequence_parallel: Optional[int] = field( + default=0, + metadata={}, + ) + + config_file: Optional[str] = field( + default=None, + metadata={"help": "config file (YAML) to update hyper-parameters"}, + ) + virtual_pp_degree: Optional[int] = field( + default=1, + metadata={ + "help": "vpp", + }, + ) + from_scratch: Optional[int] = field(default=1, metadata={"help": "是否重头训练"}) + no_shuffle: Optional[int] = field(default=0, metadata={"help": "不要shuffle数据"}) + no_part_shuffle: Optional[int] = field( + default=0, metadata={"help": "不进行part内数据shuffle"} + ) + record_optimizer_stat: Optional[bool] = field( + default=False, metadata={"help": "是否记录优化器momentum信息"} + ) + skip_optimizer_badcases: Optional[bool] = field( + default=False, metadata={"help": "是否跳过optimizer badcase很多的step"} + ) + same_data: Optional[bool] = field( + default=False, + metadata={"help": "热启时,数据、配比、DP数是否完全一致, 支持续线"}, + ) + base_seq_length: Optional[int] = field( + default=4096, metadata={"help": "reeao最小seq_length"} + ) + shuffle_consecutive: Optional[bool] = field( + default=False, + metadata={ + "help": "是否对num_consecutive片段进行shuffle, same_data=True热启时,该值需与上一次保持一致" + }, + ) + global_shuffle_num_examples: Optional[int] = field( + default=0, + metadata={ + "help": "part间shuffle的num_example总数限制,默认不做限制, " + "这个值与最小配比的积 必须大于1, 改变该值时,需要设置same_data=False" + }, + ) + adaptive_norm_clip: Optional[bool] = field( + default=False, metadata={"help": "是否启用 AdaptiveNormClip 梯度裁剪策略"} + ) + 
adaptive_norm_clip_ratio: Optional[float] = field( + default=1.03, + metadata={"help": "AdaptiveNormClip 裁剪阈值, 大于设定的阈值才会启动裁剪"}, + ) + adaptive_norm_force_clear_state: Optional[bool] = field( + default=False, metadata={"help": "AdaptiveNormClip 强制清空 state dict"} + ) + adaptive_norm_shard_clip: Optional[bool] = field( + default=False, metadata={"help": "AdaptiveNormClip 在切分参数上是否在局部clip"} + ) + adaptive_norm_enable_record: Optional[bool] = field( + default=False, metadata={"help": "AdaptiveNormClip 是否启用统计历史norm值"} + ) + adaptive_norm_start_clip_steps: Optional[int] = field( + default=100, metadata={"help": "AdaptiveNormClip 开始裁剪的step"} + ) + adaptive_norm_enable_record_clip_history: Optional[bool] = field( + default=False, metadata={"help": "AdaptiveNormClip 是否启用统计历史裁剪的记录"} + ) + adaptive_norm_verbose: Optional[bool] = field( + default=False, metadata={"help": "AdaptiveNormClip 是否开启裁剪日志打印"} + ) + use_async_save: Optional[bool] = field( + default=False, metadata={"help": "是否开启异步保存功能"} + ) + pre_alloc_memory: float = field( + default=0.0, + metadata={ + "help": "Pre-allocate one specific-capacity empty tensor " + "and release it for avoiding memory fragmentation" + }, + ) + enable_global_training_logs: bool = field( + default=False, metadata={"help": "是否启用global_training_logs"} + ) + use_dummy_dataset: Optional[bool] = field( + default=False, metadata={"help": "是否使用DummyDataSet, 仅用于Debug"} + ) + reshard_save_then_exit: Optional[bool] = field( + default=False, metadata={"help": "是否在reshard后直接退出程序"} + ) + moe_group: Optional[str] = field( + default="dp", metadata={"help": "moe 的通信组,目前支持“dp|sharding|mp|dummy”"} + ) + use_moe: Optional[bool] = field( + default=False, metadata={"help": "expert parallel 临时替代"} + ) + moe_use_all2all: Optional[bool] = field( + default=False, metadata={"help": "是否使用all2all通信方式"} + ) + log_global_grad_norm: Optional[bool] = field( + default=False, + metadata={ + "help": "打印全局grad-norm, 只有在开启`enable_global_training_logs`时生效" + }, + ) + + 
lr_scheduler: str = field( + default="cosine", + metadata={ + "help": "The scheduler type to use. suppor linear, cosine, constant, constant_with_warmup" + }, + ) + image_token_len: int = field( + default=64, + metadata={"help": "number of images tokens from resampler per image"}, + ) + freeze_config: str = field( + default="", + metadata={ + "help": ( + "Some additional config for freeze params, we provide some option to config it." + "following config is support: freeze_vision,freeze_lm" + ) + }, + ) + moe_gate_lr_ratio: float = field( + default=None, + metadata={"help": ("启用 moe 时,对 gate/router 的 LR 做特殊处理")}, + ) + vit_lr_ratio: float = field( + default=None, + metadata={"help": ("启用vit训练时,对 vit 的 LR 做特殊处理")}, + ) + modality_interleave: str = field(default="acc", metadata={"help": "acc"}) + modality_ratio: tuple = field( + default=None, + metadata={"help": "ratio of modality tokens to be masked out"}, + ) + bos_retry_max_time: int = field( + default=0, metadata={"help": "when bos download failed, #retry times"} + ) + bos_retry_interval: float = field( + default=1, metadata={"help": "when bos download failed, interval between retry"} + ) + + pipeline_schedule_mode: str = field( + default="1F1B", + metadata={"help": "The pipeline schedule mode, support 1F1B and VPP"}, + ) + virtual_pipeline_seg_method: str = field( + default="ErnieDecoderLayerAuto", + metadata={"help": "The seg method of spliting pp layer for virtual pipeline."}, + ) + pp_need_data_degree: int = field( + default=0, + metadata={ + "help": "pipline 并行中的机器也需要 fetch 数据,提升吞吐,搭配 `ErniemmMoEForCausalPipe` 使用" + }, + ) + pp_need_data: bool = field(default=False, metadata={"help": "向前兼容"}) + custom_data_status: str = field( + default=None, + metadata={"help": "load data status from custom trainer_state.json"}, + ) + model_type: Optional[str] = field( + default="ernie", + metadata={"help": "Only support for ernie pre-training for now."}, + ) + n_microbatches: int = field( + default=1, + metadata={"help": 
@property
def use_moe(self):
    """
    Whether MoE (expert parallel) training is enabled.

    Returns:
        The value of `use_expert_parallel` when that attribute exists,
        otherwise the value last stored through the `use_moe` setter
        (False when nothing was stored yet).
    """
    # Look the fallback up lazily-safe: `_use_moe` may not exist yet.
    fallback = getattr(self, "_use_moe", False)
    return getattr(self, "use_expert_parallel", fallback)


@use_moe.setter
def use_moe(self, value):
    """
    Store the MoE switch and mirror it into `use_expert_parallel`.

    Args:
        value: The new flag value.
    """
    if isinstance(value, property):
        # A property that shares its name with a dataclass field becomes that
        # field's default, so the generated __init__ hands the property object
        # itself to this setter whenever the argument is omitted. Treat that
        # as "not provided": keep an already-set `use_expert_parallel` and
        # fall back to the declared default (False) otherwise.
        self._use_moe = False
        if not hasattr(self, "use_expert_parallel"):
            self.use_expert_parallel = False
        return
    self.use_expert_parallel = value
    self._use_moe = value


@property
def need_data(self):
    """
    Whether this rank has to load training data.

    Without `pp_need_data_degree` only pipeline rank 0 / tensor rank 0 loads
    data. With it, tensor rank 0 of every pipeline stage loads data except the
    stages in `[pp_need_data_degree - 1, pipeline_parallel_degree - 1)`.
    """
    if self.pp_need_data_degree:
        assert self.pipeline_parallel_degree > 1
        assert (
            self.pp_need_data_degree >= 2
            and self.pp_need_data_degree <= self.pipeline_parallel_degree
        ), (
            self.pp_need_data_degree,
            self.pipeline_parallel_degree,
        )
        # Shifted by one so that the last pipeline stage still gets data.
        skipped_stages = range(
            self.pp_need_data_degree - 1, self.pipeline_parallel_degree - 1
        )
        return self.tensor_parallel_rank == 0 and (
            self.pipeline_parallel_rank not in skipped_stages
        )
    return self.pipeline_parallel_rank == 0 and self.tensor_parallel_rank == 0


@property
def combine_batch(self):
    """Number of `base_seq_length` units that fit into `max_seq_length`."""
    return self.max_seq_length // self.base_seq_length


@property
def reeao_dataset_rank(self):
    """
    Rank of this process in the data stream.

    Returns:
        The parent's `dataset_rank` when `pp_need_data_degree` is unset;
        None for pipeline stages that do not fetch data; otherwise
        `pp_need_data_degree * sharding_parallel_rank + index of this stage
        among the data-fetching stages`.
    """
    if not self.pp_need_data_degree:
        return super().dataset_rank
    skipped_stages = range(
        self.pp_need_data_degree - 1, self.pipeline_parallel_degree - 1
    )
    ranks = [
        i for i in range(self.pipeline_parallel_degree) if i not in skipped_stages
    ]
    if self.pipeline_parallel_rank not in ranks:
        return None
    reeao_pp_rank = ranks.index(self.pipeline_parallel_rank)

    # NOTE(review): this checks data_parallel_rank, not data_parallel_degree;
    # kept as written - confirm which one is intended.
    assert not (self.sharding_parallel_degree > 1 and self.data_parallel_rank > 1)
    return (
        max(self.pp_need_data_degree, 1) * self.sharding_parallel_rank
        + reeao_pp_rank
    )
1) + * max(self.pipeline_parallel_degree, 1) + ) + + def __post_init__(self): + super().__post_init__() + # if self.sharding_parallel_degree > 1 and self.data_parallel_degree > 1: + # # MP/PP下, 当前框架不支持同时开启 sharding 和 DP + # assert ( + # self.pipeline_parallel_degree <= 1 and self.tensor_parallel_degree <= 1 + # ), f"when using mp/pp, `data_parallel_degree` should be 1 but receive {self.data_parallel_degree}" + if in_auto_parallel_align_mode(): + self.adaptive_norm_clip = False + self.adaptive_norm_clip_ratio = 0.0 + self.no_shuffle = 1 + self.no_part_shuffle = 1 + + assert ( + self.global_batch_size + == self.per_device_train_batch_size + * self.gradient_accumulation_steps + * max(self.sharding_parallel_degree, 1) + * max(self.data_parallel_degree, 1) + ), ( + f"`gbs` should be equal to `lbs * acc * (dp_degree or sd_degree)`, " + f"but got gbs={self.global_batch_size}, " + f"lbs={self.per_device_train_batch_size}, " + f"acc={self.gradient_accumulation_steps}, " + f"dp_degree={max(self.data_parallel_degree, 1)}, " + f"sd_degree={max(self.sharding_parallel_degree, 1)}" + ) + + if self.global_batch_size > 0: + micro_bsz, acc_steps = reset_per_device_batch_size( + self.global_batch_size, + self.per_device_train_batch_size, + self.dataset_world_size, + ) + logger.info( + f"global_batch={self.global_batch_size} micro-bsz:{micro_bsz}, accumulate_steps:{acc_steps}" + ) + if ( + acc_steps != 1 + and self.gradient_accumulation_steps != 1 + and acc_steps != self.gradient_accumulation_steps + ): + raise ValueError( + f"global_accumulation_steps={self.gradient_accumulation_steps}" + f"& global_batch={self.global_batch_size} are both set" + ) + self.per_device_train_batch_size, self.gradient_accumulation_steps = ( + micro_bsz, + acc_steps, + ) + + if self.batch_size_warmup_steps > 0: + assert self.global_batch_size > 0, self.global_batch_size + assert self.init_global_batch_size > 0, self.init_global_batch_size + self.max_gradient_accumulation_steps = ( + 
self.gradient_accumulation_steps + ) # hack add new + ( + self.per_device_train_batch_size, + self.gradient_accumulation_steps, + ) = reset_per_device_batch_size( + self.init_global_batch_size, + self.per_device_train_batch_size, + self.dataset_world_size, + ) + logger.info( + f"using progressive batching, accumulate step will increese from {self.gradient_accumulation_steps}" + f"to {self.max_gradient_accumulation_steps} in {self.batch_size_warmup_steps} steps" + ) + else: + self.max_gradient_accumulation_steps = ( + self.gradient_accumulation_steps + ) # hack add new + + if self.pipeline_parallel_degree > 1: + self.per_device_eval_batch_size = ( + self.per_device_train_batch_size * self.gradient_accumulation_steps + ) # hack Eval for PP! + logger.warn( + f"eval_batch_size set to {self.per_device_eval_batch_size} in Pipeline Parallel!" + ) + user_defined_strategy = fleet.fleet._user_defined_strategy + user_defined_strategy.strategy.pipeline_configs.accumulate_steps = ( + self.gradient_accumulation_steps + ) + if self.pp_need_data and not self.pp_need_data_degree: + self.pp_need_data_degree = self.pipeline_parallel_degree + if self.pp_need_data_degree: + assert ( + self.gradient_accumulation_steps % self.pp_need_data_degree == 0 + ), ( + f"gradient_accumulation_steps[{self.gradient_accumulation_steps}] should be divisible by " + f"pp_need_data_degree[{self.pp_need_data_degree}]" + ) + # pp_need_data_degree下,args的acc 需要//pp数量,欺骗 在prepare_inputs + self.gradient_accumulation_steps = ( + self.gradient_accumulation_steps // self.pp_need_data_degree + ) + logger.info( + f"pp-need-data hack args.gradient_accumulation_steps to - {self.gradient_accumulation_steps}" + ) + self.max_gradient_accumulation_steps = ( + self.gradient_accumulation_steps + ) # hack add new + logger.info(f"fixing pp configs: {user_defined_strategy.pipeline_configs}") + else: + self.per_device_eval_batch_size = self.per_device_train_batch_size + logger.warn(f"eval_batch_size set to 
def __init__(
    self,
    dataset,
    batch_size,
    output_dir,
    dp_rank,
    dp_size,
    num_consecutive=1,
    seed=0,
    batch_size_warmup_steps=-1,
    gradient_accumulation_steps=None,
    max_gradient_accumulation_steps=None,
    per_device_train_batch_size=None,
    batch_size_warmup_increment=None,
    combine_batch: int = 1,
    shuffle_consecutive: bool = False,
    global_shuffle_num_examples: int = 0,
    same_data: bool = False,
    modality_ratio: tuple = None,
    modality_interleave: int = 1,
    **kwargs,
):
    """
    Set up the weighted sampler.

    Args:
        dataset: Dataset to sample from; a `MapDataset` is unwrapped to its
            `.data` attribute.
        batch_size: Per-device micro batch size.
        output_dir: Directory used for the cached `data_seq.*` files.
        dp_rank: Data-parallel rank of this process.
        dp_size: Data-parallel world size.
        num_consecutive: Number of examples kept together as one chunk.
        seed: Base seed; the RNG is seeded with `seed + epoch`.
        modality_ratio: Optional per-modality ratios; must sum to 1 and
            `modality_interleave * ratio` must be a whole number for each of
            the first three entries.
        modality_interleave: Multiplier applied to the modality ratios.
        **kwargs: Forwarded to the base sampler.

    The remaining arguments are stored unchanged on the instance.
    """
    self.num_consecutive = num_consecutive
    self.seed = seed
    super().__init__(dataset, batch_size, **kwargs)
    self.weights = None
    self.batch_size = batch_size  # per-device micro batch size
    self.output_dir = output_dir
    # `self.epoch` is expected to be set by the base-class initializer.
    self.rng = random.Random(self.seed + self.epoch)
    self.dp_rank = dp_rank
    self.dp_size = dp_size
    self.batch_size_warmup_steps = batch_size_warmup_steps
    self.gradient_accumulation_steps = gradient_accumulation_steps
    self.max_gradient_accumulation_steps = max_gradient_accumulation_steps
    self.per_device_train_batch_size = per_device_train_batch_size
    self.batch_size_warmup_increment = batch_size_warmup_increment
    self.combine_batch = combine_batch
    self.shuffle_consecutive = shuffle_consecutive
    self.global_shuffle_seed = 0
    self.global_shuffle_num_examples = global_shuffle_num_examples
    self.same_data = same_data
    self.load_data_seq = False
    self.modality_ratio = modality_ratio
    self.modality_interleave = modality_interleave
    if self.modality_ratio is not None:
        logger.info(f"modality ratio set to {self.modality_ratio}")
        # Tolerant comparison: e.g. 0.6 + 0.3 + 0.1 is 0.9999999999999999.
        assert math.isclose(
            sum(modality_ratio), 1.0, rel_tol=1e-6
        ), "modality ratio should sum to 1"
        # Only the first three slots are used by the multimodal sequence builder.
        for slot, ratio in enumerate(self.modality_ratio[:3]):
            assert (
                self.modality_interleave * ratio % 1 == 0
            ), f"modality_interleave * modality_ratio[{slot}] should be integer"
    if isinstance(self.dataset, MapDataset):
        self.inner_dataset = self.dataset.data
    else:
        self.inner_dataset = self.dataset
    assert self.inner_dataset._load

    self.max_part_id = self.inner_dataset.global_max_part_id

    self.set_epoch(0)


def load_data_status(self, data_status: List[int], global_shuffle_seed: int = 0):
    """
    Restore per-part read offsets.

    Args:
        data_status: Offset per part id; each value is rounded up to a
            multiple of `combine_batch` before it is applied.
        global_shuffle_seed: Extra seed stored on the sampler.
    """
    self.global_shuffle_seed = global_shuffle_seed
    if not hasattr(self.inner_dataset.exs[0], "data_status"):
        logger.warning(
            "Inner Datasource has no attribute data_status, ignore load_data_status"
        )
        return
    data_status = [
        math.ceil(i / self.combine_batch) * self.combine_batch for i in data_status
    ]
    for ex in self.inner_dataset.exs:
        # Parts without a recorded offset keep their current status.
        if ex.part < len(data_status):
            ex.data_status = data_status[ex.part]
    logger.debug(
        f"dp-[{self.dp_rank}/{self.dp_size}]-loaded_data_status--[{data_status[:10]}]"
    )


def gen_data_seq(self):
    """
    Generate the (unweighted) sampling sequence.

    Every remaining example of every part becomes a `(part, 0, index)` row.
    Rows are grouped into chunks of roughly `num_consecutive`, the chunks are
    shuffled with `self.rng` when shuffling is on, and the result is padded
    and sharded by `roundup_and_shard`.
    """
    total = []
    for ex in self.inner_dataset.exs:
        total.extend((ex.part, 0, i) for i in range(ex.data_status, len(ex)))
    # Report only the size: `total` can hold millions of rows.
    assert (
        len(total) > self.num_consecutive
    ), f"total={len(total)} <= num_consecutive={self.num_consecutive}"
    indices = np.array_split(np.array(total), len(total) // self.num_consecutive)
    if self.shuffle:
        self.rng.shuffle(indices)
    indices = np.concatenate(indices)
    indices = self.roundup_and_shard(indices)
    logger.debug(indices[:10])
    return indices
def gen_data_seq_weighted_multimodal(
    self, lm_num_examples, mm_num_examples, audio_num_examples
):
    """
    Build one index sequence that interleaves the lm / mm / audio streams.

    Each modality with a positive example count gets its own weighted
    sequence. The sequences are cut into the same number of groups, where a
    modality's group size is
    `int(modality_ratio[k] * modality_interleave) * combine_batch *
    per_device_train_batch_size` (k = 0 for lm, 1 for mm, 2 for audio), and
    the groups are concatenated side by side.
    """
    assert self.modality_ratio is not None
    logger.info(f"LM-num_examples -- {lm_num_examples}")

    # Keep the lm -> mm -> audio order: each call may shuffle with self.rng.
    requested = (
        ("lm", lm_num_examples),
        ("mm", mm_num_examples),
        ("audio", audio_num_examples),
    )
    sequences = [
        self.gen_data_seq_weighted(count, DATATYPE_2_ID[kind]) if count > 0 else None
        for kind, count in requested
    ]

    ratio_slots = len(self.modality_ratio)
    lm_base, mm_base, audio_base = (
        int(
            int(self.modality_ratio[slot] * self.modality_interleave)
            * self.combine_batch
            * self.per_device_train_batch_size
        )
        if ratio_slots > slot
        else 0
        for slot in range(3)
    )

    # Modalities that actually contribute rows.
    usable = [
        (seq, base)
        for seq, base in zip(sequences, (lm_base, mm_base, audio_base))
        if seq is not None and base > 0
    ]

    # The shortest modality decides how many interleaved groups can be built.
    num_batches = math.inf
    for seq, base in usable:
        num_batches = min(seq.shape[0] // base, num_batches)

    all_indices = [
        seq[: num_batches * base, :].reshape(num_batches, base, -1)
        for seq, base in usable
    ]

    assert len(all_indices) > 0
    indices = np.concatenate(all_indices, axis=1).reshape(
        -1, all_indices[0].shape[-1]
    )
    logger.debug(
        f"multimodal_data_seq={len(indices)}, example={indices[:10]}, "
        f"modality_interleave={self.modality_interleave}, lm-{lm_base}, mm-{mm_base}, audio-{audio_base}"
    )
    return indices
def roundup_and_shard(self, indices):
    """
    Pad or trim `indices` to `total_size` rows and return this rank's shard.

    Args:
        indices: Index sequence (1-D, or 2-D with one row per example).

    Returns:
        `indices` unchanged when `nranks == 1`; otherwise every `nranks`-th
        row starting at `local_rank`, after the sequence has been trimmed or
        cyclically padded to exactly `total_size` rows.

    Raises:
        ValueError: If padding is required but `indices` is empty.
    """
    if self.nranks == 1:
        logger.info("use use_train_part_sharding, skip padding")
        return indices

    padding_size = self.total_size - len(indices)
    logger.info(
        f"padding-size={padding_size}, total_size={self.total_size} shard={self.local_rank}/{self.nranks}"
    )
    if padding_size < 0:
        indices = indices[:padding_size]
    elif padding_size > 0:
        if len(indices) == 0:
            raise ValueError("cannot pad an empty index sequence")
        indices = np.asarray(indices)
        # Wrap around along axis 0. (np.tile with a scalar repeat count would
        # repeat the last axis of a 2-D array and break the concatenation.)
        wrap = np.arange(padding_size) % len(indices)
        indices = np.concatenate([indices, indices[wrap]])

    assert len(indices) == self.total_size, (len(indices), self.total_size)

    # subsample
    indices = indices[self.local_rank : self.total_size : self.nranks]
    assert len(indices) == self.num_samples
    return indices


def __len__(self):
    """Always raise TypeError: the sampler is treated as infinite."""
    # PaddleNLP expect TypeError for infinite datasets:
    # https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/trainer/trainer_utils.py#L515
    raise TypeError
self.modality_ratio is not None: + lm_num_examples = min( + [self.global_shuffle_num_examples, lm_num_examples] + ) + mm_num_examples = min( + [self.global_shuffle_num_examples, mm_num_examples] + ) + audio_num_examples = min( + [self.global_shuffle_num_examples, audio_num_examples] + ) + logger.debug( + f"using global shuffle num examples: {self.global_shuffle_num_examples}" + ) + indices = self.load_data_seq_from_cache() + if indices is None: + indices = ( + self.gen_data_seq_weighted_multimodal( + lm_num_examples, mm_num_examples, audio_num_examples + ) + if self.modality_ratio is not None + else self.gen_data_seq_weighted(num_examples) + ) + + if self.output_dir: + with open( + os.path.join( + self.output_dir, + f"data_seq.epoch{self.epoch}.dp_{self.dp_rank}_of_{self.dp_size}" + f"_shard_{self.local_rank}_of_{self.nranks}.pth", + ), + "wb", + ) as of: + pickle.dump(indices, of, protocol=4) + + def ret(): # 无穷长reader。 + # info = paddle.io.get_worker_info() + nonlocal indices + buf = [] + logger.info(f"start training sequence, data-sequence: {indices[:10]}") + while 1: + if self.consumed_samples >= len(indices): + self.consumed_samples -= len(indices) + else: + for i in range(self.consumed_samples, len(indices)): + if len(buf) == self.batch_size: + yield buf + buf = [] + buf.append(indices[i].tolist()) + self.consumed_samples = 0 + self.epoch += 1 + logger.info( + f"epoch done, #data={self.total_size}, reshuffle-sequence: epoch={self.epoch}" + ) + + self.rng = random.Random(self.seed + self.epoch) + if self.weights: + indices = self.load_data_seq_from_cache() + if indices is None: + indices = ( + self.gen_data_seq_weighted_multimodal( + lm_num_examples, mm_num_examples, audio_num_examples + ) + if self.modality_ratio is not None + else self.gen_data_seq_weighted(num_examples) + ) + else: + indices = self.gen_data_seq() + if self.output_dir: + with open( + os.path.join( + self.output_dir, + f"data_seq.epoch{self.epoch}.dp_{self.dp_rank}_of_{self.dp_size}" + 
def set_epoch(self, epoch=0, consumed_samples=0):
    """Set the current epoch and the resume position of the sampler.

    ``consumed_samples`` is integer-divided by ``self.dp_size`` before being
    forwarded to the parent ``set_epoch``. When the wrapped dataset is an
    ``ExampleSet``, the epoch is also written onto every
    ``ExampleSetSingleDataSource`` it contains.

    Args:
        epoch: Epoch number to switch to.
        consumed_samples: Number of samples already consumed, before the
            division by ``self.dp_size``.
    """
    per_rank_consumed = consumed_samples // self.dp_size
    logger.debug(f"set consumed samples={per_rank_consumed}, epoch={epoch}")
    super().set_epoch(epoch, per_rank_consumed)

    dataset = self.inner_dataset
    if not isinstance(dataset, ExampleSet):
        return
    for source in dataset.exs:
        if isinstance(source, ExampleSetSingleDataSource):
            source.epoch = epoch
def autocast_smart_context_manager(self):
    """Return the context manager that forward passes should run under.

    When ``self.enable_autocast_context_manager`` is false a do-nothing context
    is returned. Otherwise an ``autocast`` context is built with explicit
    black/white op lists, ``self.args.fp16_opt_level`` as the level and
    ``self.amp_dtype`` as the dtype.
    """
    if not self.enable_autocast_context_manager:
        if sys.version_info >= (3, 7):
            return contextlib.nullcontext()
        return contextlib.suppress()

    # Ops passed to autocast as the custom black list.
    deny_ops = [
        "reduce_sum",
        "c_softmax_with_cross_entropy",
        "elementwise_div",
        "sin",
        "cos",
    ]
    # Ops passed to autocast as the custom white list.
    allow_ops = [
        "lookup_table",
        "lookup_table_v2",
        "flash_attn",
        "flash_attn_v1",
        "matmul",
        "matmul_v2",
        "fused_gemm_epilogue",
    ]
    if self.args.bf16 and self.args.fp16_opt_level == "O2":
        deny_ops.append("c_embedding")

    return autocast(
        True,
        custom_black_list=deny_ops,
        custom_white_list=allow_ops,
        level=self.args.fp16_opt_level,
        dtype=self.amp_dtype,
    )
buf[1].items(): + v = ( + state_dict["master_weights"] + .get(k, paddle.zeros(s, "float32")) + .cuda() + ) + v.name = k + dist.broadcast(v, src=src_rank, group=group) + logger.info( + f"broadcast moe optimizer-master_weights {k} from {src_rank}" + ) + base_state_dict["master_weights"][k] = v.cpu() + base_state_dict.update(buf[2]) + return base_state_dict + + state_dict = super()._load_optimizer_state(checkpoint) + + if self.args.use_moe: + base_state_dict = _broadcast_moe_optimizer_state(state_dict) + if self.args.data_parallel_rank > 0: + master_weight = state_dict.pop("master_weights", {}) + base_state_dict.update(state_dict) + if master_weight: + if "master_weights" in base_state_dict: + base_state_dict["master_weights"].update(master_weight) + else: + base_state_dict["master_weights"] = master_weight + state_dict = base_state_dict + del base_state_dict + return state_dict + + def _save_moe_weights(self, output_dir): + + optimizer_name = _add_variant( + PADDLE_OPTIMIZER_NAME, self.args.optimizer_name_suffix + ) + saved_signal_path = os.path.join(output_dir, f"saved_signal_{dist.get_rank()}") + + os.makedirs(output_dir, exist_ok=True) + state_dict = self.model.state_dict() + optimzier_state_dict = self.optimizer.state_dict() + + filtered_state_dict = OrderedDict() + filter_optimzier_state_dict = OrderedDict() + + param_names_in_master_weights = ( + list(optimzier_state_dict["master_weights"].keys()) + if self.args.bf16 + else [] + ) + filter_optimzier_state_dict["master_weights"] = OrderedDict() + + for k, v in state_dict.items(): + if getattr(v, "no_sync", False): + + if v.name in param_names_in_master_weights: + filter_optimzier_state_dict["master_weights"][v.name] = ( + optimzier_state_dict["master_weights"][v.name] + ) + if not ( + getattr(self.args, "should_save_sharding_stage1_model", False) + or getattr(self.args, "save_sharding_stage1_model", False) + ): + filtered_state_dict[k] = v + for op_k, op_v in optimzier_state_dict.items(): + if 
def evaluate(
    self, eval_dataset=None, ignore_keys=None, metric_key_prefix: str = "eval"
):
    """Run the evaluation loop, log its metrics and return them.

    Args:
        eval_dataset: Dataset handed to ``self.get_eval_dataloader``.
        ignore_keys: Forwarded unchanged to the evaluation loop.
        metric_key_prefix: Prefix passed to ``speed_metrics``.

    Returns:
        ``output.metrics`` from the evaluation loop, extended with the values
        returned by ``speed_metrics``.
    """
    self.model_wrapped.accumulate_steps = self.args.gradient_accumulation_steps
    loader = self.get_eval_dataloader(eval_dataset)

    started_at = time.time()
    # Without a metric function only the loss is needed; otherwise pass None so
    # the loop falls back to self.args.prediction_loss_only.
    loss_only = True if self.compute_metrics is None else None
    run_loop = self.evaluation_loop

    output = run_loop(
        loader,
        description="Evaluation",
        prediction_loss_only=loss_only,
        ignore_keys=ignore_keys,
        # Only evaluate max_eval_iters
        max_eval_iters=self.args.eval_iters,
    )

    global_batch = self.args.eval_batch_size * self.args.world_size
    throughput = speed_metrics(
        metric_key_prefix,
        started_at,
        num_samples=output.num_samples,
        num_steps=math.ceil(output.num_samples / global_batch),
    )
    output.metrics.update(throughput)

    self.log(output.metrics)

    self.control = self.callback_handler.on_evaluate(
        self.args, self.state, self.control, output.metrics
    )
    return output.metrics
def _get_train_sampler(self) -> Optional[paddle.io.Sampler]:
    """Build the batch sampler used for training.

    Returns a ``DummySampler`` when ``args.use_dummy_dataset`` is set. Otherwise
    returns a ``WeightedDistributedSamplerAuto`` configured from ``self.args``.
    With ``args.use_train_part_sharding`` the sampler is created with a single
    replica (``num_replicas=1``, ``rank=0``).
    """
    args = self.args
    micro_batch = args.per_device_train_batch_size * args.combine_batch
    if args.use_dummy_dataset:
        return DummySampler(self.train_dataset, micro_batch)

    if args.use_train_part_sharding:
        num_replicas, rank = 1, 0
    else:
        num_replicas = args.reeao_dataset_world_size
        rank = args.reeao_dataset_rank

    # The sampler batch covers all gradient-accumulation micro-batches.
    sampler_batch = micro_batch * args.gradient_accumulation_steps

    if args.modality_interleave:
        interleave = args.modality_interleave * args.combine_batch
    else:
        interleave = None

    return WeightedDistributedSamplerAuto(
        self.train_dataset,
        sampler_batch,
        args.output_dir,
        dp_rank=args.reeao_dataset_rank,
        dp_size=args.reeao_dataset_world_size,
        num_replicas=num_replicas,
        rank=rank,
        seed=args.seed,
        batch_size_warmup_steps=args.batch_size_warmup_steps,  # used to resume from ckpt
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        max_gradient_accumulation_steps=args.max_gradient_accumulation_steps,
        per_device_train_batch_size=args.per_device_train_batch_size,
        batch_size_warmup_increment=args.batch_size_warmup_increment,
        shuffle=not args.no_shuffle,
        drop_last=False,
        num_consecutive=args.num_consecutive,
        combine_batch=args.combine_batch,
        shuffle_consecutive=args.shuffle_consecutive,
        global_shuffle_num_examples=args.global_shuffle_num_examples,
        same_data=args.same_data,
        modality_ratio=args.modality_ratio,
        modality_interleave=interleave,
    )
def create_scheduler(self, num_training_steps):
    """Create the cosine-with-warmup learning-rate scheduler.

    Warmup length is ``args.warmup_steps`` when positive, otherwise
    ``int(args.warmup_ratio * num_training_steps)``.

    Args:
        num_training_steps: Total number of optimizer steps for this run.

    Returns:
        The scheduler, which is also stored on ``self.lr_scheduler``.
    """
    if self.args.warmup_steps > 0:
        warmup = self.args.warmup_steps
    else:
        warmup = int(self.args.warmup_ratio * num_training_steps)

    # Use args.max_steps when it is a real step count; fall back to the
    # resolved num_training_steps when it is unset or non-positive, so the
    # schedule is never built with a non-positive length.
    max_steps = self.args.max_steps
    total_steps = max_steps if max_steps and max_steps > 0 else num_training_steps

    self.lr_scheduler = get_cosine_schedule_with_warmup(
        self.args.learning_rate,
        warmup,
        total_steps,
        min_lr=self.args.min_lr if self.args.min_lr else 0.0,
    )
    logger.info(f"lr_scheduler : {self.lr_scheduler}")

    return self.lr_scheduler
If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. + """ + optimizer_params = ( + [p for n, p in self.model.named_parameters() if "embeddings" in n] + if self.args.train_emb_only + else self.model.parameters() + ) + if self.args.train_emb_only: + logger.info( + f"using `train-emb-only`, #embedding params={len(optimizer_params)}" + ) + if self.optimizer is None: + + def need_decay(name): + if ( + name == "ernie.norm.weight" + and self.args.pipeline_parallel_degree > 1 + ): + return True + return not any(nd in name for nd in ["bias", "norm"]) + + decay_parameters = [ + p.name for n, p in self.model.named_parameters() if need_decay(n) + ] + + def apply_decay_param_fun(x): + return x in decay_parameters + + optimizer_cls, optimizer_kwargs = AutoTrainer.get_optimizer_cls_and_kwargs( + self.args + ) + + if self.args.adaptive_norm_clip: + if "split_param" in self.args.sharding_parallel_config: + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizerV2, + ) + + v2_assign_slice_grad = DygraphShardingOptimizerV2._assign_slice_grad + + def _assign_slice_grad(self): + v2_assign_slice_grad(self) + assert isinstance( + self._grad_clip, ClipGradByAdaptiveNorm + ), "self._grad_clip must be ClipGradByAdaptiveNorm" + if not hasattr(self._grad_clip, "pname_to_paramindex"): + pname_to_paramindex = {} + assert not isinstance(self._parameter_list[0], dict) + for idx, param in enumerate(self._parameter_list): + param = self._slice_params[param.name] + if param._is_initialized(): + pname_to_paramindex[param.name] = idx + self._grad_clip.pname_to_paramindex = pname_to_paramindex + self._grad_clip.num_params = len(self._parameter_list) + self._grad_clip.sharding_stage1_v2 = True + + DygraphShardingOptimizerV2._assign_slice_grad = _assign_slice_grad + logger.info( + "Hack DygraphShardingOptimizerV2._assign_slice_grad 
for ClipGradByAdaptiveNorm" + ) + + grad_clip = ClipGradByAdaptiveNorm( + clip_ratio=self.args.adaptive_norm_clip_ratio, + start_clip_steps=self.args.adaptive_norm_start_clip_steps, + shard_clip=self.args.adaptive_norm_shard_clip, + enable_record=self.args.adaptive_norm_enable_record, + enable_record_clip_history=self.args.adaptive_norm_enable_record_clip_history, + verbose=self.args.adaptive_norm_verbose, + ) + logger.info("using ClipGradByAdaptiveNorm") + elif ( + self.args.use_moe + and not self.args.use_hybrid_parallel + and not self.args.enable_auto_parallel + ): + logger.info("using moe Global clip") + + def expert_fn(p): + return getattr(p, "no_sync", False) + + grad_clip = ClipGradForMOEByGlobalNorm( + self.args.max_grad_norm, + is_expert_param_func=expert_fn, + moe_group=_get_global_group(), # None 为全局通信组, + local_clip=False, + ) + else: + grad_clip = ( + nn.ClipGradByGlobalNorm(self.args.max_grad_norm) + if self.args.max_grad_norm > 0 + else None + ) + + self.static_name_to_dyg_name = { + p.name: n for n, p in self.model.state_dict().items() + } + gate_pattern = re.compile(r"ernie\.layers\.0\.mlp\.gate\.weight") + vit_pattern = re.compile( + r"vision_model\.(cls_token|pos_embed|patch_embed|blocks)" + ) + vit_blocks_pattern = re.compile(r"vision_model\.blocks\.(\d+)\.") + + def lr_ratio_fn(param): + if param.name in self.static_name_to_dyg_name.keys(): + name = self.static_name_to_dyg_name[param.name] + # logger.info(f'search {param.name} -> {name}') + if self.args.moe_gate_lr_ratio is not None and gate_pattern.match( + name + ): + logger.info( + f"apply moe_gate_lr_ratio to {name}, ratio={self.args.moe_gate_lr_ratio}" + ) + return float(self.args.moe_gate_lr_ratio) + elif self.args.vit_lr_ratio is not None and vit_pattern.match(name): + n_layers = self.model.config.vision_config.layers + if vit_blocks_pattern.match(name): + layer_id = int(vit_blocks_pattern.match(name).group(1)) + else: + layer_id = 0 + lr_ratio = self.args.vit_lr_ratio ** (n_layers - 1 - 
def save_model(self, output_dir=None):
    """Save the model and, on saving ranks, the parameter-name mapping.

    After delegating to the parent ``save_model``, ranks with
    ``args.should_save`` write ``self.static_name_to_dyg_name`` as JSON to
    ``static_name_to_dyg_name.json`` in the output directory.

    Args:
        output_dir: Target directory. ``None`` falls back to
            ``self.args.output_dir`` for the mapping file.
    """
    super().save_model(output_dir)
    if not self.args.should_save:
        return

    # os.path.join(None, ...) raises TypeError, so resolve the default here.
    # NOTE(review): the parent is assumed to use the same fallback - confirm.
    target_dir = self.args.output_dir if output_dir is None else output_dir
    mapping_path = os.path.join(target_dir, "static_name_to_dyg_name.json")
    with open(mapping_path, "w") as of:
        of.write(json.dumps(self.static_name_to_dyg_name))
"labels"] + return dist_loader diff --git a/examples/pre-training/ernie/src/utils/__init__.py b/examples/pre-training/ernie/src/utils/__init__.py index edcdc529..121653ca 100644 --- a/examples/pre-training/ernie/src/utils/__init__.py +++ b/examples/pre-training/ernie/src/utils/__init__.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .logging import logger, setup_logger_output_file +# from .logging import logger, setup_logger_output_file -__all__ = ['logger', 'setup_logger_output_file'] +# __all__ = ['logger', 'setup_logger_output_file'] + +from .data_utils import * # noqa +from .logging import * # noqa +from .seed_utils import * # noqa +from .training_utils import * # noqa diff --git a/examples/pre-training/ernie/src/utils/data_utils.py b/examples/pre-training/ernie/src/utils/data_utils.py new file mode 100644 index 00000000..6f49e4e7 --- /dev/null +++ b/examples/pre-training/ernie/src/utils/data_utils.py @@ -0,0 +1,218 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def pad_sequence(sequences, padding_value=0, fix_len=None):
    """Stack variable-length numpy arrays into one padded array.

    Args:
        sequences: Non-empty list of ``np.ndarray`` that share every dimension
            except the first.
        padding_value: Value written into the unused positions.
        fix_len: When given, the length of axis 1 of the result; longer inputs
            are truncated (a warning is logged), shorter ones are padded.

    Returns:
        Array of shape ``(len(sequences), length, *trailing_dims)`` with the
        dtype of ``sequences[0]``.
    """
    # Only numpy is used here; the original notes that paddle.Tensor in a
    # collate function leaks across worker processes.
    first = sequences[0]
    tail_shape = tuple(first.shape[1:])

    longest = max(seq.shape[0] for seq in sequences)
    target_len = longest
    if fix_len is not None:
        if fix_len < longest:
            logger.warning(f"truncating example from {longest} to {fix_len}")
        target_len = fix_len

    padded = np.full(
        (len(sequences), target_len) + tail_shape, padding_value, dtype=first.dtype
    )
    for row, seq in enumerate(sequences):
        clipped = seq[:target_len]
        padded[row, : clipped.shape[0], ...] = clipped
    return padded
def merge_fn_group_batch(
    tokenizer,
    batch,
    pad_to_max_seqlen=None,
    debug_print=1,
    shift_label=False,
    combine_batch: int = 1,
    image_dtype="bfloat16",
    doc_pack_attn=False,
):
    """Collate a list of example dicts into one dict of numpy arrays.

    Every ``combine_batch`` consecutive examples are first merged into one
    (n-in-one inside the batch); the merged examples are then padded and
    stacked key by key.

    Args:
        tokenizer: Provides ``pad_token_id`` and ``ignored_index`` as padding
            values.
        batch: Non-empty list of dicts that share the keys of ``batch[0]``.
        pad_to_max_seqlen: Fixed sequence length to pad to; ``None`` pads to the
            longest example. Increased by one when ``shift_label`` is set.
        debug_print: Number of batches dumped through ``print_data_online``.
        shift_label: Drop the first label column and the last input column.
        combine_batch: Number of consecutive examples merged into one.
        image_dtype: dtype the concatenated ``images`` are cast to; the
            function asserts it is not ``"bfloat16"`` when images are present.
        doc_pack_attn: Also emit ``inbatch_pack_offset`` document offsets.

    Returns:
        Dict mapping each key to a padded array, or ``None`` where no data was
        available. Keys handled by the generic padding branch are omitted when
        their first value is ``None``.
    """
    global DEBUG_PRINT_CNT
    bsz = len(batch)
    if pad_to_max_seqlen and shift_label:
        pad_to_max_seqlen += 1

    # With no fixed length requested, derived lengths must stay None as well;
    # arithmetic on None would raise TypeError.
    if pad_to_max_seqlen is None:
        token_type_fix_len = None
        audio_fix_len = None
    else:
        token_type_fix_len = pad_to_max_seqlen + 1
        audio_fix_len = pad_to_max_seqlen * bsz

    keys = list(batch[0].keys())

    if combine_batch > 1:
        _batch = []
        for group in [
            batch[i : i + combine_batch] for i in range(0, len(batch), combine_batch)
        ]:
            # Examples merged together must come from a single source.
            if "src_id" in group[0]:
                src_lst = list(set([b["src_id"] for b in group]))
                assert len(src_lst) == 1, f"src_lst: {src_lst}"

            item = {}
            for k in keys:
                if group[0][k] is None:
                    item[k] = None
                    continue
                if isinstance(group[0][k], (int, float)):
                    item[k] = np.stack([i[k] for i in group], 0)
                else:
                    item[k] = np.concatenate([i[k] for i in group])
            _batch.append(item)
        batch = _batch

    ret = {}
    for k in keys:
        if isinstance(batch[0][k], (int, float)):
            ret[k] = np.stack([b[k] for b in batch], 0)
        elif k in ["src_id", "data_id", "data_type"]:
            ret[k] = np.concatenate([b[k] for b in batch])
        elif k == "images":
            to_concat = [b[k] for b in batch if b[k] is not None]
            if len(to_concat) != 0:
                assert (
                    image_dtype != "bfloat16"
                ), f"Currently, not support {image_dtype} for numpy"
                ret[k] = np.concatenate(to_concat, axis=0).astype(image_dtype)
            else:
                ret[k] = None
        elif k == "grid_thw" and batch[0][k] is not None:
            ret[k] = np.concatenate([b[k] for b in batch], axis=0).astype("int64")
            if pad_to_max_seqlen:
                # Pad with all-zero rows up to pad_to_max_seqlen * bsz rows.
                tmp = max(0, pad_to_max_seqlen * bsz - ret[k].shape[0])
                if tmp > 0:
                    ret[k] = np.concatenate(
                        [ret[k], np.zeros([tmp, 3])], axis=0
                    ).astype("int64")
        elif k in ["audio_input_ids", "audio_labels"]:
            to_concat = [b[k] for b in batch if b[k] is not None]
            if len(to_concat) != 0:
                concat_audio_ids = smart_concat(to_concat)
                assert (
                    len(concat_audio_ids.shape) == 2
                ), "拼接完的audio_ids必须是2维tensor,且shape=[sum(frames), depth]"
                ret[k] = pad_sequence(
                    [concat_audio_ids],
                    padding_value=tokenizer.ignored_index,
                    fix_len=audio_fix_len,
                )[0]
                assert (
                    len(ret[k].shape) == 2
                ), "padding完的audio_ids 必须是2维tensor,且shape=[bsz*pad_to_max_seqlen, depth]"
            else:
                ret[k] = None
        else:
            if k == "input_ids":
                pad_value = tokenizer.pad_token_id
            elif k == "labels" or k == "image_type_ids":
                pad_value = tokenizer.ignored_index
            elif k == "token_type_ids":
                pad_value = 0  # pad is also considered as text
            else:
                pad_value = 0

            if batch[0][k] is not None:
                ret[k] = pad_sequence(
                    [b[k] for b in batch],
                    padding_value=pad_value,
                    # token_type_ids is padded one position longer than the rest
                    fix_len=(
                        pad_to_max_seqlen
                        if k != "token_type_ids"
                        else token_type_fix_len
                    ),
                )

    batch = ret

    if DEBUG_PRINT_CNT < debug_print:
        DEBUG_PRINT_CNT += 1
        for k, v in batch.items():
            if v is not None and v.dtype == np.float32:  # do not show image
                v = v.shape
            print_data_online(
                f"Example={DEBUG_PRINT_CNT} key={k}, "
                f"len={len(v[0]) if isinstance(v, np.ndarray) and v.ndim > 1 else 0}, "
                f"value={v}"
            )

    if shift_label:
        batch["labels"] = batch["labels"][:, 1:]
        batch["input_ids"] = batch["input_ids"][:, :-1]

    if doc_pack_attn:
        # NOTE(review): token id 2 is presumably the end-of-document marker -
        # confirm against the tokenizer.
        doc_marks = (batch["input_ids"] == 2).astype(np.int64)
        doc_marks[:, -1] = 1  # the end of every row always closes a document
        _offset = np.where(doc_marks.reshape([-1]))[0]
        _offset = (_offset + 1).tolist()
        offset = np.expand_dims(np.array([0] + _offset, dtype=np.int64), axis=0)
        offset = pad_sequence(
            offset, padding_value=-1, fix_len=batch["input_ids"].shape[1]
        )
        batch["inbatch_pack_offset"] = offset

    return batch
def server_loop(init_func, server_idx, server_num, init_queue, send_queue, recv_queue):
    """Serve attribute and method requests against one object.

    ``init_func(server_idx, server_num)`` builds the served object. The outcome
    is reported once on ``init_queue``; on failure the function returns without
    serving. Afterwards requests are read from ``send_queue`` until an
    ``ExitFlag`` arrives, and one ``Response`` per request is put on
    ``recv_queue``.

    Args:
        init_func: Callable that creates the served object.
        server_idx: Index passed to ``init_func``.
        server_num: Server count passed to ``init_func``.
        init_queue: Queue that receives the single start-up ``Response``.
        send_queue: Queue the requests are read from.
        recv_queue: Queue the responses are written to.
    """
    try:
        init_obj = init_func(server_idx, server_num)
        init_queue.put(
            Response(
                tag=ResponseTag.SUCCESS, exception=None, value=ServerStatus.RUNNING
            )
        )
    except Exception as e:
        logger.exception(e)
        init_queue.put(
            Response(
                tag=ResponseTag.FAILURE,
                exception=e,
                value=ServerStatus.EXIT_WITH_FAILURE,
            )
        )
        return

    while True:
        request = send_queue.get()
        if isinstance(request, ExitFlag):
            break

        try:
            value = getattr(init_obj, request.name)
            # A MethodRequest calls the attribute; any other request returns it.
            if isinstance(request, MethodRequest):
                args = request.args or tuple()
                kwargs = request.kwargs or dict()
                value = value(*args, **kwargs)
            response = Response(tag=ResponseTag.SUCCESS, exception=None, value=value)
        except Exception as e:
            # Deliberately broad: the error is shipped back to the caller, and
            # logged here with its traceback like the start-up failure above.
            logger.exception("Exception inside process: %s", e)
            response = Response(tag=ResponseTag.FAILURE, exception=e, value=None)

        recv_queue.put(response)
class IPCServer:
    """Route attribute and method requests to per-group ``SubIPCServer`` processes."""

    def __init__(self, router_groups, init_funcs):
        """Start one ``SubIPCServer`` per router group.

        Args:
            router_groups: One iterable of router keys per server.
            init_funcs: One initializer per server, same length as
                ``router_groups``.

        Raises:
            AssertionError: If the lengths differ, no server is requested, or a
                router key appears in more than one group.
        """
        server_num = len(init_funcs)
        group_num = len(router_groups)
        assert server_num == group_num, f"{server_num} vs {group_num}"
        assert (
            server_num > 0
        ), f"server_num should be larger than 0, but got {server_num}"
        self.router_map = {}
        self.sub_servers = [None] * server_num
        for i, (group, init_func) in enumerate(zip(router_groups, init_funcs)):
            sub_server = SubIPCServer(i, server_num, init_func)
            # Keep a handle on every sub server; close() iterates this list.
            self.sub_servers[i] = sub_server
            for router_key in group:
                if router_key in self.router_map:
                    prev_idx = self.router_map[router_key].server_idx
                    assert prev_idx == i, f"{router_key}: {prev_idx} vs {i}"
                else:
                    self.router_map[router_key] = sub_server

    def _response(self, request):
        """Send ``request`` to the server owning its router key and unwrap the reply.

        Raises:
            KeyError: If the router key is unknown.
            Exception: Whatever exception the response carries.
        """
        server = self.router_map[request.router_key]
        response = server.response(request)
        if response.exception is not None:
            raise response.exception
        return response.value

    def call(self, router_key, name, args=tuple(), kwargs=None):
        """Call method ``name`` on the object served for ``router_key``.

        Args:
            router_key: Key selecting the sub server.
            name: Method name.
            args: Positional arguments.
            kwargs: Keyword arguments; ``None`` means no keyword arguments.

        Returns:
            The value carried by the response.
        """
        request = MethodRequest(
            router_key=router_key,
            name=name,
            args=args,
            kwargs={} if kwargs is None else kwargs,
        )
        return self._response(request)

    def attr(self, router_key, name):
        """Return attribute ``name`` of the object served for ``router_key``."""
        request = AttrRequest(router_key=router_key, name=name)
        return self._response(request)

    def close(self):
        """Close every sub server that was started."""
        for server in self.sub_servers:
            if server is not None:
                server.close()
index 00000000..f552d11f --- /dev/null +++ b/examples/pre-training/model_configs_auto/model_config.json @@ -0,0 +1,66 @@ +{ + "architectures": [ + "ErnieForCausalLM" + ], + "bos_token_id": 0, + "eos_token_id": 1, + "hidden_act": "silu", + "hidden_size": 8192, + "intermediate_size": 28672, + "initializer_range": 0.00482174, + "max_sequence_length": 4096, + "max_position_embeddings": 4096, + "model_type": "ernie_pp", + "num_attention_heads": 64, + "num_key_value_heads": 8, + "num_hidden_layers": 4, + "pad_token_id": -1, + "rms_norm_eps": 1e-05, + "torch_dtype": "float16", + "transformers_version": "4.27.0.dev0", + "use_cache": true, + "vocab_size": 100352, + "rope_theta": 10000, + "use_recompute": false, + "use_recompute_attn": false, + "use_recompute_moe": false, + "use_recompute_loss_fn": false, + "use_rmsnorm": true, + "fuse_rms_norm": true, + "use_bias": false, + "use_fast_ln": true, + "fuse_attn_ffn": true, + "fuse_linear": true, + "rope_reorder": false, + "fuse_rope": true, + "fuse_swiglu": true, + "fuse_gate_detach_matmul": true, + "remove_tail_layer": 2, + "refined_recompute": { + "mlp_row_ln": -1, + "flash_attn": -1, + "attention_row_ln": -1, + "attention_column_ln": 2, + "mlp_column_ln": 0 + }, + "moe_num_experts": 16, + "moe_num_shared_experts": 0, + "moe_layer_start_index": 2, + "moe_group_experts": false, + "moe_intermediate_size": 3584, + "moe_capacity": [8,8,8], + "moe_gate": "top2_fused", + "moe_gate_scale": false, + "moe_gate_detach": 1.0, + "moe_k": 8, + "moe_aux_loss_lambda": 1e-5, + "moe_group_orthogonal_loss": true, + "moe_orthogonal_loss_lambda": 0.0, + "moe_z_loss_lambda": 0.0, + "moe_layer_interval": 1, + "z_loss_lambda": 0, + "using_precision_check": false, + "use_ep_comm_overlap": true, + "moe_use_all2all": true, + "tie_word_embeddings": true +} diff --git a/examples/pre-training/models/aadiff_decorator.py b/examples/pre-training/models/aadiff_decorator.py new file mode 100644 index 00000000..64b7aa63 --- /dev/null +++ 
def get_md5(tensors):
    """
    Get MD5 of tensor, list of tensors or the combination of them.

    ``None`` maps to ``None``, a list/tuple maps to a list of the MD5s of its
    items (recursively), and a ``paddle.Tensor`` maps to ``_md5sum()``.

    Raises:
        ValueError: for any other input type.
    """
    if tensors is None:
        return None
    if isinstance(tensors, (list, tuple)):
        return [get_md5(t) for t in tensors]
    if isinstance(tensors, paddle.Tensor):
        return tensors._md5sum()
    raise ValueError(tensors)


def check_aadiff(ntimes=None):
    """
    The AADiff decorator.

    When ``ntimes > 0`` the wrapped function is first executed ``ntimes``
    times under ``paddle.no_grad()`` and the MD5 of every result is compared
    with the first one. Afterwards the function is called once more outside
    ``no_grad`` and that result is returned to the caller.

    Args:
        ntimes: number of verification runs. ``None`` reads the
            ``AADIFF_TIMES`` environment variable (default ``"0"``) once,
            when the decorator is created.

    Raises:
        AssertionError: (from the wrapped call) if two verification runs
            produce different MD5s.
    """
    if ntimes is None:
        ntimes = int(os.getenv("AADIFF_TIMES", "0"))

    @decorator.decorator
    def __impl__(_func, *args, **kwargs):
        if ntimes > 0:
            with paddle.no_grad():
                old_md5 = None
                for idx in range(ntimes):
                    ret = _func(*args, **kwargs)
                    print("AADiff Pass {}/{} ...".format(idx, ntimes))
                    cur_md5 = get_md5(ret)
                    # Drop the result before the next run.
                    del ret
                    if old_md5 is None:
                        old_md5 = cur_md5
                    elif old_md5 != cur_md5:
                        # Raise explicitly: a bare `assert` is stripped under
                        # `python -O`, which would silently disable the check.
                        raise AssertionError(
                            "Rank {} has aadiff".format(
                                paddle.distributed.get_rank()
                            )
                        )

        return _func(*args, **kwargs)

    return __impl__
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Paddle Ernie model""" +import math +from functools import partial +import logging +from typing import Optional, Tuple +import contextlib +import inspect + +try: + from fast_ln import fast_ln +except ImportError: + fast_ln = None + +from copy import deepcopy +from dataclasses import dataclass +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed import fleet +from paddle.distributed.fleet.utils import recompute +from paddle.distributed.fleet.layers.mpu.random import get_rng_state_tracker +from paddle.incubate.nn.memory_efficient_attention import ( + memory_efficient_attention, + BlockDiagonalCausalMask, +) +from paddle.distributed import in_auto_parallel_align_mode + +from models.comm_utils import subbatch + +from models.moe.top2_gate_auto_auto import Top2Gate +from models.moe.top2_gate_auto import TopKGateFusedAuto + + +# from src/ops which is install in build_envs + +from paddleformers.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) + +from paddleformers.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions as _BaseModelOutput, +) +from paddleformers.transformers.model_outputs import CausalLMOutputWithCrossAttentions + +from paddleformers.transformers.model_utils import PretrainedModel, register_base_model + +from models.ernie.modeling import FusedDropoutImpl +from models.sequence_parallel_utils_auto import ( + sequence_parallel_sparse_mask_labels, +) +from models.moe.moe_layer_auto import ( 
+ MOELayerAuto, +) +from .configuration import ErnieMoEConfig +from models.moe.moe_utils_auto import get_mesh + +# Because param_name is generated based on the class name, +# when changes in distributed strategies result in class modifications, +# there may be mismatches during parameter loading. +# You can achieve class name changes by importing the following environment variables. +# Example: `export rowcol_parallel_linear_class_name_convert_map="tpsp->smp"` + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput): + + router_loss: Optional[paddle.Tensor] = None + gate_logits: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): + + router_loss: Optional[paddle.Tensor] = None + + +logger = logging.getLogger(__name__) + +try: + from paddle.nn.functional.flash_attention import flash_attention + + logger.warning( + "Use flash attention in scaled-dot-product. Attention mask is deprecated" + ) +except (ImportError, ModuleNotFoundError): + flash_attention = None + +try: + from paddle.nn.functional.flash_attention import flash_attention_with_mask +except (ImportError, ModuleNotFoundError): + try: + from paddle.nn.functional.flash_attention import ( + scaled_dot_product_attention as flash_attention_with_mask, + ) + except (ImportError, ModuleNotFoundError): + logger.warning( + "flash_attention_with_mask not found. Use FleetY8.2 SFT instead." + ) + flash_attention_with_mask = None + +try: + from paddle.nn.functional.flash_attention import flash_attention_with_sparse_mask +except (ImportError, ModuleNotFoundError): + logger.warning("flash_attention_with_sparse_mask not found. Use FleetY8.9 instead.") + flash_attention_with_sparse_mask = None + +try: + from to_block_diag_causal_mask import to_block_diag_causal_mask +except (ImportError, ModuleNotFoundError): + logger.warning("to_block_diag_causal_mask not found. 
Use FleetY8.2 SFT instead.") + to_block_diag_causal_mask = None + +try: + import fused_ln as fused +except ImportError: + logger.warning( + "fused-ln not found, run `python src/ops/fused_ln_setup.py install` to build fused ln" + ) + fused = None + +try: + from paddle.incubate.nn.functional import ( + fused_rotary_position_embedding as fused_rope, + ) +except (ImportError, ModuleNotFoundError): + logger.warning("fused_rotary_position_embedding not found") + fused_rope = None + +try: + from paddle.incubate.nn.functional import swiglu as fused_swiglu +except (ImportError, ModuleNotFoundError): + fused_swiglu = None + + +ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +__all__ = [ + "ErnieModelAuto", + "ErniePretrainedModelAuto", + "ErnieForCausalLMAuto", +] + + +gate_class = dict( + top2=Top2Gate, + top2_fused=TopKGateFusedAuto, +) + + +def is_pp_enable(): + + mesh = fleet.auto.get_mesh() + return "pp" in mesh.dim_names + + +def global_mesh_starts_with_pp(): + + mesh = fleet.auto.get_mesh() + if is_pp_enable(): + return mesh.get_mesh_with_dim("pp") + else: + return mesh + + +def is_fleety_func(): + """ + Check whether it is PaddlePaddle FleetY version. 
def get_triangle_upper_mask(x, mask=None):
    """
    Return ``mask`` if given, otherwise build a strictly-upper-triangular
    ``-inf`` mask shaped like ``x`` with dimension 1 collapsed to size 1.
    """
    if mask is not None:
        return mask
    # x: [bsz, n_head, q_len, kv_seq_len] -> mask: [bsz, 1, q_len, kv_seq_len]
    mask_shape = x.shape
    mask_shape[1] = 1
    neg_inf = paddle.full(mask_shape, -np.inf, dtype=x.dtype)
    neg_inf.stop_gradient = True
    upper = paddle.triu(neg_inf, diagonal=1)
    upper.stop_gradient = True
    return upper


def naive_fuse_split_tp(
    weight,
    tensor_parallel_degree,
    tensor_parallel_rank=None,
    is_column=True,
    fuse_tensor_parts=2,
):
    """
    Pick one tensor-parallel rank's share out of a fused weight.

    The weight is cut into ``fuse_tensor_parts * tensor_parallel_degree``
    equal chunks along the last axis (``is_column``) or the first axis, and
    every ``tensor_parallel_degree``-th chunk starting at
    ``tensor_parallel_rank`` is concatenated back together. A rank of
    ``None`` starts the stride at the first chunk.
    """
    logging.info(f"spliting fused-ffn: {weight.shape}")
    split_axis = -1 if is_column else 0
    num_chunks = fuse_tensor_parts * tensor_parallel_degree
    chunks = np.split(weight, num_chunks, axis=split_axis)
    own_chunks = chunks[tensor_parallel_rank::tensor_parallel_degree]
    return np.concatenate(own_chunks, axis=split_axis)


def parallel_matmul(
    x,
    y,
    bias=None,
    transpose_y=False,
    tensor_parallel_degree=1,
    tensor_parallel_output=True,
):
    """
    Compute ``x @ y`` (optionally with ``y`` transposed) plus ``bias``.

    When ``tensor_parallel_degree > 1`` and ``tensor_parallel_output`` is
    false, the result is resharded on ``get_mesh(-1)`` to
    ``[Shard(0), Replicate()]``.
    """
    if not transpose_y:
        logits = F.linear(x, y, bias)
    else:
        logits = paddle.matmul(x, y, transpose_y=True)
        if bias is not None:
            logits += bias

    needs_reshard = tensor_parallel_degree > 1 and not tensor_parallel_output
    if needs_reshard:
        logits = dist.reshard(logits, get_mesh(-1), [dist.Shard(0), dist.Replicate()])

    return logits
def finfo(dtype: paddle.dtype = None):
    """
    Return a float-info object for ``dtype`` (paddle default dtype if None).

    float16/float32/float64 return the matching ``np.finfo``; bfloat16 returns
    a stand-in class that only exposes ``min``. Any other dtype falls through
    and yields ``None``.
    """
    if dtype is None:
        dtype = paddle.get_default_dtype()

    if dtype == paddle.bfloat16:

        class BFloatFInfo:
            """
            Numpy do not support `np.finfo(np.uint16)`, so try to construct a finfo object to fetch min value
            """

            min = -3.3895313892515355e38

        return BFloatFInfo

    # Checked in the same order as the original if-chain.
    numpy_equivalents = (
        (paddle.float32, np.float32),
        (paddle.float16, np.float16),
        (paddle.float64, np.float64),
    )
    for paddle_dtype, numpy_dtype in numpy_equivalents:
        if dtype == paddle_dtype:
            return np.finfo(numpy_dtype)
    return None


def masked_fill(x, mask, value):
    """
    Return a tensor equal to ``x`` except that positions where ``mask`` is
    true hold ``value``.
    """
    filler = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, filler, x)
def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offset):
    """convert inbatch_pack_offset to attn_mask_start_row_indices

    For every batch row the ``-1`` entries are dropped, and each remaining
    offset after the first is repeated by its distance to the previous one.
    Returns the stacked result as an int32 tensor, together with the smallest
    second offset seen across rows as a Python int.
    """
    offsets = inbatch_pack_offset.numpy()
    per_row_indices = []
    min_start_row = np.inf
    for row in offsets:
        boundaries = row[row != -1]
        ends = boundaries[1:]
        lengths = ends - boundaries[:-1]
        min_start_row = min(boundaries[1], min_start_row)
        # Add two leading axes so rows concatenate along the batch axis.
        per_row_indices.append(np.repeat(ends, lengths)[None, None, ...])
    stacked = np.concatenate(per_row_indices, axis=0)
    indices = paddle.to_tensor(stacked, dtype=paddle.int32)
    return indices, int(min_start_row)
can_use_fa = config.use_flash_attn and flash_attention is not None + can_use_fa_sparse_mask = ( + config.use_mem_eff_attn + and inbatch_pack_offset is not None + and flash_attention_with_sparse_mask is not None + ) + + if not can_use_fa and not can_use_fa_sparse_mask: + if query_states.shape[-2] != key_states.shape[-2]: + key_states = key_states.repeat_interleave( + num_heads // num_key_value_heads, axis=-2 + ) + if query_states.shape[-2] != value_states.shape[-2]: + value_states = value_states.repeat_interleave( + num_heads // num_key_value_heads, axis=-2 + ) + + if can_use_fa: + if rr_flash_attn is not None: + attn_output, attn_weights = rr_flash_attn( + query_states, + key_states, + value_states, + dropout=config.attention_probs_dropout_prob, + causal=is_causal and query_states.shape[1] != 1, + return_softmax=output_attentions, + ) + else: + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + dropout=config.attention_probs_dropout_prob, + causal=is_causal and query_states.shape[1] != 1, + return_softmax=output_attentions, + ) + + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return attn_output, attn_weights + elif config.use_mem_eff_attn and inbatch_pack_offset is not None: + assert ( + not output_attentions + ), "output_attentions should be False when use_mem_eff_attn=True" + if config.use_flash_attn_with_mask: + if flash_attention_with_sparse_mask is not None: + causal_mask_indices, attn_mask_min_start_row = ( + inbatch_pack_offset_to_attn_mask_start_row_indices( + inbatch_pack_offset + ) + ) + if IS_FLEETY: + kwargs = { + "causal": True, + "dropout": config.attention_probs_dropout_prob, + } + else: + kwargs = { + "is_causal": True, + "dropout_p": config.attention_probs_dropout_prob, + } + attn_output = flash_attention_with_sparse_mask( + query_states.astype(value_states.dtype), + key_states.astype(value_states.dtype), + value_states.astype(value_states.dtype), + 
attn_mask_start_row_indices=causal_mask_indices, + attn_mask_start_row=attn_mask_min_start_row, + **kwargs, + ) + else: + attn_mask = to_block_diag_causal_mask( + inbatch_pack_offset, q_len, float("-inf"), "bfloat16" + ) + attn_output = flash_attention_with_mask( + query_states, + key_states, + value_states, + attn_mask, + config.attention_probs_dropout_prob, + ) + else: + attn_output = mem_eff_attn( + query_states, + key_states, + value_states, + inbatch_pack_offset, + drop_prob=config.attention_probs_dropout_prob, + ) + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return attn_output, None + else: + + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) / math.sqrt( + head_dim + ) + # merge with the next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + # Pipeline 的Attention mask不能从外面传。 + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + if training: + attn_weights = attention_mask + attn_weights + attn_weights = paddle.maximum( + attn_weights, + paddle.to_tensor( + float(finfo(query_states.dtype).min), dtype=query_states.dtype + ), + ) + + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + attn_weights = F.softmax( + attn_weights, axis=-1, dtype="float32" + ).astype(query_states.dtype) + else: + attn_weights = F.softmax(attn_weights, axis=-1, 
def _make_causal_mask(input_ids_shape, past_key_values_length, dtype):
    """
    Make causal mask used for self-attention.

    Builds a ``[target, target]`` matrix filled with ``finfo(dtype).min``,
    zeroes every entry whose column index is not greater than its row index,
    prepends ``past_key_values_length`` zero columns when that is positive,
    and expands the result to ``[batch, 1, target, target + past]``.
    """
    batch_size, target_length = input_ids_shape

    min_value = float(finfo(dtype).min)
    mask = paddle.full((target_length, target_length), min_value)

    width = mask.shape[-1]
    positions = paddle.arange(width)
    # visible[i][j] is true when j < i + 1, i.e. key j is not after query i.
    visible = positions < (positions + 1).reshape([width, 1])
    mask = masked_fill(mask, visible, 0)

    if past_key_values_length > 0:
        past_columns = paddle.zeros([target_length, past_key_values_length])
        mask = paddle.concat([past_columns, mask], axis=-1)

    expanded_shape = [
        batch_size,
        1,
        target_length,
        target_length + past_key_values_length,
    ]
    return mask[None, None, :, :].expand(expanded_shape)
def slice_experts(experts, moe_world_size):
    """
    Bucket ``experts`` into ``moe_world_size`` consecutive groups of
    ``len(experts) // moe_world_size`` and return an ``nn.LayerList`` holding
    the first half (floor) of every group, in group order.
    """
    per_device = len(experts) // moe_world_size
    buckets = [[] for _ in range(moe_world_size)]

    for position, expert in enumerate(experts):
        buckets[position // per_device].append(expert)

    keep = per_device // 2
    lm_experts = nn.LayerList([])
    for bucket in buckets:
        lm_experts.extend(bucket[:keep])
    return lm_experts
def _parse_moe_group(moe_group: str):
    """
    Lower-case ``moe_group``, check it is one of ``dp`` / ``mp`` / ``none``,
    log the choice and return the normalized string.
    """
    normalized = moe_group.lower()
    supported = {"dp", "mp", "none"}
    assert normalized in supported, f"moe-group not supported, got: {normalized}"
    logger.info(f"using moe-group: {normalized}")

    return normalized
class LayerNorm(nn.LayerNorm):
    """
    Layer normalization over ``config.hidden_size`` with an optional
    ``fast_ln`` kernel.
    """

    def __init__(self, config, ipp=0):
        """
        Args:
            config: read for ``hidden_size``, ``rms_norm_eps`` (used as the
                epsilon), ``use_fast_ln`` and ``pipeline_parallel_degree``.
            ipp: index passed to ``get_mesh`` when placing the parameters.
        """
        super().__init__(config.hidden_size, epsilon=config.rms_norm_eps)

        self.use_fast_ln = config.use_fast_ln
        if self.use_fast_ln:
            assert fast_ln is not None
        self.ipp = ipp
        if config.pipeline_parallel_degree > 1:
            # Replicate weight, then bias, on the mesh selected by ``ipp``.
            for param_name in ("weight", "bias"):
                placed = dist.shard_tensor(
                    getattr(self, param_name),
                    get_mesh(self.ipp),
                    [dist.Replicate(), dist.Replicate()],
                )
                setattr(self, param_name, placed)

    def forward(self, hidden_states):
        """
        The layer normalization operator.
        """
        if not self.use_fast_ln:
            return super().forward(hidden_states)
        return fast_ln(hidden_states, self.weight, self.bias, self._epsilon)[0]
+ """ + + def __init__(self, config, ipp=0): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.bias = paddle.create_parameter( + shape=[self.hidden_size], dtype=paddle.get_default_dtype(), is_bias=True + ) + self.variance_epsilon = config.rms_norm_eps + self.ipp = ipp + if config.pipeline_parallel_degree > 1: + self.weight = dist.shard_tensor( + self.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] + ) + self.bias = dist.shard_tensor( + self.bias, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] + ) + + def forward(self, hidden_states): + + return fused.fused_ln( + hidden_states, self.weight, self.bias, self.variance_epsilon + )[0] + + +class RotaryEmbedding(nn.Layer): + r""" + RotaryEmbedding Layer + """ + + def __init__(self, dim, max_position_embeddings=4096, base=10000): + + super().__init__() + # dtype = paddle.get_default_dtype() + self.base = base + self.max_position_embeddings = max_position_embeddings + inv_freq = 1.0 / ( + base ** (paddle.cast(paddle.arange(0, dim, 2), dtype="float32") / dim) + ) + + # self.register_buffer("inv_freq", inv_freq.cast(dtype)) + + # higher acc using float32 + t = paddle.arange(max_position_embeddings, dtype="float32") + freqs = paddle.einsum("i,j->ij", t, inv_freq.cast("float32")) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = paddle.concat([freqs, freqs], axis=-1) + + # [bs, seqlen, nhead, head_dim] + self.cos_cached = emb.cos() # [None, :, None, :] # .astype(dtype) + self.sin_cached = emb.sin() # [None, :, None, :] # .astype(dtype) + + self._cast_to_low_precision = False # 兼容develop分支paddle + self._cast_to_low_precison = False + + def forward(self, x, seq_len=None): + + return ( + self.cos_cached[:seq_len, :], + self.sin_cached[:seq_len, 
:], + ) + + @classmethod + def rotate_half(cls, x): + """Rotates half the hidden dims of the input.""" + + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) + + @classmethod + def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, position_ids=None): + """doc""" + if position_ids is not None: + # logger.info(f'applying pos:{position_ids}') + assert offset == 0, offset + cos = F.embedding(position_ids, cos) + sin = F.embedding(position_ids, sin) + else: + cos = cos.unsqueeze(0) + sin = sin.unsqueeze(0) + cos = cos[:, offset : q.shape[1] + offset, None, :] + sin = sin[:, offset : q.shape[1] + offset, None, :] + + q_embed = paddle.add( + paddle.multiply(q, cos), paddle.multiply(cls.rotate_half(q), sin) + ) + k_embed = paddle.add( + paddle.multiply(k, cos), paddle.multiply(cls.rotate_half(k), sin) + ) + q_embed = q_embed.astype(q.dtype) # fp32->bf16 + k_embed = k_embed.astype(k.dtype) + return q_embed, k_embed + + +class RopeEmbeddingLegacy(nn.Layer): + + def __init__(self, head_dim, compression_ratio=1.0, base=10000): + super().__init__() + self.head_dim = head_dim + self.compression_ratio = compression_ratio + self.base = base + + def forward(self, seq_length, position_ids=None): + + indices = paddle.arange(0, self.head_dim, 2, dtype="float32") + indices = 1 / self.base ** (indices / self.head_dim) + if position_ids is None: + position_ids = paddle.arange(0, seq_length, 1, dtype="float32").unsqueeze(1) + position_ids = position_ids / self.compression_ratio + sinusoid_inp = position_ids * indices.unsqueeze(0) + else: + position_ids = position_ids / self.compression_ratio + seq_length = position_ids.shape[-1] + sinusoid_inp = position_ids.unsqueeze(-1).astype( + "float32" + ) * indices.unsqueeze( + 0 + ) # [b, s, 1] * [1, d/2] -> [b, s, d/2] + pos_emb = paddle.concat( + [paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1 + ) + pos_emb = paddle.reshape(pos_emb, (-1, 1, seq_length, 
self.head_dim)) + pos_emb.stop_gradient = True + return pos_emb + + def apply_rotary(self, rp, q, k): + + # sin [sequence_length, embed_size_per_head//2] + # cos [sequence_length, embed_size_per_head//2] + sin, cos = paddle.chunk(rp, 2, axis=-1) + # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] + sin_pos = paddle.reshape(paddle.stack([sin, sin], axis=-1), rp.shape) + # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] + cos_pos = paddle.reshape(paddle.stack([cos, cos], axis=-1), rp.shape) + # rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2] + rotate_half_q = paddle.reshape( + paddle.stack([-q[:, :, :, 1::2], q[:, :, :, 0::2]], axis=-1), + paddle.shape(q), + ) + query = paddle.add( + paddle.multiply(q.astype("float32"), cos_pos), + paddle.multiply(rotate_half_q.astype("float32"), sin_pos), + ) + # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2] + rotate_half_k = paddle.reshape( + paddle.stack([-k[:, :, :, 1::2], k[:, :, :, 0::2]], axis=-1), + paddle.shape(k), + ) + key = paddle.add( + paddle.multiply(k.astype("float32"), cos_pos), + paddle.multiply(rotate_half_k.astype("float32"), sin_pos), + ) + return query, key + + def forward_single(self, position_ids): + + batch_size, seq_length = position_ids.shape[:2] + rope_emb = paddle.zeros( + (2, batch_size, seq_length, 1, self.head_dim), dtype="float32" + ) + inv_freq = self.base ** ( + -paddle.arange(0, self.head_dim, 2, dtype="float32") / self.head_dim + ) + position_ids = position_ids.cast("float32") + position_ids = position_ids / self.compression_ratio + # shape: [B, S, D/2] + freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) + # shape: [B, S, D] + emb = paddle.stack([freqs, freqs], axis=-1).reshape( + (batch_size, seq_length, self.head_dim) + ) + # shape: [B, S, 1, D] + emb = paddle.unsqueeze(emb, 2) + + rope_emb[0] = paddle.cos(emb) + rope_emb[1] = paddle.sin(emb) + return rope_emb + + @staticmethod + def 
class ErnieLinear(nn.Layer):
    """Linear layer that reshards its matmul output before adding the bias.

    The weight has shape ``[in_features, out_features]``. In ``forward`` the
    product is resharded onto ``get_mesh(ipp)`` with placements
    ``[Shard(1), Shard(0)]`` and only then is the bias added.

    Args:
        in_features: Size of the input feature dimension.
        out_features: Size of the output feature dimension.
        weight_attr: Parameter attribute forwarded to ``create_parameter`` for
            the weight.
        bias_attr: Parameter attribute forwarded to ``create_parameter`` for
            the bias. With ``bias_attr=False`` the bias is expected to be
            ``None`` (see the Paddle ``create_parameter`` documentation).
        name: Optional name forwarded to ``F.linear``.
        ipp: Index passed to ``get_mesh`` to select the target mesh.
    """

    def __init__(
        self,
        in_features,
        out_features,
        weight_attr=None,
        bias_attr=None,
        name=None,
        ipp=0,
    ):
        super(ErnieLinear, self).__init__()
        self._dtype = self._helper.get_default_dtype()
        self._weight_attr = weight_attr
        self._bias_attr = bias_attr
        self.weight = self.create_parameter(
            shape=[in_features, out_features],
            attr=self._weight_attr,
            dtype=self._dtype,
            is_bias=False,
        )
        self.bias = self.create_parameter(
            shape=[out_features],
            attr=self._bias_attr,
            dtype=self._dtype,
            is_bias=True,
        )
        self.name = name
        self.ipp = ipp

    def forward(self, input):
        """Return ``input @ weight`` resharded, plus the bias when one exists."""
        # The bias is deliberately left out of F.linear so that it is added
        # after the reshard rather than before it.
        out = F.linear(x=input, weight=self.weight, bias=None, name=self.name)
        out = dist.reshard(
            out,
            get_mesh(self.ipp),
            [dist.Shard(1), dist.Shard(0)],
        )
        # Test for presence, not truthiness: `if self.bias:` would evaluate the
        # truth value of a tensor, which is not defined for a bias holding
        # more than one element.
        if self.bias is not None:
            out += self.bias
        return out
def forward(self, x):
    """Run the gated feed-forward: ``down_proj(silu(gate_proj(x)) * up_proj(x))``.

    When ``self.fuse_swiglu`` is set, the activation and product are delegated
    to ``fused_swiglu(gate, up)`` instead of being computed with ``F.silu``.
    """
    gate = self.gate_proj(x)
    up = self.up_proj(x)
    hidden = fused_swiglu(gate, up) if self.fuse_swiglu else F.silu(gate) * up
    return self.down_proj(hidden)
def forward(
    self,
    hidden_states,
    past_key_value: Optional[Tuple[paddle.Tensor]] = None,
    attention_mask: Optional[paddle.Tensor] = None,
    position_ids: Optional[Tuple[paddle.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    inbatch_pack_offset: Optional[Tuple[paddle.Tensor]] = None,
) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
    """Project to Q/K/V, run rotary attention, and apply the output projection.

    Input shape: Batch x Time x Channel.

    Args:
        hidden_states: Layer input.
        past_key_value: Cached key/value pair; must be ``None`` when
            ``self.use_recompute_attn`` is set.
        attention_mask: Mask forwarded unchanged to ``rope_attn``.
        position_ids: Position indices forwarded unchanged to ``rope_attn``.
        output_attentions: When false, the returned attention weights are
            replaced by ``None``.
        use_cache: Forwarded to ``rope_attn``; must be false when
            ``self.use_recompute_attn`` is set.
        inbatch_pack_offset: Forwarded unchanged to ``rope_attn``.

    Returns:
        Tuple ``(attn_output, attn_weights, past_key_value)``.
    """
    seq_parallel = self.config.sequence_parallel
    if seq_parallel:
        # do all-gather
        hidden_states = dist.reshard(
            hidden_states, get_mesh(self.ipp), [dist.Shard(1), dist.Replicate()]
        )

    # Keys/values use the reduced head count under GQA.
    kv_heads = self.num_key_value_heads if self.is_gqa else self.num_heads
    # A 0 in the reshape target keeps the corresponding input dimension.
    query_states = self.q_proj(hidden_states).reshape(
        shape=[0, 0, self.num_heads, self.head_dim]
    )
    key_states = self.k_proj(hidden_states).reshape(
        shape=[0, 0, kv_heads, self.head_dim]
    )
    value_states = self.v_proj(hidden_states).reshape(
        shape=[0, 0, kv_heads, self.head_dim]
    )

    if seq_parallel:
        # Swap the two leading axes before attention.
        query_states, key_states, value_states = (
            paddle.transpose(states, [1, 0, 2, 3])
            for states in (query_states, key_states, value_states)
        )

    # Positional arguments follow the order of the `rope_attn` signature;
    # the leading None is `mix_layer`.
    rope_args = (
        None,
        query_states,
        key_states,
        value_states,
        attention_mask,
        position_ids,
        output_attentions,
        past_key_value,
        use_cache,
        inbatch_pack_offset,
    )
    if self.use_recompute_attn:
        assert past_key_value is None, "do not use kv cache in recompute"
        assert not use_cache
        attn_result = recompute(self.rope_attn, *rope_args, use_reentrant=False)
    else:
        attn_result = self.rope_attn(*rope_args)
    attn_output, attn_weights, past_key_value = attn_result

    if seq_parallel:
        # Undo the leading-axis swap before the output projection.
        attn_output = paddle.transpose(attn_output, [1, 0, 2])

    attn_output = self.o_proj(attn_output)

    return (
        attn_output,
        attn_weights if output_attentions else None,
        past_key_value,
    )
def __init__(self, config, ipp=0):
    """Build an expert MLP on top of ``ErnieMLP``.

    When ``config.disable_ffn_model_parallel`` is truthy, the expert is built
    from a private copy of ``config`` with ``tensor_parallel_degree = 1`` and
    ``sequence_parallel = False``, and the parent is told not to shard its
    weights. The caller's ``config`` object is left untouched.

    Args:
        config: Model configuration; must provide ``moe_dropout_prob`` and
            ``fuse_swiglu``.
        ipp: Index forwarded to the ``ErnieMLP`` constructor.
    """
    ffn_mp_disabled = getattr(config, "disable_ffn_model_parallel", False)
    if ffn_mp_disabled:
        # Work on a copy so the overrides do not leak into other layers.
        config = deepcopy(config)
        config.tensor_parallel_degree = 1
        config.sequence_parallel = False

    super().__init__(config, ipp, do_shard_tensor=not ffn_mp_disabled)

    self.moe_dropout_prob = config.moe_dropout_prob
    self.fuse_swiglu = config.fuse_swiglu
    if self.fuse_swiglu:
        assert fused_swiglu is not None, "fused_swiglu operator is not found."
class BMMLinear(nn.Layer):
    """Per-expert linear layer implemented as one batched matmul.

    Args:
        experts: Number of experts (leading dimension of weight and bias).
        d_in: Input feature size of each expert.
        d_out: Output feature size of each expert.
        use_bias: When true, create a ``[experts, d_out]`` bias; otherwise
            ``self.bias`` is ``None``.
    """

    def __init__(self, experts, d_in, d_out, use_bias=False):
        super().__init__()
        self.weight = self.create_parameter(
            [experts, d_in, d_out], dtype=paddle.get_default_dtype()
        )
        if use_bias:
            self.bias = self.create_parameter(
                [experts, d_out], dtype=paddle.get_default_dtype(), is_bias=True
            )
        else:
            self.bias = None

    def forward(self, x):
        """x: [num_experts, Seq, dim]"""
        out = paddle.bmm(x, self.weight)
        if self.bias is not None:
            # `out` is [experts, Seq, d_out] and the bias is [experts, d_out].
            # Without an explicit Seq axis, broadcasting would line the
            # bias's expert axis up against Seq. The parameter shape itself
            # is unchanged, so existing checkpoints stay loadable.
            out = out + self.bias.unsqueeze(1)
        return out
def forward(self, x):
    """Apply the fused expert MLP to ``x``.

    ``up_gate_proj`` produces the gate and up projections concatenated on the
    last axis. With ``self.fuse_swiglu`` the concatenated tensor is handed to
    ``fused_swiglu`` as a single argument; otherwise it is split in two and
    combined as ``silu(gate) * up``. The result goes through ``down_proj``.
    """
    projected = self.up_gate_proj(x)
    if self.fuse_swiglu:
        hidden = fused_swiglu(projected)
    else:
        gate, up = projected.chunk(2, axis=-1)
        hidden = F.silu(gate) * up
    return self.down_proj(hidden)
+ """ + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.ipp = ipp + self.hidden_size = config.hidden_size + self.self_attn = ErnieAttentionAuto(config, ipp) + self.use_moe = config.use_moe if hasattr(config, "use_moe") else False + if self.use_moe: + moe_layer_start_index = ( + min(config.moe_layer_start_index) + if isinstance(config.moe_layer_start_index, (tuple, list)) + else config.moe_layer_start_index + ) + moe_layer_end_index = ( + max(config.moe_layer_end_index) + if isinstance(config.moe_layer_end_index, (tuple, list)) + else config.moe_layer_end_index + ) + + if ( + self.use_moe + and ((layer_idx + 1) % config.moe_layer_interval == 0) + and layer_idx >= moe_layer_start_index + and layer_idx <= moe_layer_end_index + ): + self.create_moe_mlp_layer(layer_idx, ipp) + else: + self.mlp = ErnieMLP(config, ipp) + Norm = RMSNorm if config.use_rmsnorm else LayerNorm + if not config.use_rmsnorm and config.fuse_ln: + Norm = FusedLayerNorm + self.input_layernorm = Norm(config, ipp) + self.post_attention_layernorm = Norm(config, ipp) + self.residual_add1 = FusedDropoutImpl( + config.hidden_dropout_prob, mode="upscale_in_train" + ) + self.residual_add2 = FusedDropoutImpl( + config.hidden_dropout_prob, mode="upscale_in_train" + ) + + def create_moe_mlp_layer(self, layer_idx, ipp): + _ex_cfg = deepcopy(self.config) + fc_cls = ErnieMoeMLPFused if _ex_cfg.moe_fuse_experts else ErnieMoeMLP + if _ex_cfg.moe_intermediate_size: + if isinstance(_ex_cfg.moe_intermediate_size, (tuple, list)): + assert isinstance(_ex_cfg.moe_num_experts, (tuple, list)) and len( + _ex_cfg.moe_num_experts + ) == len(_ex_cfg.moe_intermediate_size) + fc = [] + for _i, (num_experts, intermediate_size) in enumerate( + zip(_ex_cfg.moe_num_experts, _ex_cfg.moe_intermediate_size) + ): + _ex_cfg_real = deepcopy(_ex_cfg) + _ex_cfg_real.intermediate_size = intermediate_size + cur_modality_start_layer_idx = ( + self.config.moe_layer_start_index[_i] + if 
isinstance(self.config.moe_layer_start_index, (tuple, list)) + else self.config.moe_layer_start_index + ) + cur_modality_end_layer_idx = ( + self.config.moe_layer_end_index[_i] + if isinstance(self.config.moe_layer_end_index, (tuple, list)) + else self.config.moe_layer_end_index + ) + if ( + layer_idx >= cur_modality_start_layer_idx + and layer_idx <= cur_modality_end_layer_idx + ): + if _i == 1: + with paddle.utils.unique_name.guard( + f"mm_expert_{layer_idx}_" + ): + fc.append((num_experts, fc_cls(_ex_cfg_real))) + else: + fc.append((num_experts, fc_cls(_ex_cfg_real))) + else: + logger.info( + f"moe multimodal experts use Identity layer_idx: {layer_idx}" + ) + fc.append((num_experts, nn.Identity())) + else: + _ex_cfg.intermediate_size = _ex_cfg.moe_intermediate_size + fc = [(_ex_cfg.moe_num_experts, fc_cls(_ex_cfg))] + else: + fc = [(_ex_cfg.moe_num_experts, fc_cls(_ex_cfg))] + gate, experts, lm_gate, lm_experts = get_gate( + self.config, fc, layer_idx, self.ipp + ) + _sh_cfg = deepcopy(self.config) + + if _sh_cfg.moe_num_shared_experts > 0: + if _sh_cfg.moe_intermediate_size: + _sh_inter_size = ( + _sh_cfg.moe_intermediate_size[0] + if isinstance(_sh_cfg.moe_intermediate_size, (tuple, list)) + else _sh_cfg.moe_intermediate_size + ) + _sh_cfg.intermediate_size = ( + _sh_inter_size * _sh_cfg.moe_num_shared_experts + ) + else: + _sh_cfg.intermediate_size = ( + _sh_cfg.intermediate_size * _sh_cfg.moe_num_shared_experts + ) + _sh_cfg.disable_ffn_model_parallel = False # split shared epxert + shared_experts = ErnieMoeMLP(_sh_cfg, ipp) + else: + shared_experts = None + + is_moe_infer = self.config.get("is_moe_infer", False) + if is_moe_infer: + raise NotImplementedError + elif self.config.moe_use_size_all2all: + raise NotImplementedError + else: + logger.info(f"moe-logging:{self.config.moe_logging}") + moe_cls = MOELayerAuto + self.mlp = moe_cls( + gate, + experts, + layer_idx=layer_idx, + shared_experts=shared_experts, + group=self.config.moe_group, + 
def forward(
    self,
    hidden_states: paddle.Tensor,
    attention_mask: Optional[paddle.Tensor] = None,
    position_ids: Optional[paddle.Tensor] = None,
    output_attentions: Optional[bool] = False,
    past_key_value: Optional[Tuple[paddle.Tensor]] = None,
    use_cache: Optional[bool] = False,
    inbatch_pack_offset: Optional[paddle.Tensor] = None,
    token_type_ids: Optional[paddle.Tensor] = None,
    output_gate_logits=True,  # PP model should not output gate logits,
) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
    """Run one decoder layer: norm -> attention -> residual -> norm -> MLP -> residual.

    Args:
        hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`paddle.Tensor`, *optional*): attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        position_ids (`paddle.Tensor`, *optional*): forwarded to the attention module.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attention weights of this layer.
        past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states
        use_cache (`bool`, *optional*):
            If set to `True`, the present key/value states are appended to the outputs.
        inbatch_pack_offset (`paddle.Tensor`, *optional*): forwarded to the attention module.
        token_type_ids (`paddle.Tensor`, *optional*): forwarded to the MoE layer only.
        output_gate_logits (`bool`): when MoE is enabled in the config, append the
            gate logits (``None`` for a dense MLP) as the last output.

    Returns:
        A tuple built in this order: hidden states, attention weights (if
        ``output_attentions``), present key/value (if ``use_cache``), router
        loss and gate logits (both only when ``config.use_moe`` is truthy).
        A one-element tuple is unwrapped to the bare tensor.
    """

    def _residual_add(add_fn, states, shortcut):
        # Under tensor parallelism with active dropout, draw the dropout mask
        # from the tracked RNG state so that ranks agree (or differ) as
        # selected by the seed name below.
        if (
            self.config.tensor_parallel_degree > 1
            and self.config.hidden_dropout_prob > 0.0
        ):
            current_seed = (
                "local_seed" if self.config.sequence_parallel else "global_seed"
            )
            with get_rng_state_tracker().rng_state(current_seed):
                return add_fn(states, shortcut)
        return add_fn(states, shortcut)

    residual = hidden_states
    hidden_states = self.input_layernorm(hidden_states)

    # Self Attention. Any value returned after the first three is treated as
    # a router loss produced by the attention module.
    (hidden_states, self_attn_weights, present_key_value, *router_loss_attn) = (
        self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            use_cache=use_cache,
            inbatch_pack_offset=inbatch_pack_offset,
        )
    )
    hidden_states = _residual_add(self.residual_add1, hidden_states, residual)

    # Fully Connected
    residual = hidden_states
    hidden_states = self.post_attention_layernorm(hidden_states)

    is_moe_mlp = isinstance(self.mlp, MOELayerAuto)
    if is_moe_mlp:
        hidden_states, _, router_loss, gate_logits = self.mlp(
            hidden_states, token_type_ids
        )
    else:
        if self.config.sequence_parallel:
            # do all-gather
            hidden_states = dist.reshard(
                hidden_states,
                get_mesh(self.ipp),
                [dist.Shard(1), dist.Replicate()],
            )
        hidden_states = self.mlp(hidden_states)
        router_loss = None
        gate_logits = None

    hidden_states = _residual_add(self.residual_add2, hidden_states, residual)

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (self_attn_weights,)

    if use_cache:
        outputs += (present_key_value,)

    if hasattr(self.config, "use_moe") and self.config.use_moe:
        # A dense layer inside a MoE model still reports a (zero) router loss
        # so that every layer yields the same output structure.
        if not is_moe_mlp:
            router_loss = paddle.zeros([1], dtype=paddle.float32)
        # Previously `router_loss` was unbound here for a dense MLP, so an
        # attention router loss would have raised instead of being added.
        if router_loss_attn:
            router_loss = router_loss + router_loss_attn[0]
        outputs += (router_loss,)

        if output_gate_logits:
            outputs += (gate_logits,)

    # remove empty tuple for pipeline parallel
    if type(outputs) is tuple and len(outputs) == 1:
        outputs = outputs[0]
    return outputs
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [ + StateDictNameMapping(*mapping, index=index) + for index, mapping in enumerate(model_mappings) + ] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + + from paddleformers.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + base_actions = { + # Column Linear + "layers.0.self_attn.q_proj.weight": partial(fn, is_column=True), + "layers.0.self_attn.k_proj.weight": partial(fn, is_column=True), + "layers.0.self_attn.v_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.up_proj.weight": partial(fn, is_column=True), + "lm_head.weight": partial(fn, is_column=not config.tie_word_embeddings), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + if config.use_bias: + base_actions.update( + { + # Column Linear + "layers.0.self_attn.q_proj.bias": partial(fn, is_column=True), + "layers.0.self_attn.k_proj.bias": partial(fn, is_column=True), + "layers.0.self_attn.v_proj.bias": partial(fn, is_column=True), + "layers.0.mlp.gate_proj.bias": partial(fn, is_column=True), + "layers.0.mlp.up_proj.bias": partial(fn, is_column=True), + "lm_head.bias": partial(fn, is_column=True), + } + ) + for key, action in base_actions.items(): + if "layers.0." 
def init_weights(self, layer):
    """Initialization hook applied to one sub-layer.

    Behavior depends on the type of ``layer``:

    * ``ErnieLMHead`` / ``nn.Embedding`` / ``nn.Linear`` / ``FusedLinear``:
      the weight, if already initialized, is overwritten with standard-normal
      samples scaled by ``config.initializer_range``. For a distributed
      weight only the local shard is written.
    * ``RotaryEmbedding``: ``cos_cached`` / ``sin_cached`` are recomputed in
      float32 with NumPy.
    * ``Top2Gate`` with a ``weight`` attribute: the gate weight is
      re-initialized under the ``"model_parallel_rng"`` RNG state.

    Other layer types are left untouched. The global default dtype is
    switched to float32 while weights are sampled and is always restored
    afterwards, including when sampling raises.
    """
    if self.config.tensor_parallel_degree > 1:
        rng_tracker = get_rng_state_tracker().rng_state
    else:
        # nullcontext accepts (and ignores) the optional state name.
        rng_tracker = contextlib.nullcontext

    if isinstance(
        layer,
        (
            ErnieLMHead,
            nn.Embedding,
            nn.Linear,
            paddle.incubate.nn.FusedLinear,
        ),
    ):
        with rng_tracker():
            dtype = paddle.get_default_dtype()
            paddle.set_default_dtype("float32")
            try:
                if layer.weight._is_initialized():
                    if layer.weight.is_dist():
                        layer.weight._local_value().set_value(
                            paddle.randn(
                                layer.weight._local_shape, dtype=layer.weight.dtype
                            ).scale(self.config.initializer_range)
                        )
                    else:
                        layer.weight.set_value(
                            paddle.randn(
                                layer.weight.shape, dtype=layer.weight.dtype
                            ).scale(self.config.initializer_range)
                        )
            finally:
                # Restore even on failure so that a float32 default cannot
                # leak into layers created or initialized later.
                paddle.set_default_dtype(dtype)
            logger.info(
                f"dist-init-fc: shape={layer.weight.shape}, "
                f" range={self.config.initializer_range},"
                f' type={type(layer)},norm={layer.weight.astype("float32").norm()}'
            )

    elif isinstance(layer, RotaryEmbedding):
        head_dim = self.config.hidden_size // self.config.num_attention_heads
        inv_freq = 1.0 / (
            layer.base ** (np.arange(0, head_dim, 2).astype("float32") / head_dim)
        )

        # higher acc using float32
        t = np.arange(layer.max_position_embeddings, dtype="float32")
        # Outer product: [max_position_embeddings, head_dim // 2]
        freqs = np.einsum("i,j->ij", t, inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        # Concatenation gives [max_position_embeddings, head_dim].
        emb = np.concatenate([freqs, freqs], axis=-1)
        cos_cached = np.cos(emb)[:, :]
        sin_cached = np.sin(emb)[:, :]
        layer.cos_cached.set_value(cos_cached)
        layer.sin_cached.set_value(sin_cached)
    elif isinstance(layer, Top2Gate):
        if not hasattr(layer, "weight"):
            return
        with rng_tracker("model_parallel_rng"):
            dtype = paddle.get_default_dtype()
            paddle.set_default_dtype("float32")
            try:
                if self.config.moe_group_experts:
                    if layer.weight._is_initialized():
                        layer.weight.set_value(
                            paddle.randn(
                                layer.weight.shape, dtype=layer.weight.dtype
                            ).scale(self.config.initializer_range)
                        )
                else:
                    if layer.weight._is_initialized():
                        # Sample one column per group of `granularity` experts
                        # and repeat it, so experts in a group start with
                        # identical gate weights.
                        # NOTE(review): this assumes scalar
                        # `moe_intermediate_size` / `moe_num_experts`; confirm
                        # behavior for the tuple/list configuration form.
                        granularity = (
                            1
                            if self.config.moe_intermediate_size == 0
                            else self.config.intermediate_size
                            // self.config.moe_intermediate_size
                        )
                        layer.weight.set_value(
                            paddle.randn(
                                [
                                    self.config.hidden_size,
                                    self.config.moe_num_experts // granularity,
                                ],
                                dtype="float32",
                            )
                            .scale(self.config.initializer_range)
                            .repeat_interleave(granularity, axis=-1)
                        )
            finally:
                # This branch previously never restored the default dtype,
                # leaving the process-wide default at float32 after the first
                # gate was initialized.
                paddle.set_default_dtype(dtype)
            logger.info(
                f"dist-init-moe_gate: shape={layer.weight.shape}, dtype={layer.weight.dtype} "
                f"range={self.config.initializer_range},type={type(layer)}, "
                f'norm={layer.weight.astype("float32").norm()}'
            )
Each layer is a [`ErnieDecoderLayerAuto`] + Args: + config: ErnieMoEConfig + """ + + def __init__(self, config: ErnieMoEConfig): + if hasattr(config, "use_moe") and config.use_moe: + if config.moe_group in {"mp", "model", "tp", "mpdp"}: + assert config.sequence_parallel + logger.info( + f"disable FFN tensor model parallel, moe-group={config.moe_group}" + ) + config.disable_ffn_model_parallel = True + + config.moe_group = _parse_moe_group(config.moe_group) + if config.moe_group in fleet.auto.get_mesh().dim_names: + config.moe_world_size = fleet.auto.get_mesh().get_dim_size( + config.moe_group + ) + if config.moe_world_size < 0: + config.moe_world_size = 1 + else: + config.moe_world_size = 1 + + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.config = config + + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + if ( + self.config.tensor_parallel_degree > 1 + or self.config.pipeline_parallel_degree > 1 + ): + if not in_auto_parallel_align_mode(): + self.embed_tokens.weight = dist.shard_tensor( + self.embed_tokens.weight, + get_mesh(), + [dist.Replicate(), dist.Shard(1)], + ) + + layers_list = [] + + def get_layer_pp_info(ipp): + mesh = fleet.auto.get_mesh() + if is_pp_enable() is False: + return None, False + else: + pp_degree = mesh.get_dim_size("pp") + layer_num = ( + config.num_hidden_layers - 1 + if config.remove_tail_layer + else config.num_hidden_layers + ) + layer_per_stage = math.ceil(layer_num / pp_degree) + input_need_reshard = ipp % layer_per_stage == 0 + return ipp // layer_per_stage, input_need_reshard + + self.next_pp_stage_indexes = [] + for layer_idx in range( + config.num_hidden_layers - 1 + if config.remove_tail_layer + else config.num_hidden_layers + ): + pp_stage_id, input_need_reshard = get_layer_pp_info(layer_idx) + layers_list.append(ErnieDecoderLayerAuto(config, layer_idx, pp_stage_id)) + if input_need_reshard: + 
@classmethod
def _prepare_decoder_attention_mask(
    cls, attention_mask, input_shape, past_key_values_length, dtype
):
    """Combine the causal mask and the caller's mask into one additive mask.

    Args:
        attention_mask: Optional mask, expanded by ``_expand_mask`` to the
            target length ``input_shape[-1]``.
        input_shape: Shape tuple whose last entry is the target sequence
            length.
        past_key_values_length: Cache length forwarded to
            ``_make_causal_mask``.
        dtype: dtype of the returned mask.

    Returns:
        The combined mask, clamped from below at ``finfo(dtype).min``. Returns
        ``None`` when there is nothing to mask, i.e. the target length is 1
        and ``attention_mask`` is ``None``.
    """
    # create causal mask
    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
    combined_attention_mask = None
    if input_shape[-1] > 1:
        combined_attention_mask = _make_causal_mask(
            input_shape, past_key_values_length=past_key_values_length, dtype=dtype
        )

    if attention_mask is not None:
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        expanded_attn_mask = _expand_mask(
            attention_mask, dtype, tgt_length=input_shape[-1]
        )
        combined_attention_mask = (
            expanded_attn_mask
            if combined_attention_mask is None
            else expanded_attn_mask + combined_attention_mask
        )

    if combined_attention_mask is None:
        # Single-token input without a mask: no causal mask was built and
        # there is nothing to expand, so calling `.astype` below would fail.
        return None

    # Summing two masks can fall below the smallest finite value; clamp so
    # the result stays finite in `dtype`.
    combined_attention_mask = paddle.maximum(
        combined_attention_mask.astype(dtype),
        paddle.to_tensor(float(finfo(dtype).min), dtype=dtype),
    )
    return combined_attention_mask
def forward(
    self,
    input_ids=None,
    position_ids=None,
    attention_mask=None,
    inputs_embeds=None,
    use_cache=None,
    past_key_values=None,
    output_attentions=False,
    output_hidden_states=None,
    return_dict=False,
    inbatch_pack_offset=None,
    token_type_ids=None,
    **kwargs,
):
    """Run the decoder stack and return the normalised final hidden states.

    Args:
        input_ids: Token ids, unpacked as ``(batch_size, seq_length)``.
            Mutually exclusive with ``inputs_embeds``.
        position_ids: Optional; replicated over the global mesh when given.
        attention_mask: Optional mask. It is discarded when flash attention
            or memory-efficient attention can be used; otherwise a missing
            mask defaults to all ones and is expanded through
            ``_prepare_decoder_attention_mask``.
        inputs_embeds: Optional precomputed embeddings, unpacked as
            ``(batch_size, seq_length, _)``.
        use_cache: Falls back to ``config.use_cache`` when ``None``.
        past_key_values: Per-layer cache; defaults to one ``None`` per layer.
        output_attentions / output_hidden_states / return_dict: Fall back to
            the config only when ``None`` is passed explicitly (note that
            ``output_attentions`` and ``return_dict`` default to ``False``).
        inbatch_pack_offset: Forwarded to every decoder layer; a non-None
            value is also required for the memory-efficient attention path.
        token_type_ids: Forwarded to every decoder layer.
        **kwargs: Accepted and ignored.

    Returns:
        With ``return_dict`` falsy, a tuple of the non-None values among
        (hidden_states, cache, all_hidden_states, all_self_attns,
        all_router_loss, all_gate_logits); otherwise a
        ``BaseModelOutputWithPastAndCrossAttentions`` with the same fields.

    Raises:
        ValueError: If both or neither of ``input_ids`` / ``inputs_embeds``
            are given.
    """
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
        )
    elif input_ids is not None:
        batch_size, seq_length = input_ids.shape
    elif inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape
    else:
        raise ValueError(
            "You have to specify either decoder_input_ids or decoder_inputs_embeds"
        )

    if past_key_values is None:
        past_key_values = tuple([None] * len(self.layers))

    seq_length_with_past = seq_length
    cache_length = 0

    # Only the first layer's cache is inspected to derive the cached length.
    if past_key_values[0] is not None:
        cache_length = paddle.shape(past_key_values[0][0])[1]
        seq_length_with_past += cache_length

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids).astype(
            self.embed_tokens.weight.dtype
        )

    global_mesh = global_mesh_starts_with_pp()
    if self.config.sequence_parallel:
        # [B, S, H] -> [S, B, H]
        inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2])

    if position_ids is not None:
        # Replicate on every axis of the global mesh.
        position_ids = dist.shard_tensor(
            position_ids,
            global_mesh,
            [dist.Replicate() for _ in range(len(global_mesh._shape))],
        )
    can_use_fa = self.config.use_flash_attn and flash_attention is not None
    can_mem_eff_attn = (
        self.config.use_mem_eff_attn and inbatch_pack_offset is not None
    )
    # With either fused attention path the explicit mask is dropped.
    if can_use_fa or can_mem_eff_attn:
        if attention_mask is not None:
            attention_mask = None

    elif attention_mask is None:
        attention_mask = paddle.ones(
            (batch_size, seq_length_with_past), dtype=paddle.bool
        )

    if attention_mask is not None:
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask,
            (batch_size, seq_length),
            cache_length,
            inputs_embeds.dtype,
        )
        attention_mask = dist.shard_tensor(
            attention_mask,
            global_mesh,
            [dist.Replicate() for _ in range(len(global_mesh._shape))],
        )

    hidden_states = inputs_embeds
    if self.config.tensor_parallel_degree > 1:
        hidden_states = dist.reshard(hidden_states, get_mesh(0), self.placements)

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    all_router_loss = None
    if hasattr(self.config, "use_moe") and self.config.use_moe:
        all_router_loss = paddle.to_tensor(0.0)
        all_router_loss = dist.shard_tensor(
            all_router_loss, get_mesh(0), dist.Replicate()
        )
    # NOTE(review): unlike the checks around it, this only tests that the
    # attribute exists, so a config with use_moe=False still yields an empty
    # tuple (which is not None and is therefore returned) -- confirm intent.
    all_gate_logits = () if hasattr(self.config, "use_moe") else None
    for idx, (decoder_layer) in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        past_key_value = (
            past_key_values[idx] if past_key_values is not None else None
        )

        has_gradient = not hidden_states.stop_gradient
        ipp = decoder_layer.ipp
        if not is_pp_enable():
            position_ids_input = position_ids
            attention_mask_input = attention_mask
            token_type_ids_input = token_type_ids
        else:
            # Move the auxiliary inputs onto the mesh of this layer's stage.
            if position_ids is not None:
                position_ids_input = dist.reshard(
                    position_ids,
                    get_mesh(ipp),
                    [dist.Replicate(), dist.Replicate()],
                )
            else:
                position_ids_input = position_ids
            attention_mask_input = (
                dist.reshard(
                    attention_mask,
                    get_mesh(ipp),
                    [dist.Replicate(), dist.Replicate()],
                )
                if attention_mask is not None
                else None
            )
            token_type_ids_input = (
                dist.reshard(
                    token_type_ids,
                    get_mesh(ipp),
                    [dist.Replicate(), dist.Replicate()],
                )
                if token_type_ids is not None
                else None
            )

        # Layers listed in next_pp_stage_indexes get their activations (and
        # the running router loss) resharded onto this layer's mesh first.
        if idx in self.next_pp_stage_indexes:
            hidden_states = dist.reshard(
                hidden_states,
                get_mesh(ipp),
                self.placements,
            )
            if hasattr(self.config, "use_moe") and self.config.use_moe:
                all_router_loss = dist.reshard(
                    all_router_loss,
                    get_mesh(ipp),
                    [dist.Replicate()],
                )
        if self.config.use_recompute and has_gradient:
            layer_outputs = self.recompute_training(
                decoder_layer,
                hidden_states,
                attention_mask_input,
                position_ids_input,
                output_attentions,
                past_key_value,
                use_cache,
                inbatch_pack_offset,
                token_type_ids_input,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask_input,
                position_ids_input,
                output_attentions,
                past_key_value,
                use_cache,
                inbatch_pack_offset,
                token_type_ids_input,
            )

        if isinstance(layer_outputs, (tuple, list)):
            hidden_states = layer_outputs[0]
        else:
            hidden_states = layer_outputs

        if use_cache:
            next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

        if output_attentions:
            all_self_attns += (layer_outputs[1],)
        if hasattr(self.config, "use_moe") and self.config.use_moe:
            # The recompute path calls the layer with output_gate_logits=False,
            # so only the direct call has trailing gate logits to peel off.
            if not (self.config.use_recompute and has_gradient):
                layer_outputs, gate_logits = layer_outputs[:-1], layer_outputs[-1]
                all_gate_logits = all_gate_logits + (gate_logits,)
            # After the optional peel, the last element is the router loss.
            router_loss = layer_outputs[-1]
            all_router_loss += router_loss

    # Keep only the last position when caching without MoE.
    if use_cache and not (hasattr(self.config, "use_moe") and self.config.use_moe):
        hidden_states = paddle.unsqueeze(hidden_states[:, -1, :], 1)

    if self.config.pipeline_parallel_degree > 1:
        hidden_states = dist.reshard(
            hidden_states,
            get_mesh(-1),
            self.placements,
        )
    hidden_states = self.norm(hidden_states)

    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None

    if not return_dict:
        # Positional result: entries that are None are dropped, so the tuple
        # length depends on which outputs were requested.
        return tuple(
            v
            for v in [
                hidden_states,
                next_cache,
                all_hidden_states,
                all_self_attns,
                all_router_loss,
                all_gate_logits,
            ]
            if v is not None
        )
    return BaseModelOutputWithPastAndCrossAttentions(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
        cross_attentions=None,
        router_loss=all_router_loss,
        gate_logits=all_gate_logits,
    )
def forward_impl_with_calc_logits(
    self,
    masked_lm_labels,
    hidden_states,
    outlinear_weight,
    outlinear_bias,
    sparse_label_idx=None,
    tensor_parallel_output=None,
):
    """Compute the LM-head logits and feed them straight into ``forward_impl``.

    The labels come first in the signature; every other argument is passed
    unchanged, in order, to ``calc_lm_head_logits`` after ``self.config``.

    Returns:
        Whatever ``forward_impl`` returns for the computed logits and
        ``masked_lm_labels``.
    """
    head_inputs = (
        hidden_states,
        outlinear_weight,
        outlinear_bias,
        sparse_label_idx,
        tensor_parallel_output,
    )
    logits = calc_lm_head_logits(self.config, *head_inputs)
    return self.forward_impl(logits, masked_lm_labels)
class ErniePretrainingCriterion(ErniePretrainingCriterionBase):
    """Pre-training criterion that also routes gradients from the router loss.

    The language-model loss is computed by the base class; this subclass only
    adds the optional ``router_loss`` handling.
    """

    def __init__(self, config, return_tuple=True):
        super().__init__(config, return_tuple=return_tuple)

    def forward(self, prediction_scores, masked_lm_labels, router_loss=None):
        """Return ``(loss, loss_sum)``.

        ``loss_sum`` is ``None`` when the criterion was built with
        ``return_tuple=False``. ``router_loss`` is skipped when it is ``None``
        or when auto-parallel align mode is active.
        """
        base_result = super().forward(prediction_scores, masked_lm_labels)
        loss, loss_sum = base_result if self.return_tuple else (base_result, None)

        if router_loss is None or in_auto_parallel_align_mode():
            return loss, loss_sum

        global_mesh = global_mesh_starts_with_pp()

        def _replicated_everywhere():
            # One Replicate() placement per axis of the global mesh.
            return [dist.Replicate() for _ in range(len(global_mesh._shape))]

        if self.config.pipeline_parallel_degree > 1:
            loss = dist.reshard(loss, global_mesh, _replicated_everywhere())
            router_loss = dist.reshard(
                router_loss, global_mesh, _replicated_everywhere()
            )

        # Adding router_loss and subtracting its detached copy leaves the
        # loss value unchanged while still letting router_loss contribute
        # gradients.
        loss = loss + router_loss - router_loss.detach()
        return loss, loss_sum
def __init__(self, config):
    """Create the output projection weight and, optionally, its bias.

    Args:
        config: Model config. The weight is ``[vocab_size, hidden_size]``
            when ``tie_word_embeddings`` is set and
            ``[hidden_size, vocab_size]`` otherwise. A zero-initialised bias
            is created only when both ``weight_share_add_bias`` and
            ``use_bias`` are set.
    """
    super().__init__()
    self.config = config
    vocab_size = config.vocab_size
    has_bias = config.weight_share_add_bias and config.use_bias
    multi_device = (
        self.config.tensor_parallel_degree > 1
        or self.config.pipeline_parallel_degree > 1
    )

    if config.tie_word_embeddings:
        weight_shape = [vocab_size, config.hidden_size]
    else:
        weight_shape = [config.hidden_size, vocab_size]
    self.weight = self.create_parameter(
        shape=weight_shape,
        dtype=paddle.get_default_dtype(),
    )
    if multi_device:
        self.weight = dist.shard_tensor(
            self.weight,
            get_mesh(-1),
            [dist.Replicate(), dist.Shard(1)],
        )

    logger.info(
        f"output-weight:{self.weight.shape} config.tie_word_embeddings={config.tie_word_embeddings}"
    )

    if has_bias:
        zero_init = paddle.ParamAttr(
            initializer=paddle.nn.initializer.constant.Constant(0.0)
        )
        self.bias = self.create_parameter(
            shape=[vocab_size],
            dtype=paddle.get_default_dtype(),
            attr=zero_init,
        )
        if multi_device:
            self.bias = dist.shard_tensor(
                self.bias,
                get_mesh(-1),
                [dist.Replicate(), dist.Shard(0)],
            )
    else:
        self.bias = None

    # Must set distributed attr for Tensor Parallel !
    # NOTE(review): vocab_size is read from config.vocab_size just above and
    # never changed, so this comparison is False here -- confirm whether a
    # per-rank vocab split was intended.
    vocab_is_split = vocab_size != config.vocab_size
    self.weight.is_distributed = vocab_is_split
    if has_bias:
        self.bias.is_distributed = vocab_is_split

    if self.weight.is_distributed:
        self.weight.split_axis = 1
    if has_bias and self.bias.is_distributed:
        self.bias.split_axis = 0

    if self.config.use_recompute_loss_fn:
        logger.info(
            "Using recompute_loss_fn, the calculation of logits will be moved into "
            "loss_fn for memory optimization"
        )
def __init__(self, config):
    """Build the decoder, LM head and criterion, then tie the weights.

    Args:
        config: Model config. ``config.initializer_range`` is overwritten in
            place with ``sqrt(0.3333 / hidden_size)`` before the sub-modules
            are constructed.
    """
    super().__init__(config)

    if config.sequence_parallel:
        logger.info(f"using sequence_parallel, input seqlen={config.seqlen}")
        if not config.using_dynamic_sequence_length:
            assert config.seqlen is not None
        else:
            # NOTE(review): the condition requires micro_batch_size to be
            # falsy while the message says it is needed -- one of the two
            # looks inverted; confirm before relying on it.
            assert (
                not config.micro_batch_size
            ), "sequence-parallel needs micro_batch_size setting when using dygramic_sequnence_length"

        assert (
            config.tensor_parallel_degree > 1
        ), f"sequence-parallel needs mp>1, got mp={config.tensor_parallel_degree}"

    # initialize-trick for big model, see
    # https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/README.md#std-init
    scaled_range = math.sqrt(0.3333 / config.hidden_size)
    logger.info(
        f"change initializer-range from {config.initializer_range} to {scaled_range}"
    )
    config.initializer_range = scaled_range

    self.config = config
    self.ernie = ErnieModelAuto(config)
    self.lm_head = ErnieLMHead(config)
    self.criterion = ErniePretrainingCriterion(config)

    self.tie_weights()  # maybe weight share

    # Report which normalisation implementation the config selects.
    if self.config.use_rmsnorm:
        norm_message = (
            "Use fusedRMSNorm" if self.config.fuse_rms_norm else "Use normal RMSNorm"
        )
    else:
        norm_message = "Use fusedLN" if self.config.fuse_ln else "Use normal LayerNorm"
    logger.info(norm_message)
@staticmethod
def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id):
    """Derive an int64 attention mask for generation from ``input_ids``.

    Positions equal to ``pad_token_id`` are masked out only when the pad
    token actually occurs in ``input_ids`` and is not the same id as
    ``eos_token_id``; in every other case an all-ones mask is returned.
    """
    pad_present = (pad_token_id is not None) and paddle.any(
        input_ids == pad_token_id
    ).numpy().item()
    pad_is_eos = (eos_token_id is not None) and (pad_token_id == eos_token_id)

    if pad_present and not pad_is_eos:
        return (input_ids != pad_token_id).astype("int64")
    return paddle.ones_like(input_ids, dtype="int64")
def forward(
    self,
    input_ids,
    labels=None,
    position_ids=None,
    attention_mask=None,
    inputs_embeds=None,
    use_cache=False,
    past_key_values=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=False,
    ignored_index=0,
    inbatch_pack_offset=None,
    token_type_ids=None,
):
    """Run the decoder and LM head, then compute the loss or build an output.

    Args:
        input_ids: Token ids, or a list whose first two items are
            ``(input_ids, labels)``.
        labels: Training labels; required when ``return_dict`` is falsy.
        position_ids, attention_mask, inputs_embeds, use_cache,
        past_key_values, inbatch_pack_offset, token_type_ids: Forwarded to
            the decoder unchanged.
        output_attentions / output_hidden_states / return_dict: Fall back to
            the config values only when ``None`` is passed explicitly.
        ignored_index: Unused in this method; kept for signature
            compatibility.

    Returns:
        With ``return_dict`` truthy, a ``CausalLMOutputWithCrossAttentionsAuto``
        whose ``loss`` is ``None`` when no labels were given. Otherwise the
        criterion's result for ``(logits, labels, router_loss)``.

    Raises:
        AssertionError: If ``labels`` is ``None`` on the non-``return_dict``
            path.
    """
    if isinstance(input_ids, list):
        input_ids, labels = input_ids[:2]

    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # The decoder is always asked for a structured output so that its fields
    # can be read by name below.
    outputs = self.ernie(
        input_ids,
        position_ids=position_ids,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        past_key_values=past_key_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=True,
        inbatch_pack_offset=inbatch_pack_offset,
        token_type_ids=token_type_ids,
    )

    hidden_states = outputs.last_hidden_state
    logits = self.lm_head(hidden_states)

    # Resolve the MoE switch once, tolerating configs without the attribute.
    # Previously the return_dict branch read ``self.config.use_moe`` directly
    # and raised AttributeError for such configs, while the other branch
    # guarded the access with hasattr.
    use_moe = getattr(self.config, "use_moe", False)
    router_loss = outputs.router_loss if use_moe else None

    if return_dict:  # aka Generate Decoding
        # The router loss is reported on the output but not passed to the
        # criterion on this path.
        loss = None
        if labels is not None:
            loss, _ = self.criterion(logits, labels)
        return CausalLMOutputWithCrossAttentionsAuto(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_loss=router_loss,
        )

    assert labels is not None, "labels are required when return_dict is False"
    return self.criterion(logits, labels, router_loss)
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Paddle Ernie model""" +import math +import logging + + +import paddle +import paddle.distributed as dist +from paddle import nn +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.pipelining.schedules import ( + Schedule1F1B, + ScheduleFThenB, + ScheduleVPP, +) +from paddle.distributed.auto_parallel.pipelining.stage import PipelineStage + +from paddle.distributed.fleet.utils import recompute + + +from models.moe.moe_utils_auto import get_mesh + +from .modeling_auto import ( + _parse_moe_group, + ErnieDecoderLayerAuto, + ErniePretrainedModelAuto, + LayerNorm, + RMSNorm, + FusedLayerNorm, + ErniePretrainingCriterion, + ErnieLMHead, +) + +from paddle.distributed import in_auto_parallel_align_mode + + +# Because param_name is generated based on the class name, +# when changes in distributed strategies result in class modifications, +# there may be mismatches during parameter loading. +# You can achieve class name changes by importing the following environment variables. +# Example: `export rowcol_parallel_linear_class_name_convert_map="tpsp->smp"` + +logger = logging.getLogger(__name__) + +try: + from paddle.nn.functional.flash_attention import flash_attention + + logger.warning( + "Use flash attention in scaled-dot-product. 
def parse_args(args):
    """Unpack a stage input into ``(hidden_states, attention_mask, position_ids)``.

    A tuple of length 1 to 3 fills the fields from the left; a tuple of any
    other length yields three ``None`` values. Anything that is not a tuple
    is treated as ``hidden_states`` alone. Any mask or position ids found get
    ``stop_gradient = True``.
    """
    if not isinstance(args, tuple):
        fields = (args,)
    elif 1 <= len(args) <= 3:
        fields = tuple(args)
    else:
        fields = ()

    # Pad on the right so that missing trailing fields become None.
    hidden_states, attention_mask, position_ids = (fields + (None, None, None))[:3]

    for auxiliary in (position_ids, attention_mask):
        if auxiliary is not None:
            auxiliary.stop_gradient = True

    return hidden_states, attention_mask, position_ids


def return_args(hidden_states, attention_mask=None, position_ids=None):
    """Pack a stage output, cloning and appending only the non-None extras.

    Returns ``hidden_states`` itself when both extras are ``None``, otherwise
    a tuple starting with ``hidden_states``.

    NOTE(review): with ``attention_mask=None`` and ``position_ids`` set, the
    result is a 2-tuple, which ``parse_args`` reads back as
    ``(hidden_states, attention_mask)`` -- confirm that callers never hit
    this combination.
    """
    extras = tuple(
        tensor.clone()
        for tensor in (attention_mask, position_ids)
        if tensor is not None
    )
    return (hidden_states, *extras) if extras else hidden_states


def global_mesh_starts_with_pp():
    """Return the fleet mesh, re-derived via its "pp" dimension when it has one."""
    mesh = fleet.auto.get_mesh()
    return mesh.get_mesh_with_dim("pp") if "pp" in mesh.dim_names else mesh
def manual_model_split(model, stage_idx, group, mode, pp_degree):
    """Cut ``model.layers`` into equal chunks and wrap this rank's chunks as stages.

    Args:
        model: Object exposing ``config`` and ``layers``.
        stage_idx: Index of the first chunk owned by this rank.
        group: Communication group handed to every ``PipelineStage``.
        mode: Schedule name; ``config.virtual_pp_degree`` is only consulted
            when it equals ``"VPP"``.
        pp_degree: Stride between the chunks owned by this rank.

    Returns:
        A list with one ``PipelineStage`` per virtual pipeline chunk.
    """
    config = model.config
    virtual_degree = config.virtual_pp_degree if mode == "VPP" else 1
    # Floor division: layers beyond an even split are not assigned to any chunk.
    layers_per_chunk = config.num_hidden_layers // virtual_degree // pp_degree
    total_chunks = virtual_degree * pp_degree
    all_layers = model.layers

    def _stage_for(chunk_idx):
        # Chunk 0 is the only one flagged as first.
        start = chunk_idx * layers_per_chunk
        chunk = ErnieChunk(
            all_layers[start : start + layers_per_chunk],
            is_first=(chunk_idx == 0),
        )
        return PipelineStage(chunk, chunk_idx, total_chunks, group=group)

    return [_stage_for(stage_idx + k * pp_degree) for k in range(virtual_degree)]
def __init__(self, config, layer_idx=0, ipp=0):
    """Wrap one decoder layer, plus embedding or head on the boundary layers.

    Args:
        config: Model config; its MoE fields are normalised in place.
        layer_idx: Index of the wrapped decoder layer. Layer 0 also owns the
            token embedding; the last layer also owns the final norm and the
            LM head.
        ipp: Stored as ``self.ipp`` and forwarded to ``ErnieDecoderLayerAuto``
            (presumably the pipeline-stage index -- confirm).
    """

    def _normalise_moe_config():
        # Rewrites moe_group / moe_world_size on ``config`` when MoE is on.
        if not (hasattr(config, "use_moe") and config.use_moe):
            return
        if config.moe_group in {"mp", "model", "tp", "mpdp"}:
            assert config.sequence_parallel
            logger.info(
                f"disable FFN tensor model parallel, moe-group={config.moe_group}"
            )
            config.disable_ffn_model_parallel = True

        config.moe_group = _parse_moe_group(config.moe_group)
        mesh = fleet.auto.get_mesh()
        world_size = 1
        if config.moe_group in mesh.dim_names:
            world_size = mesh.get_dim_size(config.moe_group)
            if world_size < 0:
                world_size = 1
        config.moe_world_size = world_size

    # NOTE(review): the normalisation runs both before and after
    # super().__init__(), exactly as the original inline code did; the second
    # pass looks redundant -- confirm before removing it.
    _normalise_moe_config()
    super().__init__()
    self.config = config
    _normalise_moe_config()

    self.layer_idx = layer_idx
    self.ipp = ipp
    if self.config.sequence_parallel:
        self.placements = [dist.Shard(1), dist.Shard(0)]
    else:
        self.placements = [dist.Shard(0), dist.Replicate()]

    self.embed_tokens = None
    self.norm = None
    self.lm_head = None

    if layer_idx == 0:
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.embed_tokens = nn.Embedding(
            self.vocab_size,
            self.hidden_size,
        )
        multi_device = (
            self.config.tensor_parallel_degree > 1
            or self.config.pipeline_parallel_degree > 1
        )
        if multi_device and not in_auto_parallel_align_mode():
            self.embed_tokens.weight = dist.shard_tensor(
                self.embed_tokens.weight,
                get_mesh(),
                [dist.Replicate(), dist.Shard(1)],
            )

    self.layer = ErnieDecoderLayerAuto(config, layer_idx, ipp)

    # Pick the norm class: RMSNorm wins, then the fused LayerNorm if enabled.
    if config.use_rmsnorm:
        Norm = RMSNorm
    elif config.fuse_ln:
        Norm = FusedLayerNorm
    else:
        Norm = LayerNorm

    if self.layer_idx == self.config.num_hidden_layers - 1:
        self.norm = Norm(config, -1)
        self.lm_head = ErnieLMHead(config)
    def forward(self, args):
        """
        Run this pipeline stage: optional embedding, one decoder layer,
        optional final norm and LM head.

        Args:
            args: Packed stage inputs, unpacked with ``parse_args``. When
                ``self.embed_tokens`` is set they are read as
                ``(input_ids, attention_mask, position_ids)``; afterwards
                (and on every other stage) as
                ``(hidden_states, attention_mask, position_ids)``.

        Returns:
            The value built by ``return_args``: either
            ``(hidden_states, attention_mask, position_ids)`` or, when
            ``self.lm_head`` is set, only the logits.

        Raises:
            ValueError: if the embedding stage receives ``input_ids`` of None.
        """
        output_attentions = self.config.output_attentions
        use_cache = self.config.use_cache
        output_hidden_states = self.config.output_hidden_states
        return_dict = self.config.return_dict
        # These are fixed to None in this method, so the cache / token-type /
        # packing branches below never see real values.
        past_key_values = None
        past_key_value = None
        token_type_ids = None
        inbatch_pack_offset = None
        if self.embed_tokens is not None:
            # Embedding stage: turn token ids into hidden states.

            input_ids, attention_mask, position_ids = parse_args(args)
            if isinstance(input_ids, list):
                # NOTE(review): ``labels`` is unpacked but not used in this method.
                input_ids, labels = input_ids[:2]

            # The values were just read from the config, so these fallbacks
            # only matter for ``return_dict`` (falls back to ``use_return_dict``).
            output_attentions = (
                output_attentions
                if output_attentions is not None
                else self.config.output_attentions
            )
            output_hidden_states = (
                output_hidden_states
                if output_hidden_states is not None
                else self.config.output_hidden_states
            )
            use_cache = use_cache if use_cache is not None else self.config.use_cache

            return_dict = (
                return_dict if return_dict is not None else self.config.use_return_dict
            )

            # retrieve input_ids and inputs_embeds
            if input_ids is not None:
                batch_size, seq_length = input_ids.shape
            else:
                raise ValueError(
                    "You have to specify either input_ids or inputs_embeds"
                )

            seq_length_with_past = seq_length
            cache_length = 0

            # ``past_key_values`` is None above, so ``cache_length`` stays 0.
            if past_key_values is not None:
                cache_length = paddle.shape(past_key_values[0])[1]
                seq_length_with_past += cache_length
            # Cast the embedding output to the embedding weight's dtype.
            inputs_embeds = self.embed_tokens(input_ids).astype(
                self.embed_tokens.weight.dtype
            )

            if self.config.sequence_parallel:
                # [B, S, H] -> [S, B, H]
                inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2])
            # Disabled token_type_ids resharding, kept from the original source:
            # if token_type_ids is not None:
            #     token_type_ids = token_type_ids.reshape([-1, 1])
            #     token_type_ids = dist.reshard(
            #         token_type_ids, global_mesh, [dist.Replicate() for _ in range(len(global_mesh._shape))]
            #     )
            #     token_type_ids = token_type_ids.reshape([-1])
            global_mesh = global_mesh_starts_with_pp()

            if position_ids is not None:
                # Replicate position ids on every axis of the global mesh.
                position_ids = dist.shard_tensor(
                    position_ids,
                    global_mesh,
                    [dist.Replicate() for _ in range(len(global_mesh._shape))],
                )

            can_use_fa = self.config.use_flash_attn and flash_attention is not None
            can_mem_eff_attn = (
                self.config.use_mem_eff_attn and inbatch_pack_offset is not None
            )
            if can_use_fa or can_mem_eff_attn:
                # The mask is discarded when either attention kernel is usable.
                if attention_mask is not None:
                    attention_mask = None

            elif attention_mask is None:
                # No mask supplied: start from an all-ones boolean mask.
                attention_mask = paddle.ones(
                    (batch_size, seq_length_with_past), dtype=paddle.bool
                )
            if attention_mask is not None:
                attention_mask = self._prepare_decoder_attention_mask(
                    attention_mask,
                    (batch_size, seq_length),
                    cache_length,
                    inputs_embeds.dtype,
                )
                attention_mask = dist.shard_tensor(
                    attention_mask,
                    global_mesh,
                    [dist.Replicate() for _ in range(len(global_mesh._shape))],
                )
            hidden_states = inputs_embeds
            if self.config.tensor_parallel_degree > 1:
                hidden_states = dist.reshard(
                    hidden_states, get_mesh(0), self.placements
                )

            # Repack so the code below handles every stage uniformly.
            args = return_args(hidden_states, attention_mask, position_ids)

        # decoder layers
        hidden_states, attention_mask, position_ids = parse_args(args)

        # NOTE(review): ``all_hidden_states`` and ``all_router_loss`` are built
        # here but not read again in this method.
        all_hidden_states = () if output_hidden_states else None

        all_router_loss = None
        if hasattr(self.config, "use_moe") and self.config.use_moe:
            all_router_loss = paddle.to_tensor(0.0)
            all_router_loss = dist.shard_tensor(
                all_router_loss, get_mesh(0), dist.Replicate()
            )

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        # Recompute is only used when the input takes part in autograd.
        has_gradient = not hidden_states.stop_gradient
        # Reshard auxiliary inputs, fully replicated, onto the mesh returned
        # by ``get_mesh(self.ipp)``.
        if position_ids is not None:
            position_ids_input = dist.reshard(
                position_ids,
                get_mesh(self.ipp),
                [dist.Replicate(), dist.Replicate()],
            )
        else:
            position_ids_input = position_ids
        attention_mask_input = (
            dist.reshard(
                attention_mask,
                get_mesh(self.ipp),
                [dist.Replicate(), dist.Replicate()],
            )
            if attention_mask is not None
            else None
        )
        token_type_ids_input = (
            dist.reshard(
                token_type_ids,
                get_mesh(self.ipp),
                [dist.Replicate(), dist.Replicate()],
            )
            if token_type_ids is not None
            else None
        )
        if self.config.use_recompute and has_gradient:
            layer_outputs = self.recompute_training(
                self.layer,
                hidden_states,
                attention_mask_input,
                position_ids_input,
                output_attentions,
                past_key_value,
                use_cache,
                inbatch_pack_offset,
                token_type_ids_input,
            )
        else:
            layer_outputs = self.layer(
                hidden_states,
                attention_mask_input,
                position_ids_input,
                output_attentions,
                past_key_value,
                use_cache,
                inbatch_pack_offset,
                token_type_ids_input,
            )
        # Only the first element of a tuple/list layer output is kept.
        if isinstance(layer_outputs, (tuple, list)):
            hidden_states = layer_outputs[0]
        else:
            hidden_states = layer_outputs

        # The un-resharded mask / position ids are passed on to the next stage.
        ret_args = return_args(
            hidden_states,
            attention_mask,
            position_ids,
        )
        if self.norm is not None:
            hidden_states = self.norm(hidden_states)
        if self.lm_head is not None:
            # With an LM head present the stage output is the logits only.
            logits = self.lm_head(hidden_states)
            ret_args = return_args(
                logits,
            )

        return ret_args
class ErniePretrainingCriterionPP(ErniePretrainingCriterion):
    """
    Criterion for Ernie under pipeline parallelism.

    Delegates to ``ErniePretrainingCriterion.forward`` and returns only the
    first element of its result as the final loss.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(self, prediction_scores, masked_lm_labels, router_loss=None):
        """
        Calculate the final loss.

        Args:
            prediction_scores: Model outputs passed through to the parent criterion.
            masked_lm_labels: Labels passed through to the parent criterion.
            router_loss: Accepted for interface compatibility; not used here.

        Returns:
            ``losses[0]`` of the parent criterion's result.

        Raises:
            RuntimeError: if the parent criterion returns ``None``.
        """
        losses = super().forward(prediction_scores, masked_lm_labels)
        if losses is None:
            # Previously this printed "err" and then crashed with an
            # UnboundLocalError on ``return loss``; fail with a clear message.
            raise RuntimeError(
                "ErniePretrainingCriterion.forward returned None; cannot extract the loss"
            )
        return losses[0]
class ErnieForCausalLMAutoPP(ErniePretrainedModelAuto):
    """
    ErnieForCausalLMAutoPP is the model class for causal language modeling.

    The model is a flat list of ``ErnieDecoderLayerAutoPP`` stages; each stage
    is built with the pipeline stage id computed for its layer index.
    """

    def __init__(self, config):
        """
        Build the per-layer pipeline stages and the criterion.

        Args:
            config (Config): Config object containing hyperparameters and other
                configuration details. ``config.initializer_range`` is
                overwritten in place with ``sqrt(0.3333 / hidden_size)``.

        Raises:
            AssertionError: if the sequence-parallel settings are inconsistent.
            ValueError: if there are fewer hidden layers than
                ``pp_degree * virtual_pp_degree`` pipeline chunks.
        """
        super().__init__(config)

        if config.sequence_parallel:
            logger.info(f"using sequence_parallel, input seqlen={config.seqlen}")
            if config.using_dynamic_sequence_length:
                # The original asserted ``not config.micro_batch_size``, which
                # contradicts both the message and ErnieConfig (which requires
                # micro_batch_size > 0 here) and therefore always failed.
                assert (
                    config.micro_batch_size and config.micro_batch_size > 0
                ), "sequence-parallel needs micro_batch_size setting when using dygramic_sequnence_length"
            else:
                assert config.seqlen is not None

            assert (
                config.tensor_parallel_degree > 1
            ), f"sequence-parallel needs mp>1, got mp={config.tensor_parallel_degree}"

        # initialize-trick for big model, see
        # https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/README.md#std-init
        new_initializer_range = math.sqrt(0.3333 / config.hidden_size)
        logger.info(
            f"change initializer-range from {config.initializer_range} to {new_initializer_range}"
        )
        config.initializer_range = new_initializer_range
        self.config = config
        self.criterion = ErniePretrainingCriterionPP(config)

        if self.config.use_rmsnorm:
            if self.config.fuse_rms_norm:
                logger.info("Use fusedRMSNorm")
            else:
                logger.info("Use normal RMSNorm")
        else:
            if self.config.fuse_ln:
                logger.info("Use fusedLN")
            else:
                logger.info("Use normal LayerNorm")

        decoder_layers = []
        if config.num_hidden_layers > 0:
            # Layers are split into pp_degree * virtual_pp_degree equal chunks;
            # chunk c is placed on stage c % pp_degree. Computed once, not per layer.
            pp_degree = global_mesh_starts_with_pp().shape[0]
            num_chunks = pp_degree * self.config.virtual_pp_degree
            chunk_size = self.config.num_hidden_layers // num_chunks
            if chunk_size == 0:
                # Would otherwise surface as an opaque ZeroDivisionError below.
                raise ValueError(
                    f"num_hidden_layers={config.num_hidden_layers} is smaller than "
                    f"pp_degree * virtual_pp_degree = {num_chunks}"
                )

            for layer_id in range(config.num_hidden_layers):
                pp_stage_id = (layer_id // chunk_size) % pp_degree
                decoder_layers.append(
                    ErnieDecoderLayerAutoPP(config, layer_id, pp_stage_id)
                )
        self.layers = nn.LayerList(decoder_layers)

    def forward(
        self,
        input_ids,
        labels=None,
        position_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        use_cache=False,
        past_key_values=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=False,
        ignored_index=0,
        inbatch_pack_offset=None,
        token_type_ids=None,
    ):
        """
        Chain all stages and return the first element of the last stage output.

        Only ``input_ids``, ``attention_mask`` and ``position_ids`` are used;
        every other parameter is accepted for interface compatibility and
        ignored by this method.
        """
        outputs = return_args(input_ids, attention_mask, position_ids)

        for layer in self.layers:
            outputs = layer(outputs)

        return outputs[0]
+ +""" Ernie model configuration""" +import logging +import json +from typing import Union +import paddle.distributed.communication.group + +from paddleformers.transformers.configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +__all__ = [ + "ERNIE_PRETRAINED_INIT_CONFIGURATION", + "ErnieMoEConfig", +] + +ERNIE_PRETRAINED_INIT_CONFIGURATION = { + "ernie/tiny-random-ernie": { + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 2048, + "model_type": "ernie", + "num_attention_heads": 2, + "num_hidden_layers": 2, + "rms_norm_eps": 1e-06, + "vocab_size": 32000, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "use_cache": False, + "use_recompute": False, + "use_flash_attn": True, + "use_pure_fp16": False, + }, +} + + +class ErnieConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Ernie-7B. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. 
class ErnieConfig(PretrainedConfig):
    r"""
    Configuration class for the Ernie model.

    Stores the architecture hyperparameters together with training-time
    switches (recompute, fused kernels, FP8, ...). Configuration objects
    inherit from [`PretrainedConfig`]; read its documentation for the options
    handled by the base class.

    Args (most commonly used; see the signature for the full list):
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Ernie model.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            Maximum number of positions.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers.
        num_attention_heads (`int`, *optional*, defaults to 2):
            Number of attention heads for each attention layer.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation used for weight initialization.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the last key/values attentions.
        use_recompute_attn (`bool`, *optional*, defaults to `False`):
            When `True`, `use_recompute` is forced to `False`.
        recompute_num_layers (`int`, *optional*):
            Defaults to `num_hidden_layers` when not given.
        fp8_configs / fp8_mem_configs / fp8_fused_ops_configs (`dict`, *optional*):
            Overrides merged (recursively for nested dicts) into the built-in
            FP8 defaults.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.

    Note:
        `token_loss_equal_weight`, `use_combine_before_a2a` and
        `use_quant_before_a2a` are accepted by `__init__` but not stored by
        this class.
    """

    model_type = "ernie"
    attribute_map = {
        "n_positions": "max_position_embeddings",
        "n_embd": "hidden_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "n_inner": "intermediate_size",
        "activation_function": "hidden_act",
    }
    pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        head_dim=None,
        initializer_range=0.02,  # no use
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attn=True,
        use_mem_eff_attn=False,
        use_flash_attn_with_mask=False,
        use_recompute=False,
        use_recompute_attn=False,
        recompute_use_reentrant=False,
        use_rmsnorm=True,
        z_loss_lambda=None,
        fuse_rms_norm=False,
        fuse_ln=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        fuse_attn_ffn=False,
        fuse_swiglu=False,
        use_bias=False,
        expert_mlp_use_bias=None,
        rope_reorder=True,
        rope_theta=10000,
        fuse_rope=False,
        use_fast_ln=False,
        weight_share_add_bias=True,
        fuse_linear=False,
        seqlen=False,
        ignored_index=-100,
        remove_tail_layer=False,
        use_recompute_lm_head=False,
        use_recompute_loss_fn=False,
        use_recompute_mtp=False,
        use_recompute_dnd=False,
        selective_no_recompute_num=0,
        use_mp_gathered_weight=False,
        refined_recompute=None,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        quant_bits=-1,
        num_key_value_heads=None,
        submatrix_parallel=False,
        submatrix_parallel_low_memory=True,
        use_sparse_head_and_loss_fn=False,
        using_dynamic_sequence_length=False,
        micro_batch_size=-1,
        using_precision_check=False,
        use_qk_norm=False,
        use_tpsp_comm_overlap=False,
        offload_pp_data_chunk_size=0,
        use_fused_head_loss_fn=False,
        use_recompute_resampler=False,
        resampler_fuse_rms_norm=False,
        token_loss_equal_weight=False,
        token_balance_loss=False,
        token_balance_seqlen=False,
        use_fp8=False,
        fp8_configs=None,
        use_fp8_mlp=False,
        fp8_mem_configs=None,
        fp8_fused_ops_configs=None,
        drop_before_deepep=False,
        deepep_drop_padding=False,
        disable_pipeline_warmup=False,
        skip_align_position_id=False,
        rope_3d=False,
        freq_allocation=0,
        moe_layer_feed_fake_token=False,
        decoderlayer_act_offload_settings=None,
        loss_subbatch_seqlen=32768,
        gate_force_zero_padding_grad=False,
        recompute_num_layers=None,
        use_combine_before_a2a=False,
        use_quant_before_a2a=False,
        rope_yarn_config=None,
        **kwargs,
    ):
        # Dict-valued options default to None and get a fresh object per
        # instance: the previous literal defaults were stored by reference and
        # therefore shared (and mutable) across every config instance.
        if refined_recompute is None:
            refined_recompute = {}
        if fp8_configs is None:
            fp8_configs = {}
        if fp8_mem_configs is None:
            fp8_mem_configs = {}
        if fp8_fused_ops_configs is None:
            fp8_fused_ops_configs = {}
        if decoderlayer_act_offload_settings is None:
            decoderlayer_act_offload_settings = {"type": "", "value": ""}
        if rope_yarn_config is None:
            rope_yarn_config = {}

        if "tie_word_embeddings" not in kwargs:
            kwargs["tie_word_embeddings"] = False
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # Order matters: __setattr__ asserts that use_recompute and
        # use_recompute_attn are never both true.
        self.use_recompute_attn = use_recompute_attn
        if use_recompute_attn:
            logger.warning("set `use_recompute_attn`=True, disabling `use_recompute`")
            use_recompute = False
        self.use_recompute = use_recompute
        self.recompute_num_layers = (
            recompute_num_layers
            if recompute_num_layers is not None
            else num_hidden_layers
        )
        self.use_flash_attn = use_flash_attn
        self.recompute_use_reentrant = recompute_use_reentrant
        self.use_mem_eff_attn = use_mem_eff_attn
        self.use_flash_attn_with_mask = use_flash_attn_with_mask
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.fuse_attn_ffn = fuse_attn_ffn
        self.fuse_swiglu = fuse_swiglu
        self.fuse_rms_norm = fuse_rms_norm
        self.fuse_ln = fuse_ln
        self.use_rmsnorm = use_rmsnorm
        self.z_loss_lambda = z_loss_lambda
        self.using_dynamic_sequence_length = using_dynamic_sequence_length
        if using_dynamic_sequence_length:
            assert (
                micro_batch_size > 0
            ), "micro_batch_size should be set when using_dynamic_sequence_length"
        self.micro_batch_size = micro_batch_size
        self.using_precision_check = using_precision_check
        self.use_qk_norm = use_qk_norm

        self.seqlen = seqlen
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_reorder = rope_reorder
        self.rope_yarn_config = rope_yarn_config
        self.rope_theta = rope_theta
        self.fuse_rope = fuse_rope
        self.use_fast_ln = use_fast_ln

        self.fuse_linear = fuse_linear
        self.ignored_index = ignored_index
        self.remove_tail_layer = remove_tail_layer
        self.use_recompute_lm_head = use_recompute_lm_head
        self.use_recompute_loss_fn = use_recompute_loss_fn
        self.use_recompute_mtp = use_recompute_mtp
        self.use_recompute_dnd = use_recompute_dnd

        self.use_mp_gathered_weight = use_mp_gathered_weight
        self.selective_no_recompute_num = selective_no_recompute_num  # only PP

        self.refined_recompute = refined_recompute
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.skip_recompute_ops = dict()
        self.quant_bits = quant_bits
        self.num_key_value_heads = num_key_value_heads
        self.submatrix_parallel = submatrix_parallel
        self.submatrix_parallel_low_memory = submatrix_parallel_low_memory
        self.use_sparse_head_and_loss_fn = use_sparse_head_and_loss_fn
        self.use_tpsp_comm_overlap = use_tpsp_comm_overlap
        self.offload_pp_data_chunk_size = offload_pp_data_chunk_size
        self.use_fused_head_loss_fn = use_fused_head_loss_fn
        self.use_recompute_resampler = use_recompute_resampler
        self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
        self.token_balance_loss = token_balance_loss
        self.token_balance_seqlen = token_balance_seqlen
        self.rope_3d = rope_3d
        self.freq_allocation = freq_allocation
        self.decoderlayer_act_offload_settings = decoderlayer_act_offload_settings
        self.loss_subbatch_seqlen = loss_subbatch_seqlen
        self.gate_force_zero_padding_grad = gate_force_zero_padding_grad

        # Default fp8 settings.
        default_fp8_configs = {
            "quant_scheme": "DelayedScaling",
            "recipe": {
                "format": "hybrid",
                "calibrating": True,
                "amax_history_len": 1024,
                "amax_compute_algo": "max",
                "fuse_wgrad_accumulation": False,
                "quant_weight_at_first_microbatch": False,
            },
            "layers": {
                "attn_fc1_linear": True,
                "attn_fc2_linear": True,
                "mlp_fc1_linear": True,
                "mlp_fc2_linear": True,
                "attn_tp_fc1_linear": True,
                "attn_tp_fc2_linear": True,
                "mlp_tp_fc1_linear": True,
                "mlp_tp_fc2_linear": True,
            },
            "smooth_swiglu": False,
        }

        def update_nested_dict(default_dict, update_dict):
            """Merge ``update_dict`` into ``default_dict`` in place, recursing
            only where both sides hold a dict under the same key."""
            for key, value in update_dict.items():
                if (
                    isinstance(value, dict)
                    and key in default_dict
                    and isinstance(default_dict[key], dict)
                ):
                    update_nested_dict(default_dict[key], value)
                else:
                    default_dict[key] = value

        # Apply the caller's overrides on top of the defaults.
        update_nested_dict(default_fp8_configs, fp8_configs)
        self.fp8_configs = default_fp8_configs
        self.use_fp8 = use_fp8
        self.expert_mlp_use_bias = expert_mlp_use_bias
        self.use_fp8_mlp = use_fp8_mlp
        default_fp8_mem_configs = {
            "shared_expert": False,
            "recompute_fwd_gate_up": False,
            "dequant_input": False,
        }
        update_nested_dict(default_fp8_mem_configs, fp8_mem_configs)
        self.fp8_mem_configs = default_fp8_mem_configs
        default_fp8_fused_ops_configs = {
            "stack_quant": False,
            "swiglu_probs_bwd": False,
            "split_group_gemm": True,
        }
        update_nested_dict(default_fp8_fused_ops_configs, fp8_fused_ops_configs)
        self.fp8_fused_ops_configs = default_fp8_fused_ops_configs
        self.drop_before_deepep = drop_before_deepep
        self.deepep_drop_padding = deepep_drop_padding
        self.disable_pipeline_warmup = disable_pipeline_warmup
        self.skip_align_position_id = skip_align_position_id
        self.moe_layer_feed_fake_token = moe_layer_feed_fake_token

        # NOTE(review): ``sequence_parallel`` and ``tensor_parallel_degree``
        # are not set in this class; presumably provided by PretrainedConfig
        # (via kwargs) — confirm.
        if self.sequence_parallel:
            assert (
                self.using_dynamic_sequence_length or self.seqlen
            ), "seqlen not provided in sequence-parallel when not using dygramic sequence length"

            assert (
                self.tensor_parallel_degree > 1
            ), f"senquence-parallel only works in mp, got mp={self.tensor_parallel_degree}"

        # Runtime-only switches are registered so they are not saved.
        self.register_nonsaveable_keys("use_recompute")
        self.register_nonsaveable_keys("recompute_use_reentrant")
        self.register_nonsaveable_keys("refined_recompute")
        self.register_nonsaveable_keys("use_recompute_attn")
        self.register_nonsaveable_keys("use_recompute_lm_head")
        self.register_nonsaveable_keys("use_recompute_mtp")
        self.register_nonsaveable_keys("use_recompute_dnd")
        self.register_nonsaveable_keys("use_recompute_loss_fn")
        self.register_nonsaveable_keys("using_precision_check")
        self.register_nonsaveable_keys("skip_recompute_ops")

    def __setattr__(self, name: str, value):
        """Set the attribute, then enforce that ``use_recompute`` and
        ``use_recompute_attn`` are not both enabled."""
        super().__setattr__(name, value)
        if getattr(self, "use_recompute", False):
            assert not getattr(
                self, "use_recompute_attn", False
            ), "cannot set `use_recompute_attn=True` when `use_recompute=True`"

    def register_nonsaveable_keys(self, keys):
        """
        Forward to whichever registration hook the base class provides.

        Tries ``register_nonsaveable_keys`` first, then
        ``register_unsavable_keys``.

        Raises:
            AttributeError: if the base class offers neither method.
        """
        if hasattr(super(), "register_nonsaveable_keys"):
            return super().register_nonsaveable_keys(keys)
        elif hasattr(super(), "register_unsavable_keys"):
            return super().register_unsavable_keys(keys)
        else:
            raise AttributeError(
                "register_nonsaveable_keys not found in PretrainedConfig"
            )
+ num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + ```python + >>> from paddleformers.transformer import ErnieModel, ErnieConfig + + >>> # Initializing a Ernie ernie-7b style configuration + >>> configuration = ErnieConfig() + + >>> # Initializing a model from the ernie-7b style configuration + >>> model = ErnieModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "ernie" + attribute_map = { + "n_positions": "max_position_embeddings", + "n_embd": "hidden_size", + "n_layer": "num_hidden_layers", + "n_head": "num_attention_heads", + "n_inner": "intermediate_size", + "activation_function": "hidden_act", + } + pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + moe_num_experts: Union[int, list] = 0, + use_fake_gate=False, + use_recompute_moe=False, + moe_capacity=(), + moe_layer_interval=2, + moe_layer_start_index: Union[int, list] = 0, + moe_layer_end_index: Union[int, list] = -1, + moe_aux_loss_lambda=1e-2, + moe_z_loss_lambda=1e-4, + moe_orthogonal_loss_lambda=1e-2, + moe_use_size_all2all=False, + sinkhorn_2gate=True, + sinkhorn_temp=3e-2, + 
global_aux_loss=False, + moe_dropout_prob=0.0, + moe_group="world", + moe_gate="top2", + moe_num_attn_experts=False, + moe_logging=False, + num_experts_per_tok: int = 8, + moe_intermediate_size: Union[int, list] = 0, + moe_num_shared_experts: int = 0, + moe_num_dense_experts: int = 0, + moe_dense_experts_token_type_id: int = 3, + moe_multimodal_dispatch_use_allgather: str = "", + moe_multimodal_paired_experts: bool = False, + moe_reverse_token_drop: bool = False, + moe_gate_act: str = "softmax", + moe_norm_gate_logits=True, + moe_use_hard_gate: bool = False, + moe_use_bpr: bool = False, + moe_fuse_experts: bool = False, + moe_all_to_all_dropout: float = 0.0, + moe_use_token_type_bias: bool = False, + moe_k=2, + moe_use_aux_free: bool = False, + moe_group_experts: bool = False, + moe_group_orthogonal_loss: bool = False, + moe_with_send_router_loss: bool = True, + enable_delay_scale_loss: bool = True, + num_acc_steps: int = None, + insert_empty_layer: list = None, + pp_no_recompute_layer: list = None, + multi_token_pred_depth: int = 0, + multi_token_pred_lambda: float = 0.3, + fuse_gate_detach_matmul: bool = False, + enable_mtp_magic_send: bool = False, + use_elastic_topk: bool = False, + use_deepep: bool = False, + use_elastic_expert_num: bool = False, + elastic_min_expert_num: int = 0, + all_expert_ratio: float = 1.0, + use_elastic_topk_for_mbs: bool = False, + elastic_min_topk: int = 1, + elastic_max_topk: int = None, + n_group: int = 0, + topk_group: int = 0, + scaling_factor: float = None, + aux_loss_type: str = "", + deepep_fine_grained: bool = False, + deepep_use_fused: bool = False, + deepep_tokens_per_subbatch: int = 0, + use_linear_residual_norm_recompute: bool = False, + use_rms_qkv_recompute: bool = False, + build_skip_comm_buffer: bool = False, + use_norm_gate_recompute: bool = False, + moe_state_dict_use_global_expert_id: bool = False, + enable_entropy_logging: bool = False, + use_fp8_fuse_node: bool = False, + use_combine_before_a2a: bool = False, + 
use_fp8_dispatch_a2a: bool = False, + use_ep_comm_overlap: bool = False, + **kwargs, + ): + """ + config + """ + if use_recompute_moe: + logger.warning("set `use_recompute_moe`=True, disabling `use_recompute`") + kwargs["use_recompute"] = False + super().__init__(**kwargs) + # moe + self.use_fake_gate = use_fake_gate + self.use_recompute_moe = use_recompute_moe + self.moe_num_experts = moe_num_experts + self.moe_capacity = moe_capacity + self.moe_aux_loss_lambda = moe_aux_loss_lambda + self.moe_z_loss_lambda = moe_z_loss_lambda + self.moe_orthogonal_loss_lambda = moe_orthogonal_loss_lambda + self.global_aux_loss = global_aux_loss + self.sinkhorn_2gate = sinkhorn_2gate + self.sinkhorn_temp = sinkhorn_temp + self.moe_layer_interval = moe_layer_interval + self.moe_dropout_prob = moe_dropout_prob + self.moe_group = moe_group + self.moe_gate = moe_gate + self.moe_num_attn_experts = moe_num_attn_experts + # implemtent size-all2all as https://arxiv.org/pdf/2303.06182.pdf + self.moe_use_size_all2all = moe_use_size_all2all + self.moe_logging = moe_logging + self.num_experts_per_tok = num_experts_per_tok + self.moe_num_shared_experts = moe_num_shared_experts + self.moe_num_dense_experts = moe_num_dense_experts + self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id + self.moe_intermediate_size = moe_intermediate_size + self.moe_reverse_token_drop = moe_reverse_token_drop + self.moe_use_hard_gate = moe_use_hard_gate + self.moe_fuse_experts = moe_fuse_experts + self.moe_k = moe_k + self.moe_all_to_all_dropout = moe_all_to_all_dropout + self.moe_use_token_type_bias = moe_use_token_type_bias + self.moe_use_bpr = moe_use_bpr + self.moe_group_experts = moe_group_experts + self.moe_group_orthogonal_loss = moe_group_orthogonal_loss + # optimize send without router loss + self.moe_with_send_router_loss = moe_with_send_router_loss + self.enable_delay_scale_loss = enable_delay_scale_loss + self.num_acc_steps = num_acc_steps + self.moe_layer_start_index = 
moe_layer_start_index + self.moe_layer_end_index = ( + self.num_hidden_layers - 1 + if moe_layer_end_index == -1 + else moe_layer_end_index + ) + self.moe_multimodal_dispatch_use_allgather = ( + moe_multimodal_dispatch_use_allgather + ) + self.moe_multimodal_paired_experts = moe_multimodal_paired_experts + self.moe_gate_act = moe_gate_act + self.moe_norm_gate_logits = moe_norm_gate_logits + self.moe_use_aux_free = moe_use_aux_free + self.fuse_gate_detach_matmul = fuse_gate_detach_matmul + if insert_empty_layer is not None: + assert isinstance( + insert_empty_layer, list + ), "insert_empty_layer should be a list" + else: + insert_empty_layer = [] + + # Overlap A2A communication with shared expert and auxiliary loss. + self.use_ep_comm_overlap = use_ep_comm_overlap + # Move the combine operation before A2A communication. + self.use_combine_before_a2a = use_combine_before_a2a + # Use FP8 for dispatch communication. + self.use_fp8_dispatch_a2a = use_fp8_dispatch_a2a + + # Multi-Token Prediction (MTP) + self.multi_token_pred_depth = multi_token_pred_depth + self.multi_token_pred_lambda = multi_token_pred_lambda + self.enable_mtp_magic_send = enable_mtp_magic_send + + # The insert_empty_layer is a list of integer which will be used under pipeline parallel. + # After each layer indicated in the insert_empty_layer, an empty layer will be inserted. 
+ # For example, a model with 4 layers, insert_empty_layer = [1, 3], the model actually passed to + # pp is: transformer, transformer, EMPTY, transformer, transformer, EMPTY + self.insert_empty_layer = insert_empty_layer + + # elastic + self.use_elastic_topk = use_elastic_topk + self.use_elastic_expert_num = use_elastic_expert_num + self.elastic_min_expert_num = elastic_min_expert_num + self.all_expert_ratio = all_expert_ratio + self.use_elastic_topk_for_mbs = use_elastic_topk_for_mbs + self.elastic_min_topk = elastic_min_topk + if elastic_max_topk is None: + self.elastic_max_topk = self.moe_k * 2 - 1 + + # Using fusion expert node in moe layer. + self.use_fp8_fuse_node = use_fp8_fuse_node + + # Perform MoE computation at expert granularity. + self.deepep_fine_grained = deepep_fine_grained + # Requires deepep_fine_grained to be enabled; further disperses token + # granularity within experts to compute subbatches. + self.deepep_tokens_per_subbatch = deepep_tokens_per_subbatch + # Fuse combine and scatter operations when using BF16 for expert computation. + self.deepep_use_fused = deepep_use_fused + + assert not ( + self.use_combine_before_a2a and self.use_deepep + ), "combine_before_a2a is not supported for deepep now." + + assert not ( + self.use_fp8_dispatch_a2a and not self.use_fp8_fuse_node + ), "fp8_dispatch_a2a must be used with use_fp8_fuse_node." + + assert not ( + self.use_fp8_dispatch_a2a and self.use_ep_comm_overlap + ), "fp8_dispatch_a2a connot be used with use_ep_comm_overlap." + + if self.deepep_tokens_per_subbatch: + assert ( + self.deepep_fine_grained + ), "deepep_fine_grained must be enabled when deepep_tokens_per_subbatch is set." 
+ + # node limit routing + self.n_group = n_group + self.topk_group = topk_group + + # router scaling_factor + self.scaling_factor = scaling_factor + + self.build_skip_comm_buffer = build_skip_comm_buffer + + # router loss type + assert aux_loss_type in ["", "default", "seq_aux_loss", "switch_aux_loss"] + self.aux_loss_type = aux_loss_type + + self.use_deepep = use_deepep + if self.moe_multimodal_paired_experts and isinstance( + self.moe_num_experts, (tuple, list) + ): + logger.warning( + "moe_num_experts must be one element when using paired experts" + ) + self.moe_num_experts = self.moe_num_experts[0] + + if pp_no_recompute_layer is not None: + assert isinstance( + insert_empty_layer, list + ), "pp_no_recompute_layer should be a list" + + # Indicating layers not do recompute under pipeline parallel. + # Note that, when insert_empty_layer is not None, the pp_no_recompute_layer should be indicating + # layers number in origin model structure, AKA model before insert empty layers. + self.pp_no_recompute_layer = pp_no_recompute_layer + self.register_nonsaveable_keys("moe_group") + self.register_nonsaveable_keys("pp_no_recompute_layer") + + if ( + self.moe_group in ["dp", "data"] + and self.moe_multimodal_dispatch_use_allgather + ): + assert ( + self.moe_num_shared_experts == 0 + ), "shared experts are not supported when using dp moe and moe_allgather_layer" + assert ( + self.moe_num_dense_experts == 0 + ), "dense experts are not supported when using dp moe and moe_allgather_layer" + + self.use_linear_residual_norm_recompute = use_linear_residual_norm_recompute + self.use_rms_qkv_recompute = use_rms_qkv_recompute + self.use_norm_gate_recompute = use_norm_gate_recompute + self.moe_state_dict_use_global_expert_id = moe_state_dict_use_global_expert_id + self.enable_entropy_logging = enable_entropy_logging + + @property + def multimodel_experts(self) -> bool: + + return ( + isinstance(self.moe_num_experts, (tuple, list)) + and len(self.moe_num_experts) > 1 + ) + + 
# NOTE(review): the three definitions below take ``self`` and directly follow
# the tail of an ``__init__`` in this patch, so they are presumably methods of
# the configuration class in models/ernie_moe/configuration.py. The class
# header is outside this chunk -- confirm before re-indenting them.
@property
def use_moe(self) -> bool:
    """Report whether this configuration declares at least one MoE expert.

    Returns:
        bool: ``True`` when the expert count is positive. When
        ``self.multimodel_experts`` is truthy, ``self.moe_num_experts`` is
        summed before the comparison; otherwise it is compared to zero
        directly.
    """
    if self.multimodel_experts:
        return sum(self.moe_num_experts) > 0
    return self.moe_num_experts > 0


def __setattr__(self, name: str, value):
    """Store the attribute, then re-validate the recompute flags.

    The check runs after every assignment, whichever attribute was set.

    Raises:
        AssertionError: if ``use_recompute`` and ``use_recompute_moe`` are
            both truthy once the new value is stored.
    """
    super().__setattr__(name, value)
    if not getattr(self, "use_recompute", False):
        return
    moe_recompute_enabled = getattr(self, "use_recompute_moe", False)
    assert (
        not moe_recompute_enabled
    ), "cannot set `use_recompute_moe=True` when `use_recompute=True`"


def to_json_string(self, use_diff: bool = True) -> str:
    """Serialize the configuration to an indented, key-sorted JSON string.

    Args:
        use_diff: only the literal ``True`` selects ``self.to_diff_dict()``;
            any other value (including other truthy values) selects
            ``self.to_dict()``.

    Returns:
        str: the JSON text followed by a single newline.

    Raises:
        TypeError: if a value is neither JSON-serializable nor a Paddle
            communication ``Group`` (groups are written as their ``repr``).
    """

    def _serializer(obj):
        # Only communication groups get a fallback representation.
        if not isinstance(obj, paddle.distributed.communication.group.Group):
            raise TypeError(f"Type {type(obj)} is not serializable")
        return repr(obj)

    config_dict = self.to_diff_dict() if use_diff is True else self.to_dict()
    serialized = json.dumps(
        config_dict,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        default=_serializer,
    )
    return serialized + "\n"


# ---------------------------------------------------------------------------
# Patch file boundary (git diff header, kept verbatim as comments):
# diff --git a/examples/pre-training/models/fp8_linear_auto.py b/examples/pre-training/models/fp8_linear_auto.py
# new file mode 100644
# index 00000000..0d1ea29c
# --- /dev/null
# +++ b/examples/pre-training/models/fp8_linear_auto.py
# @@ -0,0 +1,603 @@
# ---------------------------------------------------------------------------
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
FP8 Linear Layer Implementation for PaddlePaddle

This module implements FP8 (8-bit floating point) linear layers using PaddlePaddle's
incubate APIs for low-precision training. Key features include:

1. FP8 matrix multiplication with block-wise quantization
2. Memory-efficient forward/backward passes
3. PaddlePaddle-specific optimizations like:
    - Using paddle.incubate.fp8 APIs
    - Leveraging Paddle's automatic differentiation system
    - Optimized for Paddle's tensor layout and memory management
"""


import numpy
import paddle
from paddle.incubate.fp8 import deep_gemm
from paddle.incubate.nn.functional import swiglu

# Keep reference to original linear op for fallback if needed
original_linear = paddle.nn.functional.linear


# Expose only the main class to public API
__all__ = ["Fp8FusedMlp"]

# Row counts are padded up to a multiple of this value before every transposed
# "1x128" quantization in this module (see ``padding``).
_ROW_ALIGNMENT = 512


def fp8_gemm(
    x_fp8,
    x_scale,
    w_fp8,
    w_scale,
    is_a_1d_scaled,
    is_b_1d_scaled,
    out=None,
    rtn_dtype=paddle.bfloat16,
):
    """
    Performs FP8 matrix multiplication (GEMM) operation, using blockwise GEMM algorithm.

    Args:
        x_fp8 (Tensor): Input tensor in FP8 format
        x_scale (Tensor): Scaling factor for input tensor
        w_fp8 (Tensor): Weight tensor in FP8 format
        w_scale (Tensor): Scaling factor for weight tensor
        is_a_1d_scaled (bool): Whether input tensor uses 1D scaling
        is_b_1d_scaled (bool): Whether weight tensor uses 1D scaling
        out (Tensor, optional): Output tensor for accumulation. Defaults to None
        rtn_dtype (dtype, optional): Return data type. Defaults to paddle.bfloat16

    Returns:
        Tensor: Result of the matrix multiplication. When either operand has
        zero elements the kernel is skipped: the result is ``out`` plus a zero
        tensor if ``out`` was given, otherwise a zero tensor of shape
        ``[x_fp8.shape[0], w_fp8.shape[0]]``.
    """
    accumulate = out is not None
    if numpy.prod(x_fp8.shape) != 0 and numpy.prod(w_fp8.shape) != 0:
        # Using Paddle's blockwise FP8 GEMM with split accumulator for numerical stability
        return paddle.incubate.nn.functional.fp8_gemm_blockwise(
            a=x_fp8,
            a_decode_scale=x_scale,  # Input scaling factors
            b=w_fp8,
            b_decode_scale=w_scale,  # Weight scaling factors
            out_dtype=rtn_dtype,  # Output dtype (bfloat16)
            out=out,  # Optional output tensor for accumulation
            accumulate=accumulate,  # Whether to accumulate into out tensor
            use_split_accumulator=True,  # Paddle-specific optimization
            is_a_1d_scaled=is_a_1d_scaled,  # 1D scaling for input
            is_b_1d_scaled=is_b_1d_scaled,  # 1D scaling for weights
        )

    # An empty operand contributes nothing, so do not launch the kernel.
    y = paddle.zeros([x_fp8.shape[0], w_fp8.shape[0]], rtn_dtype)
    if out is not None:
        return out + y
    return y


def padding(x, axis):
    """
    Zero-pads ``x`` along ``axis`` up to the next multiple of 512.

    The previous two-step rule (pad to a multiple of 128 when that already
    lands on a multiple of 512, otherwise pad to a multiple of 512) always
    produced the next multiple of 512, so that is computed directly here.
    The zero block is built from the full shape of ``x``, which keeps the 2-D
    behavior unchanged and also handles tensors of other ranks.

    Args:
        x (Tensor): Input tensor to be padded
        axis (int): Axis along which to pad (0 for rows, 1 for columns)

    Returns:
        Tensor: ``x`` itself when the size is already aligned, otherwise a new
        tensor with zeros appended along ``axis``.
    """
    pad_size = -x.shape[axis] % _ROW_ALIGNMENT
    if pad_size == 0:
        return x
    pad_shape = list(x.shape)
    pad_shape[axis] = pad_size
    return paddle.concat([x, paddle.zeros(pad_shape, dtype=x.dtype)], axis=axis)


def _quant_rows(t):
    """Quantize ``t`` with the "1x128" method; returns ``(t_fp8, t_scale)``."""
    return paddle.incubate.nn.functional.fp8.fp8_quant_blockwise(
        t, quant_method="1x128", input_transpose=False, output_scale_transpose=True
    )


def _quant_padded_transpose(t):
    """Pad the rows of ``t`` and return only the transposed "1x128" outputs.

    Returns:
        tuple: the third and fourth outputs of ``fp8_quant_blockwise`` called
        with ``input_transpose=True`` (named ``*_t_fp8`` / ``*_t_scale`` by
        the callers).
    """
    _, _, t_t_fp8, t_t_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise(
        padding(t, 0),
        quant_method="1x128",
        input_transpose=True,
        output_scale_transpose=True,
    )
    return t_t_fp8, t_t_scale


def _quant_rows_with_transpose(t):
    """Quantize ``t`` and its transpose with the "1x128" method.

    When the row count is not a multiple of 512, the non-transposed outputs
    come from the unpadded tensor and the transposed outputs from a row-padded
    copy; otherwise a single call yields all four outputs.

    Returns:
        tuple: ``(t_fp8, t_scale, t_t_fp8, t_t_scale)``.
    """
    if t.shape[0] % _ROW_ALIGNMENT != 0:
        t_fp8, t_scale = _quant_rows(t)
        t_t_fp8, t_t_scale = _quant_padded_transpose(t)
        return t_fp8, t_scale, t_t_fp8, t_t_scale
    return paddle.incubate.nn.functional.fp8.fp8_quant_blockwise(
        t,
        quant_method="1x128",
        input_transpose=True,
        output_scale_transpose=True,
    )


def _quant_weight(w, transpose):
    """Quantize a weight with the "128x128" method.

    Args:
        w (Tensor): weight to quantize.
        transpose (bool): forwarded as ``input_transpose``. When true, the
            third and fourth outputs of the call are returned; otherwise the
            call's two outputs are returned.

    Returns:
        tuple: ``(w_fp8, w_scale)``.
    """
    outputs = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise(
        w,
        quant_method="128x128",
        input_transpose=transpose,
        output_scale_transpose=False,
    )
    if transpose:
        _, _, w_fp8, w_scale = outputs
    else:
        w_fp8, w_scale = outputs
    return w_fp8, w_scale


def _gemm_nt(a_fp8, a_scale, b_fp8, b_scale, out_dtype):
    """Run ``deep_gemm.gemm_fp8_fp8_bf16_nt`` into a freshly allocated output.

    The output has shape ``[a_fp8.shape[0], b_fp8.shape[0]]``. ``a_scale`` is
    passed transposed, exactly as every call site in this module did.
    """
    out = paddle.empty([a_fp8.shape[0], b_fp8.shape[0]], dtype=out_dtype)
    deep_gemm.gemm_fp8_fp8_bf16_nt((a_fp8, a_scale.T), (b_fp8, b_scale), out)
    return out


def _restore_leading_dims(flat, orig_shape):
    """Undo the ``[-1, last_dim]`` flattening applied to the MLP input.

    All leading dimensions of ``orig_shape`` are restored, so inputs with more
    than three dimensions get an output (and gradient) whose shape matches the
    input. For 3-D inputs this resolves to the same shape as the previous
    ``[shape[0], -1, last]`` form. Inputs with at most two dimensions are
    returned untouched.

    Args:
        flat (Tensor): 2-D result whose last dimension is kept.
        orig_shape: shape of the original input; a list in forward and a
            NumPy array in backward, hence the ``int`` conversion.
    """
    if len(orig_shape) > 2:
        leading = [int(dim) for dim in orig_shape[:-1]]
        return flat.reshape(leading + [flat.shape[-1]])
    return flat


def _fused_mlp_forward(x_fp8, x_scale, w1, w2, out_dtype):
    """Shared forward math: ``o1 = x @ w1``, ``o2 = swiglu(o1)``, ``o3 = o2 @ w2``.

    Args:
        x_fp8, x_scale: "1x128"-quantized 2-D input and its scales.
        w1, w2 (Tensor): unquantized weights; both are quantized here with
            ``input_transpose=True``.
        out_dtype: dtype of ``o1``; ``o3`` uses ``o1.dtype``.

    Returns:
        tuple: ``(o1, o3)``.
    """
    w1_t_fp8, w1_t_scale = _quant_weight(w1, transpose=True)
    o1 = _gemm_nt(x_fp8, x_scale, w1_t_fp8, w1_t_scale, out_dtype)

    o2 = swiglu(o1)
    o2_fp8, o2_scale = _quant_rows(o2)

    w2_t_fp8, w2_t_scale = _quant_weight(w2, transpose=True)
    o3 = _gemm_nt(o2_fp8, o2_scale, w2_t_fp8, w2_t_scale, o1.dtype)
    return o1, o3


def _fused_mlp_backward(do3, o1, w1, w2, x_t_fp8, x_t_scale, x_orig_shape):
    """Shared backward math for both PyLayers.

    Args:
        do3 (Tensor): output gradient, already flattened to 2-D.
        o1 (Tensor): first GEMM output (saved or recomputed by the caller).
        w1, w2 (Tensor): unquantized weights.
        x_t_fp8, x_t_scale: transposed quantized input produced from a
            row-padded input.
        x_orig_shape: shape of the original input, used to reshape ``dx``.

    Returns:
        tuple: ``(dx, dw1, dw2)``; ``dw1`` and ``dw2`` are requested from
        ``fp8_gemm`` as ``paddle.float32``.
    """
    # o2 is not stored by forward; rebuild it from o1.
    o2 = swiglu(o1)

    do3_fp8, do3_scale, do3_t_fp8, do3_t_scale = _quant_rows_with_transpose(do3)
    w2_fp8, w2_scale = _quant_weight(w2, transpose=False)
    do2 = _gemm_nt(do3_fp8, do3_scale, w2_fp8, w2_scale, do3.dtype)

    o2_t_fp8, o2_t_scale = _quant_padded_transpose(o2)
    dw2 = fp8_gemm(
        o2_t_fp8,
        o2_t_scale,
        do3_t_fp8,
        do3_t_scale,
        True,
        True,
        rtn_dtype=paddle.float32,
    )

    do1, _ = paddle._C_ops.swiglu_grad(o1, None, do2)

    do1_fp8, do1_scale, do1_t_fp8, do1_t_scale = _quant_rows_with_transpose(do1)
    w1_fp8, w1_scale = _quant_weight(w1, transpose=False)
    dx = _gemm_nt(do1_fp8, do1_scale, w1_fp8, w1_scale, do1.dtype)
    dx = _restore_leading_dims(dx, x_orig_shape)

    dw1 = fp8_gemm(
        x_t_fp8,
        x_t_scale,
        do1_t_fp8,
        do1_t_scale,
        True,
        True,
        rtn_dtype=paddle.float32,
    )
    return dx, dw1, dw2


class Fp8FusedMlpFunc(paddle.autograd.PyLayer):
    """
    Custom PyLayer implementation of FP8 fused MLP operation.

    Forward quantizes the input and both weights, runs two FP8 GEMMs around a
    SwiGLU activation, and saves the transposed quantized input together with
    ``o1`` so that backward does not have to recompute the first GEMM.
    """

    @staticmethod
    def forward(ctx, x, w1, w2):
        """
        Forward pass for FP8 fused multi-layer perceptron (MLP) operation.

        Args:
            ctx (PyLayerContext): Context object to save tensors for backward pass
            x (paddle.Tensor): Input tensor; flattened to
                ``[-1, hidden_size]`` internally
            w1 (paddle.Tensor): First weight matrix of shape [hidden_size, intermediate_size*2]
            w2 (paddle.Tensor): Second weight matrix of shape [intermediate_size, hidden_size]

        Returns:
            paddle.Tensor: Output with the leading dimensions of ``x`` and the
            last dimension of the second GEMM result.
        """
        x_orig_shape = x.shape
        x = x.reshape([-1, x_orig_shape[-1]])

        x_fp8, x_scale, x_t_fp8, x_t_scale = _quant_rows_with_transpose(x)
        o1, o3 = _fused_mlp_forward(x_fp8, x_scale, w1, w2, x.dtype)
        o3 = _restore_leading_dims(o3, x_orig_shape)

        ctx.save_for_backward(
            x_t_fp8,
            x_t_scale,
            w1,
            o1,
            w2,
            # The shape travels through save_for_backward as a CPU tensor.
            paddle.to_tensor(x_orig_shape, dtype="int64", place=paddle.CPUPlace()),
        )
        return o3

    @staticmethod
    def backward(ctx, do3):
        """
        Backward pass for FP8 fused MLP operation.

        Args:
            ctx: Context object containing saved tensors from forward pass
            do3 (Tensor): Gradient of the loss with respect to the output

        Returns:
            Tuple[Tensor, Tensor, Tensor]: Gradients with respect to x, w1, and w2
        """
        do3 = do3.reshape([-1, do3.shape[-1]])
        x_t_fp8, x_t_scale, w1, o1, w2, x_orig_shape = ctx.saved_tensor()
        return _fused_mlp_backward(
            do3, o1, w1, w2, x_t_fp8, x_t_scale, x_orig_shape.numpy()
        )


class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer):
    """
    Memory-optimized version of FP8 fused MLP operation.

    Compared with ``Fp8FusedMlpFunc``, forward saves only the quantized input
    (``x_fp8``, ``x_scale``) and the weights. Backward recomputes ``o1`` and
    rebuilds the transposed quantized input by dequantizing ``x_fp8``.

    NOTE(review): because the transposed input is re-quantized from the FP8
    copy rather than from the original activations, ``dw1`` is presumably not
    bit-identical to the one from ``Fp8FusedMlpFunc`` -- confirm if exact
    parity matters.
    """

    @staticmethod
    def forward(ctx, x, w1, w2):
        """
        Memory-efficient forward pass for FP8 fused MLP operation.

        Args:
            ctx (PyLayerContext): Context object to save minimal tensors for backward pass
            x (paddle.Tensor): Input tensor; flattened to
                ``[-1, hidden_size]`` internally
            w1 (paddle.Tensor): First weight matrix of shape [hidden_size, intermediate_size*2]
            w2 (paddle.Tensor): Second weight matrix of shape [intermediate_size, hidden_size]

        Returns:
            paddle.Tensor: Output with the leading dimensions of ``x`` and the
            last dimension of the second GEMM result.
        """
        x_orig_shape = x.shape
        x = x.reshape([-1, x_orig_shape[-1]])

        x_fp8, x_scale = _quant_rows(x)
        _, o3 = _fused_mlp_forward(x_fp8, x_scale, w1, w2, x.dtype)
        o3 = _restore_leading_dims(o3, x_orig_shape)

        ctx.save_for_backward(
            x_fp8,
            x_scale,
            w1,
            w2,
            # The shape travels through save_for_backward as a CPU tensor.
            paddle.to_tensor(x_orig_shape, dtype="int64", place=paddle.CPUPlace()),
        )
        return o3

    @staticmethod
    def backward(ctx, do3):
        """
        Backward pass that recomputes what forward did not save.

        Args:
            ctx: Context object containing saved tensors from forward pass
            do3 (Tensor): Gradient of the loss with respect to the output

        Returns:
            Tuple[Tensor, Tensor, Tensor]: Gradients with respect to x, w1, and w2
        """
        do3 = do3.reshape([-1, do3.shape[-1]])

        x_fp8, x_scale, w1, w2, x_orig_shape = ctx.saved_tensor()
        x_orig_shape = x_orig_shape.numpy()

        # Recompute the first GEMM instead of keeping o1 alive since forward.
        w1_t_fp8, w1_t_scale = _quant_weight(w1, transpose=True)
        o1 = _gemm_nt(x_fp8, x_scale, w1_t_fp8, w1_t_scale, do3.dtype)

        # Rebuild the transposed quantized input from the saved FP8 copy.
        x_dequant = paddle.incubate.nn.functional.fused_act_dequant(
            x_fp8, x_scale.T.contiguous()
        )
        x_t_fp8, x_t_scale = _quant_padded_transpose(x_dequant)

        return _fused_mlp_backward(
            do3, o1, w1, w2, x_t_fp8, x_t_scale, x_orig_shape
        )


class Fp8FusedMlp(paddle.nn.Layer):
    """
    PaddlePaddle Layer implementing FP8 fused multi-layer perceptron (MLP).

    This layer combines:
    - FP8 precision matrix operations
    - Fused MLP architecture with SWiGLU activation
    - A custom PyLayer (``Fp8FusedMlpFunc``) for the forward/backward passes

    """

    def __init__(self, config):
        """
        Initializes the FP8 Fused MLP layer.

        Args:
            config (object): Configuration object containing:
                - hidden_size (int): Dimension of the input/output features
                - intermediate_size (int): Dimension of the intermediate features

        Note:
            - Weights are initialized using Paddle's create_parameter
            - Uses bfloat16 precision for weight storage
            - No bias terms are used in this implementation
        """

        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        self.w1 = self.create_parameter(
            shape=[self.hidden_size, self.intermediate_size * 2],
            dtype="bfloat16",  # Using Paddle's bfloat16 dtype
            is_bias=False,  # Paddle-specific parameter attribute
        )
        self.w2 = self.create_parameter(
            shape=[self.intermediate_size, self.hidden_size],
            dtype="bfloat16",
            is_bias=False,
        )

    def forward(self, x):
        """
        Forward pass of the FP8 fused MLP layer.

        Args:
            x (Tensor): Input tensor

        Returns:
            Tensor: Output tensor after MLP transformation
        """
        return Fp8FusedMlpFunc.apply(x, self.w1, self.w2)


# ---------------------------------------------------------------------------
# Patch file boundary (git diff header, kept verbatim as comments):
# diff --git a/examples/pre-training/models/moe/moe_layer_auto.py b/examples/pre-training/models/moe/moe_layer_auto.py
# new file mode 100644
# index 00000000..0b7fc0cf
# --- /dev/null
# +++ b/examples/pre-training/models/moe/moe_layer_auto.py
# @@ -0,0 +1,851 @@
# ---------------------------------------------------------------------------
# !/usr/bin/env python3

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +"""_summary_ + +Returns: + _type_: _description_ +""" +from typing import Any, Tuple, List, Optional, Callable +import logging +from collections import namedtuple +from contextlib import contextmanager +from functools import partial + +import paddle +from paddle import framework +from paddle import nn +from paddle.distributed.communication import stream +import paddle.nn.functional as F +from paddle.distributed import in_auto_parallel_align_mode + +from paddle.autograd import PyLayer +from paddle.distributed.communication.group import Group +from paddle.distributed import fleet + +import paddle.distributed as dist +from paddle import Tensor +from paddleformers.trainer.plugins.timer import get_timers + + +from models.moe.top2_gate_auto import TopKGateFusedAuto +from models.moe.moe_utils_auto import get_flatten_mesh, get_mesh, _reshard +from models.moe.moe_layer_auto_utils import MOELayer + +try: + from src.utils.misc import global_training_logs +except ModuleNotFoundError: + global_training_logs = {} + + +logger = logging.getLogger(__name__) + +try: + import moe_ops +except ImportError: + moe_ops = None + logger.warning( + "`moe-ops` not found, run " + "`python3 src/ernie_core/ops/moe/setup.py install` to install" + ) + +try: + import moe_ops_auto +except ImportError: + moe_ops_auto = None + logger.warning( + "`moe_ops_auto` not found, run " + "`python3 src/ernie_core/ops/moe/setup_auto.py install` to install" + ) + +try: + import moe_combine_auto +except ImportError: + moe_combine_auto = None + logger.warning( + "`moe_combine_auto` not found, run " + "`python3 src/ernie_core/ops/moe/setup_auto.py install` to install" + ) + + +GateOutput = namedtuple( + "GateOutput", + [ + "aux", + "z", + "logits", + ], +) + + +@contextmanager +def profile(name): + """doc""" + if get_timers() is not None: + get_timers()(name).start() + yield + if get_timers() is not None: + get_timers()(name).stop() + + +class GateCombineForStatic(PyLayer): + """GateCombine""" + + @staticmethod + 
def forward(ctx, x, combine_weights, scatter_index): + """ + Input: + x: [seqlen * k, hidden_size] + combine_weights: [seqlen, k] + scatter_index: [seqlen, k] + Output: + y: [seqlen, hidden_size] + """ + ctx.save_for_backward(x, combine_weights, scatter_index) + assert moe_combine_auto is not None + return moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) + + @staticmethod + def backward(ctx, grad_y, *_): + """ + Input: + grad_y: [seqlen, hidden_size] + combine_weights: [seqlen, k] + scatter_index: [seqlen, k] + Output: + grad_x: [seqlen * k, hidden_size] + grad_combine_weight: [seqlen, k] + + """ + x, combine_weights, scatter_index = ctx.saved_tensor() + assert moe_combine_auto is not None + grad_x, grad_combine_weight_helper = moe_combine_auto.moe_combine_bwd_auto( + x, combine_weights, scatter_index, grad_y + ) + # grad_combine_weight_helper is the same shape with grad x [seqlen * K, dim] + # reduce the hidden shape + # TODO: implement reduce in cuda ops + grad_combine_weight = grad_combine_weight_helper.sum(-1) + # NOTE: PyLayer do not support some inputs with stop_gradient=True in static mode, + # this means that there must be a gradient for each input + scatter_index_grad = paddle.zeros_like(scatter_index) + return grad_x, grad_combine_weight, scatter_index_grad + + +class GateCombine(PyLayer): + """GateCombine""" + + @staticmethod + def forward(ctx, x, combine_weights, scatter_index): + """ + Input: + x: [seqlen * k, hidden_size] + combine_weights: [seqlen, k] + scatter_index: [seqlen, k] + Output: + y: [seqlen, hidden_size] + """ + ctx.x = x + ctx.combine_weights = combine_weights + ctx.scatter_index = scatter_index + assert moe_combine_auto is not None + return moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) + + @staticmethod + def backward(ctx, grad_y, *_): + """ + Input: + grad_y: [seqlen, hidden_size] + combine_weights: [seqlen, k] + scatter_index: [seqlen, k] + Output: + grad_x: [seqlen * k, hidden_size] + 
grad_combine_weight: [seqlen, k] + + """ + + assert moe_combine_auto is not None + grad_x, grad_combine_weight_helper = moe_combine_auto.moe_combine_bwd_auto( + ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y + ) + # grad_combine_weight_helper is the same shape with grad x [seqlen * K, dim] + # reduce the hidden shape + # TODO: implement reduce in cuda ops + grad_combine_weight = grad_combine_weight_helper.sum(-1) + return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None + + +def combining_fused_auto(x, combine_weights, scatter_index, hard_gate=False): + """ + Args: + x: Tensor[seq, dim] + combine_weights: [s, k] + scatter_index: ** [k, s] ** + + Returns: + y: Tensor[s, dim] + """ + if hard_gate: + x_gatherd = F.embedding(scatter_index, x) # [s,k,dim] + return x_gatherd.squeeze(-2) + ret = moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) + + ret.stop_gradient = False + return ret + + +def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): + + output = None + # init_output = paddle.zeros([num_experts * capacity, x.shape[-1]], dtype='float32') + # output = init_output + 0. 
* x.sum() + orig_dtype = x.dtype + scatter_index = scatter_index.unbind(1) + dispatch_mask = dispatch_mask.unbind(1) + for i_scatter_index, i_dispatch_mask in zip(scatter_index, dispatch_mask): + init_output = paddle.zeros( + [num_experts * capacity, x.shape[-1]], dtype="float32" + ) + updates = x * i_dispatch_mask.unsqueeze(-1).cast(x.dtype) + if output is None: + output = paddle.scatter( + init_output, + i_scatter_index, + updates, + overwrite=False, + ) + else: + output = output + paddle.scatter( + init_output, + i_scatter_index, + updates, + overwrite=False, + ) + if output.dtype != orig_dtype: + output = output.cast(orig_dtype) + return output + + +def combining(x, combine_weights, scatter_index): + + dim = x.shape[-1] + scatter_index = scatter_index.reshape([-1]) + num_k = combine_weights.shape[-1] + x = dist.reshard(x, get_mesh(0), [dist.Replicate(), dist.Shard(0)]) + combine_weights = combine_weights.unsqueeze(1) + # num_k = 2 + x = paddle.gather(x, scatter_index).reshape([-1, num_k, dim]) # [seq,2,dim] + return paddle.matmul(combine_weights, x).squeeze( + 1 + ) # [seq,1,2] @ [seq,2,dim] -> [seq,1,dim] + + +class AlltoAll(PyLayer): + """ + AlltoAll w/ backward + """ + + @staticmethod + def forward(ctx, x, group): + """ + All-to-all communication in the group. + """ + ctx.group = group + if dist.get_world_size(group) <= 1: + return x + output = paddle.empty_like(x) + output.stop_gradient = False + with profile("moe-all2all"): + stream.alltoall_single(output, x, None, None, group, True, True) + return output + + @staticmethod + def backward(ctx, *dx): + """backward""" + return AlltoAll.apply(*dx, group=ctx.group) + + +class AlltoAllAsync(PyLayer): + """ + AlltoAll async w/ backward + """ + + @staticmethod + def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): + """ + All-to-all communication in the group. 
+ Args: + x: Tensor + args: List[Any], argument(s) to `fn` + group: ProcessGroup + fn: callable, called while doing alltoall + is_first_fwd: if using recompute, don't record bacward when first forward + Returns: + x: Tensor + fn_out: List[Tensor] + """ + assert fn is not None, "use AlltoAll no async" + ctx.group = group + if dist.get_world_size(group) <= 1: + ctx.bwf, fn_out = manual_backward(fn, is_first_fwd, *fn_args) + return (x,) + fn_out + x_out = paddle.empty_like(x) + x_out.stop_gradient = False + with profile("moe-all2all"): + task = stream.alltoall_single( + x_out, + x, + None, + None, + group, + sync_op=False, + ) + ctx.bwf, fn_out = manual_backward(fn, is_first_fwd, *fn_args) + task.wait() + return (x_out,) + fn_out + + @staticmethod + def backward(ctx, dx_out, *fn_out_grads): + """backward""" + if dist.get_world_size(ctx.group) <= 1: + fn_args_grads = ctx.bwf(*fn_out_grads) + return (dx_out,) + fn_args_grads + + dx = paddle.empty_like(dx_out) + dx.stop_gradient = False + with profile("moe-all2all"): + task = stream.alltoall_single( + dx, + dx_out, + None, + None, + ctx.group, + sync_op=False, + ) + fn_args_grads = ctx.bwf(*fn_out_grads) + task.wait() + return (dx,) + fn_args_grads + + +def detach_and_requires_grad_(*args): + """detach_and_requires_grad_""" + ret = [a.detach() if a is not None else None for a in args] + for r, a in zip(ret, args): + if a is not None: + r.stop_gradient = a.stop_gradient + return ret + + +def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]): + """ + Args: + f(callable) + args(*Any) + Returns + bw_f(callable): manual backward fn + out(List[Tensor]): output of f(*args) + """ + tracer = framework._dygraph_tracer() + orig = tracer._has_grad + if not is_first_fwd: + tracer._has_grad = True # turn on grad trace so we can manual backward + + detached_args = detach_and_requires_grad_(*args) + detached_args_clone = [a.clone() if a is not None else None for a in detached_args] + out = f(*detached_args_clone) + for 
a in detached_args: + if a is not None: + a._clear_dataptr() # free mem + if isinstance(out, list): + out = tuple(out) + elif not isinstance(out, tuple): + out = (out,) + + if is_first_fwd: + tracer._has_grad = orig + return None, out + + out_cached = [ + o.clone() for o in out if o is not None and not o.stop_gradient + ] # do not cache stop_gradient output + for o in out_cached: + o._clear_dataptr() # free mem + tracer._has_grad = orig + + def bwd_f(*grad): + nonlocal out_cached, detached_args, f + grad = list(grad) + grad = [g for g in grad if g is not None] + assert len(grad) == len(out_cached), (len(grad), len(out_cached), f) + # out, grad = zip(*[(o, g) for o, g in zip(out, grad) if g is not None]) + paddle.autograd.backward(out_cached, grad) + return tuple([t.grad if t is not None else None for t in detached_args]) + + return bwd_f, out + + +def bpr_preprocess(input, logits, capacity, buffer): + """impletment bpr sorting""" + assert input.ndim == 2, input.shape + idx = paddle.argsort(logits.max(-1), axis=0, descending=True) + input = input[idx] + logits = logits[idx] + buffer["idx"] = idx + return input, logits + + +def bpr_postprocess(output, buffer): + """bpr sorting""" + idx = buffer.pop("idx") + rev_idx = paddle.argsort(idx) + output = output[rev_idx] + return output + + +class MOELayerAuto(MOELayer): + + def __init__( + self, + gate: nn.Layer, + experts: List[nn.Layer], + layer_idx, + shared_experts: Optional[List[nn.Layer]] = None, + group: Group = None, + recompute=False, + enable_logging: bool = False, + k=2, + enable_pbr: bool = False, + all_to_all_dropout=0, + group_experts=False, + config=None, + ipp=0, + ): + nn.Layer.__init__(self) + self.config = config + self.gate = gate + self.layer_idx = layer_idx + self.ipp = ipp + self.recompute = recompute + logger.info(f"using moe recompute={recompute}") + for p in self.gate.parameters(): + p.is_gate = True + if isinstance(experts, nn.LayerList): + self.experts = experts + else: + logger.info(f"using 
fused experts, type={type(experts)}") + self.experts = experts + self.shared_experts = shared_experts + + self.group = group + self.k = k + self.all_to_all_dropout = all_to_all_dropout + self.enable_logging = enable_logging + is_mp_moe = ( + hasattr(fleet.fleet, "_hcg") + and group is fleet.get_hybrid_communicate_group().get_model_parallel_group() + ) + is_dummy_moe = config.moe_world_size == 1 + + for p in experts.parameters(): + p.expert = not (is_mp_moe or is_dummy_moe) # type: ignore + p.no_sync = not (is_mp_moe or is_dummy_moe) + logger.info(f"expert no-sync={p.no_sync}-{p.name}") + if is_mp_moe or is_mp_moe: + p.is_distributed = True + + self.world_size = config.moe_world_size + if self.group in fleet.auto.get_mesh().dim_names: + self.rank = fleet.auto.get_mesh().get_rank_by_dim_and_process_id( + self.group, dist.get_rank() + ) + if self.rank < 0: + self.rank = 0 + else: + self.rank = 0 + + self.num_experts_per_group = len(self.experts) + self.ep_group_num = config.moe_world_size + self.num_local_experts = self.num_experts_per_group // self.ep_group_num + + self.moe_mesh_dim = 0 if config.moe_group == "dp" else 1 + self.dispatch_by_task = ( + hasattr(self.gate, "dispatch_by_task") and self.gate.dispatch_by_task + ) + + if self.dispatch_by_task: + assert 0, "no supported, checkout earylier code" + assert self.num_local_experts == 1 + + if enable_pbr: + logger.info("using BPR") + prepost_process_buffer = {} + self.input_preprocess = partial( + bpr_preprocess, buffer=prepost_process_buffer + ) + self.output_postprocess = partial( + bpr_postprocess, buffer=prepost_process_buffer + ) + else: + self.input_preprocess = self.output_postprocess = None + self.group_experts = group_experts + + def _cal_multimodel_experts_prob( + self, gate_logits, token_type_ids, group_experts, moe_k + ): + + if not self.gate.experts_type_ids.is_dist(): + self.gate.experts_type_ids = dist.shard_tensor( + self.gate.experts_type_ids, + get_mesh(), + [dist.Replicate(), dist.Replicate()], + 
) + return super()._cal_multimodel_experts_prob( + gate_logits, token_type_ids, group_experts, moe_k + ) + + def forward_experts(self, dispatched_input): + """ + call experts sequently + Args: + dispatched_input: Tensor[num_experts, capacity, dim] + Returns: + expert_output: Tensor[num_experts, capacity, dim] + """ + assert isinstance(self.experts, nn.LayerList) + if self.config.moe_group == "mp": + local_input_list = dist.auto_parallel.api.moe_sub_mesh_tensors( + dispatched_input, + get_mesh(self.ipp), + self.moe_mesh_dim, + [dist.Shard(2), dist.Shard(0)], + ) + + assert len(self.experts) % len(local_input_list) == 0, ( + "num of experts must be divided by num of ep_group, " + f"but got {len(self.experts)} and {len(local_input_list)}" + ) + expert_group_outputs = [] + for i_ep_group, local_input in enumerate(local_input_list): + chunks = local_input.unbind(1) + experts = self.experts[ + i_ep_group + * self.num_local_experts : (i_ep_group + 1) + * self.num_local_experts + ] + ep_output = [] + assert len(experts) == len( + chunks + ), f"num of experts must be equal to num of chunks, but got {len(experts)} and {len(chunks)}" + for chunk_id, (chunk, expert) in enumerate(zip(chunks, experts)): + ep_output += [expert(chunk)] + expert_group_outputs += [paddle.stack(ep_output, axis=1)] + return expert_group_outputs + else: + chunks = dispatched_input.unbind(1) + expert_outputs = [] + assert len(chunks) == len(self.experts), (len(chunks), len(self.experts)) + for chunk, expert in zip(chunks, self.experts): + expert_outputs += [expert(chunk)] + expert_output = paddle.stack(expert_outputs, axis=1) # [ecm] + return expert_output + + def gate_and_distpach(self, input, token_type_ids): + """ + calc gate and dispatch inputs (and do logging, optionaly) + Args: + input: Tensor[seq, dim], float + token_type_ids: Tensor[seq], int + Returns: + dispatched_input: Tensor[num_experts, capacity, dim] + combine_weights: [seq, k] + scatter_index: [seq, k] + router_loss: scalar + 
gate_logits: [seq, num_experts] + """ + with profile("moe-gate"): + args = () + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape([-1]) + args = (token_type_ids,) + use_fuse = isinstance(self.gate, (TopKGateFusedAuto)) + if use_fuse: + (gate_logits, capacity, router_loss, local_capacity) = self.gate( + input, *args + ) + else: + ( + capacity, + dispatch_mask, + combine_weights, + scatter_index, + router_loss, + gate_logits, + ) = self.gate(input, *args) + prob = None + if self.input_preprocess is not None: + input, gate_logits = self.input_preprocess(input, gate_logits, capacity) + + with profile("moe-dispatch"): + if use_fuse: + # capacity no use + k = self.k + prob, max_prob = self.fused_gate_logits_process( + gate_logits, token_type_ids + ) + ( + dispatched_input, + combine_weights_unnorm, + scatter_index, + dispatch_mask, + _, + ) = moe_ops_auto.moe_gate_dispatch_auto( + input, prob, k, local_capacity, True + ) + dispatched_input.stop_gradient = False + combine_weights_unnorm.stop_gradient = False + # NOTE: PyLayer do not support some inputs with stop_gradient=True in static mode + # it's a bug that will be fixed in the future + # scatter_index.stop_gradient = True + dispatch_mask.stop_gradient = True + + scatter_index = scatter_index.transpose([1, 0]) # [k,s] ->[s,k] + + if self.group_experts: + if max_prob is not None: + if token_type_ids is not None: + p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1)) + p = paddle.scatter_nd_add( + p, paddle.nonzero(token_type_ids == 0), -1 + max_prob + ) + else: + p = max_prob + combine_weights_unnorm = ( + combine_weights_unnorm.unsqueeze(-1) * p + ).squeeze(-1) + prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape( + [p.shape[0], -1] + ) + combine_weights = combine_weights_unnorm / paddle.clip( + combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12 + ) + combine_weights = combine_weights.cast(dispatched_input.dtype) + else: + dispatched_input = dispatching( + input, + dispatch_mask, + 
scatter_index, + num_experts=self.config.moe_num_experts, + capacity=capacity, + ) + dispatch_mask.stop_gradient = True + scatter_index.stop_gradient = True + return ( + dispatched_input, + combine_weights, + dispatch_mask, + scatter_index, + router_loss, + gate_logits, + prob, + ) + + def combine_expert_output(self, expert_output, combine_weights, scatter_index): + """ + Combine Expert output + Args: + expert_output: Tensor[num_experts, caapcity, dim] + combine_weights: + Returns: + combined_output: Tensor[seqlen, dim] + """ + with profile("moe-combine"): + if self.config.moe_use_all2all and self.config.moe_group == "mp": + expert_output = dist.auto_parallel.moe_utils._dist_reshape( + expert_output, + [-1, expert_output.shape[-1]], + get_flatten_mesh(get_mesh(self.ipp)), + [dist.Shard(0)], + ) + else: + expert_output = expert_output.reshape( + [-1, expert_output.shape[-1]] + ) # [e*c,m] + + if not self.config.moe_use_all2all: + if self.config.moe_group == "mp": + expert_output = dist.reshard( + expert_output, + get_mesh(self.ipp), + [dist.Replicate(), dist.Replicate()], + ) + else: + expert_output = dist.reshard( + expert_output, get_mesh(), [dist.Shard(0), dist.Replicate()] + ) + use_fuse = isinstance(self.gate, (TopKGateFusedAuto)) + combine_fn = combining_fused_auto if use_fuse else combining + combined_output = combine_fn(expert_output, combine_weights, scatter_index) + + if self.output_postprocess is not None: + combined_output = self.output_postprocess(combined_output) + return combined_output + + def forward( + self, + input: Tensor, + token_type_ids=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """ + Args: + input (`Tensor`): The input data with shape ``(s, d)``. + Only one token is supported for now. + token_type_ids (`Tensor`) int64 tensor with shape (s), + if specified, rount tensor according to `token_type_ids`. 
+ Returns: + output (`Tensor`): The final output tensor with shape ``(s, d)`` where ``m`` is the + size of model parameters. + combine_weights (`Tensor`, optional): A tensor with shape ``(s,)``, which represents weights + for each expert in MoE. + router_loss (`Tensor`, optional): A scalar tensor representing the loss of routing function. + """ + if self.shared_experts is not None: + shared_expert_input = dist.reshard( + input, + get_mesh(self.ipp), + [dist.Shard(1), dist.Replicate()], + ) + if input.ndim == 3: + orig_shape = input.shape + input = dist.reshard( + input, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(0)] + ) + if self.config.moe_use_all2all: + input = dist.auto_parallel.moe_utils._dist_reshape( + input, + [-1, input.shape[-1]], + get_flatten_mesh(get_mesh(self.ipp)), + [dist.Shard(0)], + ) + else: + input = input.reshape([-1, input.shape[-1]]) + else: + orig_shape = None + assert ( + len(input.shape) == 2 + ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}" + seqlen, d_model = input.shape + + if token_type_ids is not None: + token_type_ids = token_type_ids.clone()[:, :-1] + if self.config.sequence_parallel: + token_type_ids = token_type_ids.reshape([-1]) + # token_type_ids = ScatterOp.apply(token_type_ids) + token_type_ids.stop_gradient = True + + assert self.gate is not None + if hasattr(self, "rng") and self.rng.random() < self.all_to_all_dropout: + orig_shape_2 = input.shape + output = self.forward_experts(input) + output += self.gate.weight.sum() * 0.0 # hack for grad + output = output.reshape(orig_shape or orig_shape_2) # [e*1,c,m] + return output, None, 0 + + ( + dispatched_input, + combine_weights, + dispatch_mask, + scatter_index, + router_loss, + gate_logits, + gate_prob, + ) = self.gate_and_distpach(input, token_type_ids) + if self.config.moe_use_all2all and self.config.moe_group == "mp": + dispatched_input = _reshard( + dispatched_input, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(1)] + ) + if 
self.config.moe_group == "mp": + # TODO(zhangyichen): 统一 moe_group 是 mp 和其他情况下的代码 + dispatched_input = dist.reshard( + dispatched_input, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(0)] + ) + + if self.shared_experts is not None: + shared_out = self.shared_experts(shared_expert_input) + dispatched_input = dispatched_input.reshape( + [self.config.moe_world_size, self.num_local_experts, -1, d_model] + ) + expert_out = self.forward_experts(dispatched_input) + if self.config.moe_group == "mp": + expert_out = dist.auto_parallel.api.moe_global_mesh_tensor( + expert_out, + get_mesh(self.ipp), + [dist.Shard(2), dist.Shard(0)], + self.moe_mesh_dim, + ) + expert_out = dist.auto_parallel.moe_utils._dist_reshape( + expert_out, + [self.config.moe_world_size * self.num_local_experts, -1, d_model], + get_mesh(self.ipp), + [dist.Shard(1), dist.Shard(0)], + ) + expert_out = dist.reshard( + expert_out, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(1)] + ) + if not in_auto_parallel_align_mode(): + router_loss2 = self.calc_router_loss_and_logging( + router_loss, + combine_weights, + dispatch_mask, + gate_logits, + gate_prob, + token_type_ids, + ) + else: + router_loss2 = router_loss + router_loss2 = dist.shard_tensor( + router_loss2, get_flatten_mesh(get_mesh(self.ipp)), [dist.Replicate()] + ) + combined_output = self.combine_expert_output( + expert_out, combine_weights, scatter_index + ) + + if self.shared_experts is not None: + shared_out = dist.auto_parallel.moe_utils._dist_reshape( + shared_out, + [-1, shared_out.shape[-1]], + get_flatten_mesh(get_mesh(self.ipp)), + [dist.Shard(0)], + ) + combined_output += shared_out + + if orig_shape: + if self.config.moe_use_all2all: + combined_output = dist.auto_parallel.moe_utils._dist_reshape( + combined_output, + orig_shape[:-1] + [combined_output.shape[-1]], + get_mesh(self.ipp), + [dist.Shard(1), dist.Shard(0)], + ) + router_loss2 = _reshard( + router_loss2, + get_mesh(self.ipp), + [dist.Replicate(), dist.Replicate()], + ) + else: + 
combined_output = combined_output.reshape( + orig_shape[:-1] + [combined_output.shape[-1]] + ) + return combined_output, combine_weights, router_loss2, gate_logits diff --git a/examples/pre-training/models/moe/moe_layer_auto_utils.py b/examples/pre-training/models/moe/moe_layer_auto_utils.py new file mode 100644 index 00000000..ee36fad5 --- /dev/null +++ b/examples/pre-training/models/moe/moe_layer_auto_utils.py @@ -0,0 +1,2087 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""_summary_ + +Returns: + _type_: _description_ +""" +from typing import Tuple, List, Optional +import logging +from collections import namedtuple +from functools import partial +import inspect + +import paddle +from paddle import framework +from paddle import nn +from paddle.distributed.communication import stream +import paddle.nn.functional as F + +from paddle.autograd import PyLayer +from paddle.distributed.communication.group import Group +from paddle.distributed.fleet.utils import recompute +from paddle.distributed import fleet +from paddle.distributed import in_auto_parallel_align_mode + +import paddle.distributed as dist +from paddle import Tensor +from paddleformers.utils.tools import get_env_device + +from models.moe.top2_gate_auto_auto import ( + TopKGateFused, + cast_if_needed, +) +from models.sequence_parallel_utils_auto import ScatterOp +from models.utils import ( + global_training_logs_enabled, + manual_backward, +) + +from models.comm_utils import profile + +from models.moe.moe_utils import MOEAllGatherDispatcher + + +from paddle.incubate.nn.functional import ( + moe_combine, +) + +try: + from paddle.incubate.nn.functional import ( + moe_gate_dispatch_and_quant, + ) +except ImportError: + moe_gate_dispatch_and_quant = None + +try: + from src.utils.misc import global_training_logs +except ModuleNotFoundError: + global_training_logs = {} +try: + import moe_router_loss_ops +except ImportError: + moe_router_loss_ops = None + + +try: + from paddle import scatter_add_ +except ImportError: + scatter_add_ = None + +try: + from bincount_ops import int_bincount +except ImportError: + int_bincount = None + +logger = logging.getLogger(__name__) + +if get_env_device() == "xpu": + try: + from paddle_xpu_nn import moe_gate_dispatch as xpu_moe_gate_dispatch + except ImportError: + xpu_moe_gate_dispatch = None + logger.warning("`xpu moe dispatch` not found") +else: + try: + import moe_ops + except ImportError: + moe_ops = None + logger.warning( + "`moe-ops` not 
class GateCombine_ori(PyLayer):
    """Differentiable wrapper around the ``moe_combine`` op.

    ``moe_combine`` is imported in this module as a *function* from
    ``paddle.incubate.nn.functional``, so it must be called directly; the
    backward pass uses ``paddle._C_ops.moe_combine_grad`` exactly like the
    sibling ``GateCombine`` layer in this file.
    """

    @staticmethod
    def forward(ctx, x, combine_weights, scatter_index):
        """
        Input:
            x: [seqlen * k, hidden_size]
            combine_weights: [seqlen, k]
            scatter_index: [seqlen, k]
        Output:
            y: [seqlen, hidden_size]
        """
        # Keep the inputs on the context; backward needs all three.
        ctx.x = x
        ctx.combine_weights = combine_weights
        ctx.scatter_index = scatter_index
        return moe_combine(x, combine_weights, scatter_index)

    @staticmethod
    def backward(ctx, grad_y, *_):
        """
        Input:
            grad_y: [seqlen, hidden_size]
        Output:
            grad_x: [seqlen * k, hidden_size]
            grad_combine_weight: [seqlen, k]
            None for ``scatter_index`` (integer indices carry no gradient).
        """
        grad_x, grad_combine_weight_helper = paddle._C_ops.moe_combine_grad(
            ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y
        )
        # The helper carries an extra trailing axis that is reduced away here.
        grad_combine_weight = grad_combine_weight_helper.sum(-1)
        return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None
def recompute_fwd_gate_up_func(config, layer_idx):
    """Tell whether ``layer_idx`` should recompute the forward gate/up step.

    Reads ``config.fp8_mem_configs["recompute_fwd_gate_up"]``:
    a ``bool`` is returned as-is, a ``list`` is treated as the layer indices
    for which recompute is on, and anything else (or a missing key) means
    ``False``.
    """
    mem_configs = config.fp8_mem_configs
    if "recompute_fwd_gate_up" not in mem_configs:
        return False

    setting = mem_configs["recompute_fwd_gate_up"]
    if isinstance(setting, bool):
        return setting
    if isinstance(setting, list):
        return layer_idx in setting
    return False
def combining(x, combine_weights, scatter_index):
    """Gather the expert rows for every token and mix them by weight.

    ``scatter_index`` is flattened and used to gather rows of ``x``; the
    gathered rows are regrouped as ``[-1, k, dim]`` (``k`` being the last
    axis of ``combine_weights``) and reduced with a batched matmul against
    ``combine_weights`` unsqueezed to ``[..., 1, k]``.
    """
    hidden = x.shape[-1]
    top_k = combine_weights.shape[-1]

    flat_index = scatter_index.reshape([-1])
    gathered = paddle.gather(x, flat_index).reshape([-1, top_k, hidden])  # [seq,k,dim]

    weights = combine_weights.unsqueeze(1)  # [seq,1,k]
    mixed = paddle.matmul(weights, gathered)  # [seq,1,k] @ [seq,k,dim] -> [seq,1,dim]
    return mixed.squeeze(1)
@staticmethod + def forward(ctx, x, combine_weights, scatter_index): + ctx.x = x + ctx.combine_weights = combine_weights + ctx.scatter_index = scatter_index + ret = moe_combine(x, combine_weights, scatter_index) + return ret + + @staticmethod + def backward(ctx, grad_y, *_): + # assert moe_combine is not None + grad_x, grad_combine_weight_helper = paddle._C_ops.moe_combine_grad( + ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y + ) + grad_combine_weight = grad_combine_weight_helper.sum(-1) + return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None + + +class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer): + """Fp8MoeGateDispatchAndQuant""" + + @staticmethod + def forward( + ctx, x, gate_logtis, corr_bias, k, capacity, use_pad, use_pow2_scale=True + ): + """forward""" + ( + out_fp8, + scale, + combine_weights, + scatter_index, + expert_offset, + expert_id, + ) = moe_ops_fp8.moe_gate_dispatch_and_quant( + x, + gate_logtis, + corr_bias=corr_bias, + k=k, + capacity=capacity, + use_pad=use_pad, + use_pow2_scale=use_pow2_scale, + ) + assert out_fp8.shape[0] == scale.shape[0] + + out_fp8.stop_gradient = False + combine_weights.stop_gradient = False + scatter_index.stop_gradient = True + expert_offset.stop_gradient = True + expert_id.stop_gradient = True + scale.stop_gradient = True + + ctx.k = k + ctx.capacity = capacity + ctx.use_pad = use_pad + ctx.combine_weights = combine_weights + ctx.scatter_index = scatter_index + ctx.expert_id = expert_id + ctx.has_corr_bias = corr_bias is not None + + return ( + out_fp8, + combine_weights, + scatter_index, + expert_offset, + expert_id, + { + "scale": scale, + }, + ) + + @staticmethod + def backward(ctx, *grads): + """backward""" + out_grad, combine_weights_grad = grads[0], grads[1] + x_grad, gate_logits_grad = moe_ops.moe_gate_dispatch_bwd( + ctx.combine_weights, + ctx.scatter_index, + ctx.expert_id, + out_grad, + combine_weights_grad, + k=ctx.k, + capacity=ctx.capacity, + use_pad=ctx.use_pad, + ) + 
class AlltoAll(PyLayer):
    """
    All-to-all exchange with a matching backward pass.
    """

    @staticmethod
    def forward(ctx, x, group, sync_op=True):
        """
        Exchange ``x`` across ``group`` with ``alltoall_single``.

        Returns the received tensor when ``sync_op`` is true, otherwise the
        pair ``(received, task)``. When the group has at most one rank the
        input is returned unchanged (a bare tensor, whatever ``sync_op`` is).
        """
        ctx.group = group
        if dist.get_world_size(group) <= 1:
            return x

        received = paddle.empty_like(x)
        received.stop_gradient = False
        task = stream.alltoall_single(
            received, x, None, None, group, sync_op=sync_op, use_calc_stream=sync_op
        )
        if sync_op:
            return received
        return received, task

    @staticmethod
    def backward(ctx, *dx):
        """Send the gradients back through the same all-to-all on the same group."""
        return AlltoAll.apply(*dx, group=ctx.group)
+ + ctx.bw_funcs = {} + ctx.group = group + ctx.num_local_experts = num_local_experts + + assert isinstance(forward_func_dict, nn.LayerList) + all2all_tasks = [] + all2all_ins = paddle.unbind(input, axis=0) + for stage_id in range(1): + stage_input = all2all_ins[stage_id] + x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) + all2all_tasks.append((task, x_out)) + + expert_outputs = [] + for stage_id in range(num_local_experts): + if stage_id + 1 != num_local_experts: + stage_input = all2all_ins[stage_id + 1] + x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) + all2all_tasks.append((task, x_out)) + + task, dispatched_input = all2all_tasks[stage_id] + task.wait() + bwf, (expert_outputs_cur_stage,) = manual_backward( + forward_func_dict[stage_id], is_first_fwd, dispatched_input + ) + ctx.bw_funcs[stage_id] = bwf + expert_outputs.append(expert_outputs_cur_stage) + + expert_output = paddle.stack(expert_outputs, axis=1) + return expert_output + + @staticmethod + def backward(ctx, out_grad): + """backward""" + all2all_tasks = [] + expert_outputs = [] + + out_grad_list = paddle.split( + out_grad, num_or_sections=out_grad.shape[1], axis=1 + ) + for stage_id in range(ctx.num_local_experts): + (grad_cur_stage,) = ctx.bw_funcs[stage_id](out_grad_list[stage_id]) + + x_out, task = AlltoAll.apply(grad_cur_stage, group=ctx.group, sync_op=False) + all2all_tasks.append(task) + expert_outputs.append(x_out) + + for task in all2all_tasks: + task.wait() + + expert_output = paddle.stack(expert_outputs, axis=0) + return expert_output + + +class AlltoAllAsync(PyLayer): + """ + AlltoAll async w/ backward + """ + + @staticmethod + def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): + """ + All-to-all communication in the group. 
def bpr_preprocess(input, logits, capacity, buffer):
    """Reorder tokens by descending maximum gate logit (BPR sorting).

    The permutation is stored in ``buffer["idx"]`` so that the matching
    post-process step can undo it. ``capacity`` is accepted but not used.

    Returns:
        The permuted ``input`` and ``logits``.
    """
    assert input.ndim == 2, input.shape
    order = paddle.argsort(logits.max(-1), axis=0, descending=True)
    buffer["idx"] = order
    return input[order], logits[order]
class FusedNormGateMoe(paddle.nn.Layer):
    """Layer that runs RMS-norm and the gate projection through FusedNormGateFunc."""

    def __init__(self, gate, rms_norm_weight, eps) -> None:
        """Store the norm weight, the gate layer and the norm epsilon."""
        super().__init__()
        self.rms_norm_weight = rms_norm_weight
        self.gate = gate
        self.eps = eps

    def forward(self, x):
        """Return ``(gate_logits, capacity, router_loss, norm_output)`` for ``x``.

        ``capacity`` comes from ``gate.get_capacity(x.shape[0])`` and
        ``router_loss`` is a fresh zero tensor of shape ``[1]`` that takes
        part in autograd.
        """
        gate_weight = self.gate.get_gate_weight(True)
        capacity = self.gate.get_capacity(x.shape[0])

        # Placeholder loss: zero-valued, but kept differentiable.
        router_loss = paddle.zeros([1], dtype="float32")
        router_loss.stop_gradient = False

        fused_out = FusedNormGateFunc.apply(
            x, self.rms_norm_weight, gate_weight, self.eps
        )
        gate_logits, norm_output = fused_out
        return gate_logits, capacity, router_loss, norm_output
Gshard_: https://arxiv.org/pdf/2006.16668.pdf + + Args: + gate (paddle.nn.Layer): + gate network + expert (paddle.nn.LayerList): + expert network, LayerList 长度是 per_device 上的 expert 数。 + group (paddle.ProgressGroup) + recompute: 启用MOE内recomupte + Returns: + output + combine_weight + router-loss + """ + + def __init__( + self, + gate: nn.Layer, + experts: List[nn.Layer], + layer_idx, + shared_experts: Optional[List[nn.Layer]] = None, + group: Group = None, + recompute=False, + enable_logging: bool = False, + k=2, + enable_bpr: bool = False, + all_to_all_dropout=0, + group_experts=False, + moe_statics=None, + ): + + super().__init__() + self.gate = gate + self.layer_idx = layer_idx + self.recompute = recompute + logger.info(f"using moe recompute={recompute}") + for p in self.gate.parameters(): + p.is_gate = True + if isinstance(experts, nn.LayerList): + self.experts = experts + else: + logger.info(f"using fused experts, type={type(experts)}") + self.experts = experts + self.shared_experts = shared_experts + + self.group = group + self.k = k + self.all_to_all_dropout = all_to_all_dropout + self.enable_logging = enable_logging + self.use_correction_bias = moe_statics is not None + self.moe_statics = moe_statics + if self.use_correction_bias: + logger.info( + f"using correction bias, aux-coef:{self.gate.config.moe_aux_loss_lambda}" + ) + assert self.gate.config.moe_use_aux_free + + self.is_mp_moe = ( + hasattr(fleet.fleet, "_hcg") + and group is fleet.get_hybrid_communicate_group().get_model_parallel_group() + ) + self.is_ep_moe = ( + hasattr(fleet.fleet, "_hcg") + and hasattr( + fleet.get_hybrid_communicate_group(), + "get_moe_sharding_parallel_world_size", + ) + and fleet.get_hybrid_communicate_group().get_moe_sharding_parallel_world_size() + > 0 + ) + is_dummy_moe = dist.get_world_size(group) == 1 + + for p in experts.parameters(): + p.expert = not (self.is_mp_moe or is_dummy_moe) # type: ignore + p.no_sync = not (self.is_mp_moe or is_dummy_moe) + 
logger.info(f"expert no-sync={p.no_sync}-{p.name}") + if self.is_mp_moe or self.is_ep_moe: + p.is_distributed = True + + expert_color = None + if self.is_ep_moe: + moe_grad_group = ( + fleet.get_hybrid_communicate_group().get_moe_sharding_parallel_group() + ) + expert_color = {"color": "moe_expert", "group": moe_grad_group} + elif ( + self.config.offline_quant_expert_weight + and self.config.clear_origin_weight_when_offline_quant + ): + expert_color = {"color": "moe_expert"} + + if expert_color is not None: + for p in self.experts.parameters(): + setattr(p, "color", expert_color) + + self.world_size = dist.get_world_size(self.group) + # assert self.world_size > 1, f'moe-group not found, world_size {self.world_size}' + self.rank = dist.get_rank(self.group) + if self.world_size < 1: + self.world_size = 1 + if self.rank < 0: + self.rank = 0 + + self.num_local_experts = len(self.experts) + self.dispatch_by_task = ( + hasattr(self.gate, "dispatch_by_task") and self.gate.dispatch_by_task + ) + + if self.dispatch_by_task: + assert 0, "no supported, checkout earylier code" + assert self.num_local_experts == 1 + + if enable_bpr: + logger.info("using BPR") + prepost_process_buffer = {} + self.input_preprocess = partial( + bpr_preprocess, buffer=prepost_process_buffer + ) + self.output_postprocess = partial( + bpr_postprocess, buffer=prepost_process_buffer + ) + else: + self.input_preprocess = self.output_postprocess = None + self.group_experts = group_experts + self.config = self.gate.config + self.zero = paddle.to_tensor(0, dtype=paddle.float32) + + self._rr_moe_gate_dispatch = None + self._rr_moe_combine = None + self.use_norm_gate_recompute = None + + if self.config.use_recompute and self.config.skip_recompute_ops.get( + "moe_gate_dispatch", False + ): + self._rr_moe_gate_dispatch = None + if self.config.use_recompute and self.config.skip_recompute_ops.get( + "moe_combine", False + ): + self._rr_moe_combine = None + if hasattr(fleet.fleet, "_hcg"): + hcg = 
fleet.get_hybrid_communicate_group() + if ( + hasattr(hcg, "get_moe_sharding_parallel_world_size") + and hcg.get_moe_sharding_parallel_world_size() > 0 + ): + moe_grad_group = hcg.get_moe_sharding_parallel_group() + for p in self.experts.parameters(): + setattr( + p, "color", {"color": "moe_expert", "group": moe_grad_group} + ) + + def add_gate_recompute_func(self, post_norm_weight, post_norm_eps): + """Add FusedNormGateMoe recompute function""" + self.config.use_norm_gate_recompute = True + self.fused_norm_gate = FusedNormGateMoe( + self.gate, post_norm_weight, post_norm_eps + ) + + def forward_experts(self, dispatched_input): + """ + call experts sequently + Args: + dispatched_input: Tensor[num_experts, capacity, dim] + Returns: + expert_output: Tensor[num_experts, capacity, dim] + """ + with profile("fwd-expert"): + dispatched_input = dispatched_input.reshape( + [ + self.world_size, + self.num_local_experts, + -1, + dispatched_input.shape[-1], + ] + ) # [e,1,c,m] + expert_outputs = [] + if isinstance(self.experts, nn.LayerList): + + chunks = dispatched_input.transpose([1, 0, 2, 3]).contiguous().unbind(0) + assert len(chunks) == len(self.experts), ( + len(chunks), + len(self.experts), + ) + for chunk, expert in zip(chunks, self.experts): + expert_outputs += [expert(chunk)] + # logger.info( + # f"moe-fwd-expert: {chunk.shape}" + # f'-> {expert_outputs[-1].shape}: {chunk.astype("float32").norm(axis=-1)}' + # ) + expert_output = paddle.stack(expert_outputs, axis=1) # [ecm] + + else: + dispatched_input = dispatched_input.transpose([1, 0, 2, 3]) + dispatched_input.contiguous() + orig_shape = dispatched_input.shape + chunks = dispatched_input.reshape([orig_shape[0], -1, orig_shape[-1]]) + chunks = self.experts(chunks) + chunks = chunks.reshape(orig_shape[:-1] + [chunks.shape[-1]]).unbind(0) + expert_outputs += chunks + expert_output = paddle.stack(expert_outputs, axis=1) # [ecm] + return expert_output + + def fused_gate_logits_process( + self, gate_logits, 
token_type_ids, offload_helper=None + ): + + k = self.k + experts_type_ids = self.gate.experts_type_ids + use_hard_gate = self.config.moe_use_hard_gate + max_prob = None + + if token_type_ids is not None and use_hard_gate: + if offload_helper is None: + offload_helper = dict() + lm_mask = token_type_ids == 0 + is_lm = lm_mask.any() + mm_mask = token_type_ids == 1 + is_mm = mm_mask.any() + seq_lm = lm_mask.sum() + seq_mm = mm_mask.sum() + lm_mask = lm_mask.unsqueeze(1) & (experts_type_ids == 0).unsqueeze(0) + mm_mask = mm_mask.unsqueeze(1) & (experts_type_ids == 1).unsqueeze(0) + offload_helper["lm_mask"] = [lm_mask, is_lm, seq_lm] + offload_helper["mm_mask"] = [mm_mask, is_mm, seq_mm] + + is_lm = offload_helper["lm_mask"][1] + prob = paddle.zeros_like(gate_logits) + # 处理 lm_prob + if is_lm: + lm_mask = offload_helper["lm_mask"][0] + seq_lm_cpu = offload_helper["lm_mask"][2] + lm_mask_nonzero = lm_mask.nonzero() + lm_partial_gate_logits = gate_logits.gather_nd(lm_mask_nonzero).reshape( + [seq_lm_cpu, -1] + ) + if self.group_experts: + lm_prob = self.gate.act( + lm_partial_gate_logits.reshape( + [lm_partial_gate_logits.shape[0], k, -1] + ) + ) + max_prob = lm_prob.max(-1, keepdim=True) # [s_l, k, 1] + lm_prob /= max_prob + else: + lm_prob = self.gate.act(lm_partial_gate_logits) + prob = paddle.scatter_nd_add(prob, lm_mask_nonzero, lm_prob.flatten()) + is_mm = offload_helper["mm_mask"][1] + if is_mm: + mm_mask = offload_helper["mm_mask"][0] + seq_mm_cpu = offload_helper["mm_mask"][2] + mm_mask_nonzero = paddle.nonzero(mm_mask) + mm_partial_gate_logits = gate_logits.gather_nd(mm_mask_nonzero).reshape( + [seq_mm_cpu, -1] + ) + mm_prob = self.gate.act(mm_partial_gate_logits) + prob = paddle.scatter_nd_add(prob, mm_mask_nonzero, mm_prob.flatten()) + else: + if self.group_experts: + prob = self.gate.act(gate_logits.reshape([gate_logits.shape[0], k, -1])) + max_prob = prob.max(-1, keepdim=True) + prob /= max_prob + prob = prob.reshape([prob.shape[0], -1]) + else: + prob = 
    def gate_distpach_and_quant(self, input, token_type_ids):
        """
        Run the fused gate, then dispatch and quantise tokens in one op.

        Only valid with a ``TopKGateFused`` gate, without ep_comm_overlap and
        without a refined-recompute dispatch function (all asserted below).

        Args:
            input: token activations fed to the gate.
            token_type_ids: optional per-token type ids; flattened here.
        Returns:
            Tuple of (dispatched_input, combine_weights, dispatch_mask,
            scatter_index, router_loss, gate_logits, prob,
            fp8_dispatched_handle).
        """
        assert isinstance(self.gate, (TopKGateFused)), "Only fused gate is supported."
        assert not self.config.use_ep_comm_overlap, "ep_comm_overlap is not supported"
        assert (
            self._rr_moe_gate_dispatch is None
        ), "rr_moe_gate_dispatch is not supported"
        assert moe_ops_fp8 is not None

        args = ()
        if token_type_ids is not None:
            token_type_ids = token_type_ids.reshape([-1])
            args = (token_type_ids,)

        (
            gate_logits,
            capacity,
            router_loss,
        ) = self.gate(input, *args)

        if self.config.moe_multimodal_paired_experts:
            assert token_type_ids is not None
            # Append the token type as one extra feature column.
            input = paddle.concat(
                [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1
            )
        if self.input_preprocess is not None:
            input, gate_logits = self.input_preprocess(input, gate_logits, capacity)

        k = self.k
        prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids)

        with profile("dispatch_op"):
            corr_bias = (
                self.moe_statics.e_score_correction_bias[0].detach()
                if self.use_correction_bias
                else None
            )

            (
                dispatched_input,
                combine_weights_unnorm,
                scatter_index,
                dispatch_mask,
                _,
                fp8_dispatched_handle,
            ) = Fp8MoeGateDispatchAndQuant.apply(
                input, prob, corr_bias, k=k, capacity=capacity, use_pad=True
            )

        # TODO(zhangyuqin): wrap the code below in a helper to improve reuse
        # First-order difference of the zero-padded mask.
        dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0)))
        if self.use_correction_bias:
            if self.gate.config.multimodel_experts:
                for i in range(len(self.moe_statics.expert_usage)):
                    self.moe_statics.expert_usage[i] += dispatch_mask[
                        self.gate.experts_type_mask[i]
                    ].detach()
            else:
                self.moe_statics.expert_usage[0] += dispatch_mask.detach()
        dispatched_input.stop_gradient = False
        combine_weights_unnorm.stop_gradient = False
        scatter_index.stop_gradient = True
        dispatch_mask.stop_gradient = True

        scatter_index = scatter_index.transpose([1, 0])  # [k,s] ->[s,k]
        if self.group_experts:
            if max_prob is not None:
                if token_type_ids is not None:
                    p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1))
                    p = paddle.scatter_nd_add(
                        p, paddle.nonzero(token_type_ids == 0), -1 + max_prob
                    )
                else:
                    p = max_prob
                combine_weights_unnorm = (
                    combine_weights_unnorm.unsqueeze(-1) * p
                ).squeeze(-1)
                # restore gate_prob to its scale before the max-normalisation
                prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape([p.shape[0], -1])
        if self.gate.norm_gate_logits:
            # The clip guards against division by zero.
            combine_weights = combine_weights_unnorm / paddle.clip(
                combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12
            )
        else:
            combine_weights = combine_weights_unnorm
        combine_weights = combine_weights.cast("bfloat16")

        def reshape_for_a2a(tensor):
            # Lay the tensor out as [total experts, capacity, rest].
            return tensor.reshape(
                [
                    self.world_size * self.num_local_experts,
                    capacity,
                    -1,
                ]
            )

        dispatched_input = reshape_for_a2a(dispatched_input)
        fp8_dispatched_handle["scale"] = reshape_for_a2a(fp8_dispatched_handle["scale"])
        dispatch_mask.stop_gradient = True
        scatter_index.stop_gradient = True
        return (
            dispatched_input,
            combine_weights,
            dispatch_mask,
            scatter_index,
            router_loss,
            gate_logits,
            prob,
            fp8_dispatched_handle,
        )
    def gate_and_distpach(self, input, token_type_ids):
        """
        calc gate and dispatch inputs (and do logging, optionally)
        Args:
            input: Tensor[seq, dim], float
            token_type_ids: Tensor[seq], int
        Returns:
            dispatched_input: Tensor[num_experts, capacity, dim]
            combine_weights: [seq, k]
            dispatch_mask: dispatch mask (stop_gradient is set to True)
            scatter_index: [seq, k]
            router_loss: scalar
            gate_logits: [seq, num_experts]
            prob: gate probabilities on the fused path, None otherwise
        """
        seqlen, d_model = input.shape
        args = ()
        if token_type_ids is not None:
            token_type_ids = token_type_ids.reshape([-1])
            args = (token_type_ids,)

        use_fuse = isinstance(self.gate, (TopKGateFused))
        if use_fuse:
            if self.use_norm_gate_recompute:
                (
                    gate_logits,
                    capacity,
                    router_loss,
                    norm_res,
                ) = self.fused_norm_gate(input)
                input = norm_res
            else:
                (
                    gate_logits,
                    capacity,
                    router_loss,
                ) = self.gate(input, *args)
        else:
            # The non-fused gate returns the dispatch plan directly.
            (
                capacity,
                dispatch_mask,
                combine_weights,
                scatter_index,
                router_loss,
                gate_logits,
            ) = self.gate(
                input,
                *args,
                correction_bias=(
                    self.moe_statics.e_score_correction_bias[0]
                    if self.use_correction_bias
                    else None
                ),
            )
            prob = None
        if self.config.moe_multimodal_paired_experts:
            assert token_type_ids is not None
            # Append the token type as one extra feature column.
            input = paddle.concat(
                [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1
            )
        if self.input_preprocess is not None:
            input, gate_logits = self.input_preprocess(input, gate_logits, capacity)
        if use_fuse:
            # capacity no use
            k = self.k
            prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids)
            if get_env_device() == "xpu":
                assert xpu_moe_gate_dispatch is not None
                (
                    dispatched_input,
                    combine_weights_unnorm,
                    scatter_index,
                    dispatch_mask,
                    _,
                ) = xpu_moe_gate_dispatch(input, prob, k, capacity, True)
            else:
                assert moe_ops is not None
                with profile("dispatch_op"):
                    # Only pass a correction bias when the installed op
                    # declares a ``corr_bias`` parameter.
                    if (
                        "corr_bias"
                        in inspect.signature(moe_ops.moe_gate_dispatch).parameters
                    ):
                        if self.use_correction_bias:
                            compat_args = (self.moe_statics.e_score_correction_bias[0],)
                        else:
                            compat_args = (None,)
                    else:
                        assert (
                            not self.use_correction_bias
                        ), "correction bias not supported, rebuild moe-ops"
                        compat_args = ()
                    if not self.config.use_ep_comm_overlap:
                        if self._rr_moe_gate_dispatch is None:
                            (
                                dispatched_input,
                                combine_weights_unnorm,
                                scatter_index,
                                dispatch_mask,
                                _,
                            ) = moe_ops.moe_gate_dispatch(
                                input,
                                prob,
                                *compat_args,
                                k=k,
                                capacity=capacity,
                                use_pad=True,
                            )
                        else:
                            # NOTE(review): compat_args is passed as one tuple
                            # here but unpacked in the sibling calls -- confirm
                            # this matches the recompute wrapper's signature.
                            (
                                dispatched_input,
                                combine_weights_unnorm,
                                scatter_index,
                                dispatch_mask,
                                _,
                            ) = self._rr_moe_gate_dispatch(
                                input,
                                prob,
                                compat_args,
                                k=k,
                                capacity=capacity,
                                use_pad=True,
                            )
                    else:
                        (
                            dispatched_input,
                            combine_weights_unnorm,
                            scatter_index,
                            dispatch_mask,
                            _,
                        ) = moe_ops.moe_gate_dispatch_permute(
                            input,
                            prob,
                            *compat_args,
                            k=k,
                            capacity=capacity,
                            world_size=self.group.nranks,
                        )
            # First-order difference of the zero-padded mask.
            dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0)))
            if self.use_correction_bias and framework._dygraph_tracer()._has_grad:
                if self.gate.config.multimodel_experts:
                    for i in range(len(self.moe_statics.expert_usage)):
                        self.moe_statics.expert_usage[i] += dispatch_mask[
                            self.gate.experts_type_mask[i]
                        ].detach()
                else:
                    self.moe_statics.expert_usage[0] += dispatch_mask.detach()
            dispatched_input.stop_gradient = False
            combine_weights_unnorm.stop_gradient = False
            scatter_index.stop_gradient = True
            dispatch_mask.stop_gradient = True

            scatter_index = scatter_index.transpose([1, 0])  # [k,s] ->[s,k]
            if self.group_experts:
                if max_prob is not None:
                    if token_type_ids is not None:
                        p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1))
                        p = paddle.scatter_nd_add(
                            p, paddle.nonzero(token_type_ids == 0), -1 + max_prob
                        )
                    else:
                        p = max_prob
                    combine_weights_unnorm = (
                        combine_weights_unnorm.unsqueeze(-1) * p
                    ).squeeze(-1)
                    # restore gate_prob to its scale before the max-normalisation
                    prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape(
                        [p.shape[0], -1]
                    )
            if self.gate.norm_gate_logits:
                # The clip guards against division by zero.
                combine_weights = combine_weights_unnorm / paddle.clip(
                    combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12
                )
            else:
                combine_weights = combine_weights_unnorm
            combine_weights = combine_weights.cast(dispatched_input.dtype)
        else:
            dispatched_input = dispatching(
                input,
                dispatch_mask,
                scatter_index,
                num_experts=self.world_size * self.num_local_experts,
                capacity=capacity,
            )
            if self.use_correction_bias and framework._dygraph_tracer()._has_grad:
                # scatter_index // capacity gives the expert id of each slot.
                usage = paddle.bincount(
                    scatter_index.reshape([-1]) // capacity,
                    minlength=self.world_size * self.num_local_experts,
                )
                assert (
                    not self.config.multimodel_experts
                ), "correction bias not supported, use top2-fused gate"
                self.moe_statics.expert_usage[0] += usage.detach()
        if not self.config.use_ep_comm_overlap:
            dispatched_input = dispatched_input.reshape(
                [
                    self.world_size * self.num_local_experts,
                    capacity,
                    (
                        d_model
                        if not self.config.moe_multimodal_paired_experts
                        else d_model + 1
                    ),
                ]
            )  # .clone()
        else:
            assert (
                len(dispatched_input.shape) == 4
                and dispatched_input.shape[1] == self.world_size
                and dispatched_input.shape[0] == self.num_local_experts
            ), (
                f"When using ep_comm_overlap, moe_gate_dispatch_permute is needed. "
                f"Expected dispatched_input to have shape[1] == {self.world_size} "
                f"and shape[0] == {self.num_local_experts}, "
                f"but got shape {dispatched_input.shape}"
            )
            dispatched_input = dispatched_input  # .clone()
        dispatch_mask.stop_gradient = True
        scatter_index.stop_gradient = True
        return (
            dispatched_input,
            combine_weights,
            dispatch_mask,
            scatter_index,
            router_loss,
            gate_logits,
            prob,
        )
+ if not self.config.use_ep_comm_overlap: + dispatched_input = dispatched_input.reshape( + [ + self.world_size * self.num_local_experts, + capacity, + ( + d_model + if not self.config.moe_multimodal_paired_experts + else d_model + 1 + ), + ] + ) # .clone() + else: + assert ( + len(dispatched_input.shape) == 4 + and dispatched_input.shape[1] == self.world_size + and dispatched_input.shape[0] == self.num_local_experts + ), ( + f"When using ep_comm_overlap, moe_gate_dispatch_permute is needed. " + f"Expected dispatched_input to have shape[1] == {self.world_size} " + f"and shape[0] == {self.num_local_experts}, " + f"but got shape {dispatched_input.shape}" + ) + dispatched_input = dispatched_input # .clone() + dispatch_mask.stop_gradient = True + scatter_index.stop_gradient = True + return ( + dispatched_input, + combine_weights, + dispatch_mask, + scatter_index, + router_loss, + gate_logits, + prob, + ) + + def _calc_router_loss( + self, + dispatch_mask, + gate_logits, + gate_prob, + num_experts, + use_group, + layer_idx, + token_type=None, + tokens_type_mask=None, + dispatch_tokens_mask=None, + prefix="", + ): + log = {} + router_loss, l_aux, orthogonal_loss, zloss = 0.0, None, None, None + if self.gate.config.moe_aux_loss_lambda: + l_aux = self.gate._cal_aux_loss( + gate_prob, + dispatch_mask, + num_experts, + use_group, + tokens_type_mask, + dispatch_tokens_mask, + ) + router_loss += self.gate.moe_aux_loss_lambda[token_type or 0] * l_aux + else: + router_loss += ( + self.zero * gate_prob[0, 0] + ) # must use gate prob to avoid zero pointer + if self.gate.config.moe_orthogonal_loss_lambda: + orthogonal_loss = self.gate._cal_orthogonal_loss(token_type, use_group) + router_loss += ( + self.gate.moe_orthogonal_loss_lambda[token_type or 0] * orthogonal_loss + ) + if self.gate.config.moe_z_loss_lambda and not in_auto_parallel_align_mode(): + zloss = self.gate._cal_z_loss(gate_logits, tokens_type_mask) + router_loss += self.gate.moe_z_loss_lambda[token_type or 0] * zloss + 
    def calc_router_loss_and_logging(
        self,
        router_loss,
        combine_weights,
        dispatch_mask,
        gate_logits,
        gate_prob,
        token_type_ids,
        dispatch_token_type_ids=None,
        offload_helper=None,
    ):
        """Add the gate's auxiliary losses to ``router_loss`` and log stats.

        With a ``TopKGateFused`` gate the losses come from
        ``_calc_router_loss`` (split into "lm"/"mm" parts under hard gating).
        With any other gate only expert-usage statistics are logged.

        Args:
            router_loss: loss accumulated so far; updated and returned.
            combine_weights: combine weights, used for usage statistics.
            dispatch_mask: dispatch mask returned by the dispatch step.
            gate_logits: raw gate logits.
            gate_prob: gate probabilities; required on the fused path.
            token_type_ids: optional per-token type ids (0 and 1 are handled).
            dispatch_token_type_ids: optional type ids of dispatched tokens.
            offload_helper: optional dict whose ``"lm_mask"`` / ``"mm_mask"``
                entries carry a presence flag at index 1.
        Returns:
            The updated router loss.
        """

        use_fuse = isinstance(self.gate, (TopKGateFused))
        if use_fuse:
            assert gate_prob is not None
            if token_type_ids is not None and self.gate.config.moe_use_hard_gate:
                # The lm part is skipped when the gate weight has stop_gradient set.
                if not self.gate.weight.stop_gradient:
                    lm_tokens_mask = token_type_ids == 0
                    if offload_helper is not None:
                        is_lm = offload_helper["lm_mask"][1]
                    else:
                        is_lm = lm_tokens_mask.any()
                    if is_lm:
                        dispatch_tokens_mask = (
                            dispatch_token_type_ids == 0
                            if dispatch_token_type_ids is not None
                            else None
                        )
                        router_loss += self._calc_router_loss(
                            (
                                dispatch_mask[self.gate.experts_type_mask[0]]
                                if hasattr(self.gate, "experts_type_mask")
                                else dispatch_mask
                            ),
                            (
                                gate_logits[:, self.gate.experts_type_mask[0]]
                                if hasattr(self.gate, "experts_type_mask")
                                else gate_logits
                            ),
                            (
                                gate_prob[:, self.gate.experts_type_mask[0]]
                                if hasattr(self.gate, "experts_type_mask")
                                else gate_prob
                            ),
                            (
                                self.gate.num_experts_list[0]
                                if hasattr(self.gate, "num_experts_list")
                                else self.gate.num_experts_tensor
                            ),
                            self.group_experts,
                            self.layer_idx,
                            0,
                            lm_tokens_mask,
                            dispatch_tokens_mask,
                            prefix="lm",
                        )
                mm_tokens_mask = token_type_ids == 1
                if offload_helper is not None:
                    is_mm = offload_helper["mm_mask"][1]
                else:
                    is_mm = mm_tokens_mask.any()
                if is_mm:
                    dispatch_tokens_mask = (
                        dispatch_token_type_ids == 1
                        if dispatch_token_type_ids is not None
                        else None
                    )
                    # The mm part always passes use_group=False.
                    router_loss += self._calc_router_loss(
                        dispatch_mask[self.gate.experts_type_mask[1]],
                        gate_logits[:, self.gate.experts_type_mask[1]],
                        gate_prob[:, self.gate.experts_type_mask[1]],
                        self.gate.num_experts_list[1],
                        False,
                        self.layer_idx,
                        1,
                        mm_tokens_mask,
                        dispatch_tokens_mask,
                        prefix="mm",
                    )

            else:
                router_loss += self._calc_router_loss(
                    dispatch_mask,
                    gate_logits,
                    gate_prob,
                    self.gate.num_experts_tensor,
                    self.group_experts,
                    self.layer_idx,
                )

            if self.enable_logging and global_training_logs_enabled():
                seqlen = gate_logits.shape[0]
                num_active = paddle.count_nonzero(combine_weights)
                # Average number of non-zero combine weights per token.
                gate_experts_per_token = num_active.item() / seqlen

                if token_type_ids is not None:
                    token_type_ids = token_type_ids.reshape([-1])
                    combine_weights_type_0 = combine_weights[token_type_ids == 0]
                    if combine_weights_type_0.size:
                        gate_expert_per_token_type_0 = (
                            paddle.count_nonzero(combine_weights_type_0).item()
                            / combine_weights_type_0.shape[0]
                        )
                        global_training_logs.update(
                            experts_per_token_text=gate_expert_per_token_type_0,
                        )

                    combine_weights_type_1 = combine_weights[token_type_ids == 1]
                    if combine_weights_type_1.size:
                        gate_expert_per_token_type_1 = (
                            paddle.count_nonzero(combine_weights_type_1).item()
                            / combine_weights_type_1.shape[0]
                        )
                        global_training_logs.update(
                            experts_per_token_image=gate_expert_per_token_type_1,
                        )

                # Mean entropy of softmax(gate_logits) over tokens.
                ce = (
                    (-F.softmax(gate_logits, -1) * F.log_softmax(gate_logits, -1))
                    .sum(-1)
                    .mean(0)
                )
                _log = {
                    f"gate_prob_ce_layer_{self.layer_idx}": ce.item(),
                    f"experts_per_token_layer_{self.layer_idx}": gate_experts_per_token,
                }
                # Log each value under its per-layer key and under the
                # layer-agnostic key.
                global_training_logs.update(
                    **_log,
                    **{
                        k.replace(f"_layer_{self.layer_idx}", ""): v
                        for k, v in _log.items()
                    },
                )
        else:
            seqlen = dispatch_mask.shape[0]
            # Split the mask along its last axis (two slices are treated
            # as top-1 and top-2 below).
            dispatch_mask = dispatch_mask.unbind(-1)
            top1_gate_experts_per_token = (
                paddle.cast(dispatch_mask[0], dtype="float32").sum() / seqlen
            )
            if (
                self.enable_logging
                and global_training_logs_enabled()
                and len(dispatch_mask) == 2
            ):
                top2_gate_experts_per_token = (
                    paddle.cast(dispatch_mask[1], dtype="float32").sum() / seqlen
                )
                # Fraction of entries set in neither mask slice.
                leakage_experts_per_token = (
                    paddle.cast(
                        (~dispatch_mask[0]) & (~dispatch_mask[1]), dtype="float32"
                    ).sum()
                    / seqlen
                )
                experts_per_token = (
                    top1_gate_experts_per_token + top2_gate_experts_per_token
                )
                global_training_logs.update(
                    experts_per_token=experts_per_token.detach(),
                    top1_experts_per_token=top1_gate_experts_per_token.detach(),
                    top2_experts_per_token=top2_gate_experts_per_token.detach(),
                    leakage_experts_per_token=leakage_experts_per_token.detach(),
                )
            elif (
                self.enable_logging
                and global_training_logs_enabled()
                and len(dispatch_mask) == 1
            ):
                experts_per_token = top1_gate_experts_per_token
                leakage_experts_per_token = (
                    paddle.cast(~dispatch_mask[0], dtype="float32").sum() / seqlen
                )
                global_training_logs.update(
                    experts_per_token=experts_per_token.detach(),
                    top1_experts_per_token=top1_gate_experts_per_token.detach(),
                    leakage_experts_per_token=leakage_experts_per_token.detach(),
                )

        return router_loss
v + for k, v in _log.items() + }, + ) + else: + seqlen = dispatch_mask.shape[0] + dispatch_mask = dispatch_mask.unbind(-1) + top1_gate_experts_per_token = ( + paddle.cast(dispatch_mask[0], dtype="float32").sum() / seqlen + ) + if ( + self.enable_logging + and global_training_logs_enabled() + and len(dispatch_mask) == 2 + ): + top2_gate_experts_per_token = ( + paddle.cast(dispatch_mask[1], dtype="float32").sum() / seqlen + ) + leakage_experts_per_token = ( + paddle.cast( + (~dispatch_mask[0]) & (~dispatch_mask[1]), dtype="float32" + ).sum() + / seqlen + ) + experts_per_token = ( + top1_gate_experts_per_token + top2_gate_experts_per_token + ) + global_training_logs.update( + experts_per_token=experts_per_token.detach(), + top1_experts_per_token=top1_gate_experts_per_token.detach(), + top2_experts_per_token=top2_gate_experts_per_token.detach(), + leakage_experts_per_token=leakage_experts_per_token.detach(), + ) + elif ( + self.enable_logging + and global_training_logs_enabled() + and len(dispatch_mask) == 1 + ): + experts_per_token = top1_gate_experts_per_token + leakage_experts_per_token = ( + paddle.cast(~dispatch_mask[0], dtype="float32").sum() / seqlen + ) + global_training_logs.update( + experts_per_token=experts_per_token.detach(), + top1_experts_per_token=top1_gate_experts_per_token.detach(), + leakage_experts_per_token=leakage_experts_per_token.detach(), + ) + + return router_loss + + def combine_expert_output(self, expert_output, combine_weights, scatter_index): + """ + Combine Expert output + Args: + expert_output: Tensor[num_experts, caapcity, dim] + combine_weights: + Returns: + combined_output: Tensor[seqlen, dim] + """ + expert_output = expert_output.reshape( + [-1, expert_output.shape[-1]] + ) # [e*1,c,m] + use_fuse = isinstance(self.gate, (TopKGateFused)) + combine_fn = combining_fused if use_fuse else combining + combined_output = combine_fn(expert_output, combine_weights, scatter_index) + + if self.output_postprocess is not None: + combined_output = 
    def forward_single_stage(self, dispatched_input, stage_id):
        """Run only the local expert at index ``stage_id`` on ``dispatched_input``."""
        assert isinstance(self.experts, nn.LayerList)
        return self.experts[stage_id](dispatched_input)

    def all2all_expert_overlap(self, x, group):
        """Overlap the all-to-all of stage ``i + 1`` with expert compute of stage ``i``.

        ``x`` is unbound along axis 0 into one input per local expert.
        The results are stacked along axis 1.

        NOTE(review): the ``group`` argument is never read; every all-to-all
        uses ``self.group``.
        """
        all2all_tasks = []
        all2all_ins = paddle.unbind(x, axis=0)
        # Launch the first stage's communication before the main loop.
        for stage_id in range(1):
            stage_input = all2all_ins[stage_id]
            x_out, task = AlltoAll.apply(stage_input, group=self.group, sync_op=False)
            all2all_tasks.append((task, x_out))

        expert_outputs = []
        for stage_id in range(self.num_local_experts):
            if stage_id + 1 != self.num_local_experts:
                # Launch the next stage's all-to-all before computing this one.
                stage_input = all2all_ins[stage_id + 1]
                x_out, task = AlltoAll.apply(
                    stage_input, group=self.group, sync_op=False
                )
                all2all_tasks.append((task, x_out))

            task, dispatched_input = all2all_tasks[stage_id]
            task.wait()
            expert_outputs_cur_stage = (
                recompute(self.forward_single_stage, dispatched_input, stage_id)
                if self.recompute and self.training
                else self.forward_single_stage(dispatched_input, stage_id)
            )
            expert_outputs.append(expert_outputs_cur_stage)

        expert_output = paddle.stack(expert_outputs, axis=1)
        return expert_output

    def forward(
        self,
        input: Tensor,
        token_type_ids=None,
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """
        Args:
            input (`Tensor`): The input data with shape ``(s, d)``.
                A 3-D input is flattened to 2-D and restored on return.
            token_type_ids (`Tensor`) int64 tensor with shape (s),
                if specified, route tokens according to `token_type_ids`.
        Returns:
            output (`Tensor`): The final output tensor with shape ``(s, d)``.
            combine_weights (`Tensor`, optional): The combine weights produced
                by the gate/dispatch step.
            router_loss (`Tensor`, optional): A scalar tensor representing the loss of routing function.
            gate_logits (`Tensor`): The raw gate logits (normal path only).
        """
        if input.ndim == 3:
            orig_shape = input.shape
            input = input.reshape([-1, input.shape[-1]])
        else:
            orig_shape = None
        assert (
            len(input.shape) == 2
        ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}"
        hidden_size = input.shape[1]
        if token_type_ids is not None:
            # Drops the last column of token_type_ids.
            token_type_ids = token_type_ids.clone()[:, :-1]
            if self.config.sequence_parallel:
                token_type_ids = token_type_ids.reshape([-1])
                token_type_ids = ScatterOp.apply(token_type_ids)
        token_type_ids.stop_gradient = True if False else token_type_ids.stop_gradient if token_type_ids is not None else None
class MOEInferLayer(nn.Layer):
    """MoE layer variant that gathers expert outputs with ``all_gather``.

    Every rank dispatches all tokens, runs only its own slice of experts,
    and the per-rank results are gathered back before combining.
    """

    def __init__(
        self,
        gate: nn.Layer,
        experts: List[nn.Layer],
        group: Group = None,
        recompute=False,
    ) -> None:
        """
        Args:
            gate: gating layer; its parameters are tagged ``is_gate``.
            experts: an ``nn.LayerList`` of local experts, or a single layer
                that is wrapped into a one-element ``nn.LayerList``.
            group: communication group used for rank/world-size lookup and
                for the all_gather in ``forward``.
            recompute: run the experts under ``recompute`` while training.
        """
        super().__init__()
        self.gate = gate
        self.recompute = recompute
        logger.info(f"using infer moe recompute={recompute}")
        for p in self.gate.parameters():
            p.is_gate = True
        # isinstance (not ``type(...) ==``) so that LayerList subclasses are
        # kept as-is instead of being wrapped as one single expert.
        if isinstance(experts, nn.LayerList):
            self.experts = experts
        else:
            self.experts = nn.LayerList([experts])
        self.group = group
        for p in self.experts.parameters():
            p.expert = True  # type: ignore
            p.no_sync = True

        self.world_size = dist.get_world_size(self.group)
        self.rank = dist.get_rank(self.group)

        # Clamp values below the valid minimum to a single-rank setup.
        if self.world_size < 1:
            self.world_size = 1
        if self.rank < 0:
            self.rank = 0
        self.num_local_experts = len(self.experts)

    def forward(
        self,
        input: Tensor,
        token_type_ids=None,
    ) -> Tensor:
        """Route tokens through the local experts and combine the results.

        Args:
            input (Tensor): 2-D ``(seq, dim)`` or 3-D activations; a 3-D
                input is flattened and its shape restored on return.
            token_type_ids: accepted but not used by this layer.

        Returns:
            Tuple ``(combined_output, combine_weights, router_loss, None)``.
        """
        if input.ndim == 3:
            orig_shape = input.shape
            input = input.reshape([-1, input.shape[-1]])
        else:
            orig_shape = None
        assert (
            len(input.shape) == 2
        ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}"

        # Implement Algorithm 2 from GShard paper.
        seqlen, d_model = input.shape

        def fwdfn(dispatched_input):
            # One chunk per local expert along axis 1.
            chunks = dispatched_input.unbind(1)
            expert_outputs = [
                expert(chunk) for chunk, expert in zip(chunks, self.experts)
            ]
            return paddle.stack(expert_outputs, axis=1)  # [ecm]

        assert self.gate is not None
        (
            capacity,
            dispatch_mask,
            combine_weights,
            scatter_index,
            router_loss,
        ) = self.gate(input)

        dispatched_input = dispatching(
            input,
            dispatch_mask,
            scatter_index,
            num_experts=self.world_size * self.num_local_experts,
            capacity=capacity,
        )
        dispatched_input = dispatched_input.reshape(
            [self.world_size * self.num_local_experts, capacity, d_model]
        )
        dispatched_input = dispatched_input.reshape(
            [self.world_size, self.num_local_experts, -1, d_model]
        )  # [e,1,c,m]
        # Keep only the slice that belongs to this rank's experts.
        dispatched_input = dispatched_input[
            self.rank : (self.rank + 1)
        ]  # [1, local_experts, c, m]

        expert_output = (
            recompute(fwdfn, dispatched_input)
            if self.recompute and self.training
            else fwdfn(dispatched_input)
        )
        if self.world_size > 1:
            # Collect every rank's expert outputs.
            tmp = []
            dist.all_gather(tmp, expert_output, group=self.group)
            expert_output = paddle.concat(tmp, axis=0)

        expert_output = expert_output.reshape(
            [self.world_size * self.num_local_experts * capacity, d_model]
        )  # [e*1,c,m]
        combined_output = combining(expert_output, combine_weights, scatter_index)

        if orig_shape:
            combined_output = combined_output.reshape(orig_shape)

        # NOTE(review): dispatch_mask is indexed on its first axis here,
        # whereas MOELayer unbinds the last axis before reading [0] / [1] --
        # confirm the layout returned by this gate.
        top1_gate_experts_per_token = (
            paddle.cast(dispatch_mask[0], dtype="float32").sum() / seqlen
        )
        top2_gate_experts_per_token = (
            paddle.cast(dispatch_mask[1], dtype="float32").sum() / seqlen
        )
        leakage_experts_per_token = (
            paddle.cast(
                (~dispatch_mask[0]) & (~dispatch_mask[1]), dtype="float32"
            ).sum()
            / seqlen
        )

        experts_per_token = top1_gate_experts_per_token + top2_gate_experts_per_token
        global_training_logs.update(
            experts_per_token=experts_per_token.detach(),
            top1_experts_per_token=top1_gate_experts_per_token.detach(),
            top2_experts_per_token=top2_gate_experts_per_token.detach(),
            leakage_experts_per_token=leakage_experts_per_token.detach(),
        )
        return combined_output, combine_weights, router_loss, None
class MOELayerWithAllGatherDispatcher(MOELayer):
    """
    MOELayer with allgather dispatcher.

    Requires the "xpu" device, a ``TopKGateFused`` gate and shared experts
    (all asserted in ``__init__``).
    """

    def __init__(
        self,
        gate: nn.Layer,
        experts: List[nn.Layer],
        layer_idx,
        shared_experts: Optional[List[nn.Layer]] = None,
        group: Group = None,
        recompute=False,
        enable_logging: bool = False,
        k=2,
        enable_bpr: bool = False,
        all_to_all_dropout=0,
        group_experts=False,
    ):
        """Forward every argument to ``MOELayer`` and record local expert ids."""
        super(MOELayerWithAllGatherDispatcher, self).__init__(
            gate=gate,
            experts=experts,
            layer_idx=layer_idx,
            shared_experts=shared_experts,
            group=group,
            recompute=recompute,
            enable_logging=enable_logging,
            k=k,
            enable_bpr=enable_bpr,
            all_to_all_dropout=all_to_all_dropout,
            group_experts=group_experts,
        )
        logger.info("Using MOELayerWithAllGatherDispatcher")
        assert get_env_device() == "xpu"
        assert isinstance(self.gate, TopKGateFused)
        assert self.shared_experts is not None
        # Global ids of the experts hosted on this rank.
        local_expert_indices_offset = self.rank * self.num_local_experts
        self.expert_indices = [
            local_expert_indices_offset + i for i in range(self.num_local_experts)
        ]

    def gate_and_distpach(self, input, token_type_ids):
        """
        Run the gate and dispatch tokens with ``MOEAllGatherDispatcher``.

        ``token_type_ids`` is accepted for signature compatibility but is not
        passed to the gate.

        Returns:
            Tuple of (dispatched_input, combine_weights, gather_scatter_mask,
            dispatch_mask, scatter_index, router_loss, gate_logits,
            global_hidden_states, tokens_per_expert).
        """
        args = ()

        gate_logits, capacity, router_loss = self.gate(input, *args)

        if self.input_preprocess is not None:
            input, gate_logits = self.input_preprocess(input, gate_logits, capacity)

        moe_allgather_dispatcher_return = MOEAllGatherDispatcher.token_dispatcher(
            input,
            gate_logits,
            self.k,
            self.expert_indices,
            self.num_local_experts * self.world_size,
            self.num_local_experts,
        )
        global_hidden_states = moe_allgather_dispatcher_return.global_hidden_states
        dispatched_input = moe_allgather_dispatcher_return.dispatched_input
        combine_weights = moe_allgather_dispatcher_return.combine_weights
        scatter_index = moe_allgather_dispatcher_return.scatter_index
        gather_scatter_mask = moe_allgather_dispatcher_return.gather_scatter_mask
        dispatch_mask = moe_allgather_dispatcher_return.dispatch_mask
        tokens_per_expert = moe_allgather_dispatcher_return.tokens_per_expert

        dispatched_input.stop_gradient = False
        combine_weights.stop_gradient = False
        scatter_index.stop_gradient = True
        gather_scatter_mask.stop_gradient = True
        dispatch_mask.stop_gradient = True

        return (
            dispatched_input,
            combine_weights,
            gather_scatter_mask,
            dispatch_mask,
            scatter_index,
            router_loss,
            gate_logits,
            global_hidden_states,
            tokens_per_expert,
        )

    def forward_experts(
        self, dispatched_input, global_hidden_states, tokens_per_expert
    ):
        """
        Call the routed experts that received tokens, plus the shared experts.

        Args:
            dispatched_input: tokens for the local experts, concatenated
                along axis 0 in expert order.
            global_hidden_states: input for the shared experts.
            tokens_per_expert: token count per local expert.
        Returns:
            ``(hidden_states, shared_expert_out)``.
        """
        # Convert to a Python list once; the original converted it twice.
        token_counts = tokens_per_expert.tolist()
        # paddle.split is given only the non-zero sections, so experts
        # without tokens are skipped entirely.
        active = [(index, count) for index, count in enumerate(token_counts) if count != 0]
        chunks_per_expert = paddle.split(
            dispatched_input, [count for _, count in active], axis=0
        )
        assert len(chunks_per_expert) <= len(self.experts)
        moe_output = [
            self.experts[index](chunk)
            for (index, _), chunk in zip(active, chunks_per_expert)
        ]
        hidden_states = paddle.concat(moe_output, axis=0)
        shared_expert_out = self.shared_experts(global_hidden_states)
        return hidden_states, shared_expert_out

    def forward(self, input, token_type_ids):
        """
        Dispatch, run experts, combine, and compute the router loss.

        Args:
            input: 2-D ``(seq, dim)`` activations.
            token_type_ids: optional per-token type ids.
        Returns:
            ``(combined_output, combine_weights, router_loss2, gate_logits)``.
        """
        assert (
            len(input.shape) == 2
        ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}"
        orig_shape = input.shape
        global_shape = [orig_shape[0] * self.world_size, orig_shape[1]]
        if token_type_ids is not None:
            token_type_ids.stop_gradient = True
        assert self.gate is not None

        (
            dispatched_input,
            combine_weights,
            gather_scatter_mask,
            dispatch_mask,
            scatter_index,
            router_loss,
            gate_logits,
            global_hidden_states,
            tokens_per_expert,
        ) = self.gate_and_distpach(input, token_type_ids)

        expert_out, shared_out = (
            recompute(
                self.forward_experts,
                dispatched_input,
                global_hidden_states,
                tokens_per_expert,
            )
            if self.recompute and self.training
            else self.forward_experts(
                dispatched_input, global_hidden_states, tokens_per_expert
            )
        )
        combined_output = MOEAllGatherDispatcher.token_combine(
            expert_out,
            shared_out,
            combine_weights,
            scatter_index,
            gather_scatter_mask,
            global_shape,
        )
        if self.shared_experts.down_proj.bias is not None:
            combined_output = combined_output + self.shared_experts.down_proj.bias

        # The inherited calc_router_loss_and_logging takes gate_prob before
        # token_type_ids and asserts it is not None for a fused gate; the
        # previous five-argument call raised TypeError.
        # NOTE(review): gate_prob is derived the way MOELayer's fused path
        # does; confirm this is the intended probability source here.
        gate_prob, _ = self.fused_gate_logits_process(gate_logits, token_type_ids)
        router_loss2 = self.calc_router_loss_and_logging(
            router_loss,
            combine_weights,
            dispatch_mask,
            gate_logits,
            gate_prob,
            token_type_ids,
        )

        return combined_output, combine_weights, router_loss2, gate_logits
+ +""" moe utils for allgather dispatcher """ +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet +import paddle.nn.functional as F +from paddle import nn +from paddle.autograd import PyLayer + +from models.sequence_parallel_utils import ( + AllGatherOp, + ReduceScatterOp, +) + + +class MOEGather(PyLayer): + """ + MOE Gather + """ + + @staticmethod + def forward(ctx, input_, map_): + """ + MOE Gather forward + """ + ctx.input_shape = input_.shape + ctx.map = map_ + return paddle.take_along_axis(input_, map_, 0) + + @staticmethod + def backward(ctx, grad_output): + """ + MOE Gather backward + """ + input_shape = ctx.input_shape + map_ = ctx.map + + output = paddle.zeros(input_shape, dtype=grad_output.dtype) + return paddle.put_along_axis(output, map_, grad_output, 0), None + + +class MOEScatter(PyLayer): + """ + MOE Scatter + """ + + @staticmethod + def forward(ctx, input_, map_, output_size=None): + """ + MOE Scatter forward + """ + ctx.map = map_ + + if output_size is not None: + output = paddle.zeros(output_size, dtype=input_.dtype) + else: + output = paddle.zeros_like(input_) + + return paddle.put_along_axis(output, map_, input_, 0) + + @staticmethod + def backward(ctx, grad_output): + """ + MOE Scatter backward + """ + map_ = ctx.map + return paddle.take_along_axis(grad_output, map_, 0), None + + +class AllgatherDispatcherReturn(object): + """ + MOE allgather dispatcher return value + """ + + def __init__( + self, + global_hidden_states, + dispatched_input, + combine_weights, + scatter_index, + gather_scatter_mask, + dispatch_mask, + tokens_per_expert, + ): + self.global_hidden_states = global_hidden_states + self.dispatched_input = dispatched_input + self.combine_weights = combine_weights + self.scatter_index = scatter_index + self.gather_scatter_mask = gather_scatter_mask + self.dispatch_mask = dispatch_mask + self.tokens_per_expert = tokens_per_expert + + +class MOEAllGatherDispatcher(nn.Layer): + """ + MOE with allgather 
dispatcher. + Contains two static methods. + MOEAllGatherDispatcher.token_dispatcher + MOEAllGatherDispatcher.token_combine + """ + + @staticmethod + def token_dispatcher( + hidden_states, + local_gate_logits, + top_k, + local_expert_indices, + num_moe_experts, + num_local_experts, + ): + """ + MOE token dispatcher with allgather + """ + seq_len = local_gate_logits.shape[0] + num_experts = local_gate_logits.shape[-1] + prob = F.softmax(local_gate_logits.reshape([seq_len, top_k, -1]), axis=-1) + max_prob = prob.max(-1, keepdim=True) + prob /= max_prob + prob = prob.reshape([-1, num_experts]) + + probs, scatter_index = paddle.topk(prob, top_k, axis=-1) + dispatch_mask = paddle.cumsum( + paddle.histogram(scatter_index.flatten(), bins=num_experts) + ) + + # dispatch + with paddle.no_grad(): + global_indices = AllGatherOp.apply(scatter_index) + global_local_mask = (global_indices >= local_expert_indices[0]) & ( + global_indices <= local_expert_indices[-1] + ) + local_indices = global_indices.masked_select(global_local_mask) + + global_hidden_states = AllGatherOp.apply(hidden_states) + global_probs = AllGatherOp.apply(probs) + + # get local hidden states + combine_weights = global_probs.masked_select(global_local_mask).cast( + dtype=hidden_states.dtype + ) + gather_scatter_mask = global_local_mask.nonzero()[:, 0] + gather_scatter_mask = paddle.reshape(gather_scatter_mask, shape=[-1, 1]) + gather_scatter_mask = paddle.expand( + gather_scatter_mask, shape=[-1, hidden_states.shape[-1]] + ) + local_hidden_states = MOEGather.apply(global_hidden_states, gather_scatter_mask) + + with paddle.no_grad(): + # The indices of local_indices that give its sorted order along dim 0.
+ scatter_index = paddle.argsort(local_indices, axis=0) + tokens_per_expert = paddle.bincount( + paddle.reshape(local_indices, [-1]), minlength=num_moe_experts + ) + if num_local_experts < num_moe_experts: + start = local_expert_indices[0] + end = local_expert_indices[-1] + 1 + tokens_per_expert = tokens_per_expert[start:end] + + scatter_index = paddle.reshape(scatter_index, shape=[-1, 1]) + scatter_index = paddle.expand( + scatter_index, shape=[-1, hidden_states.shape[-1]] + ) + + dispatched_input = MOEGather.apply(local_hidden_states, scatter_index) + + return AllgatherDispatcherReturn( + global_hidden_states, + dispatched_input, + combine_weights, + scatter_index, + gather_scatter_mask, + dispatch_mask, + tokens_per_expert, + ) + + @staticmethod + def token_combine( + expert_out, + shared_out, + combine_weights, + scatter_index, + gather_scatter_mask, + global_shape, + ): + """ + MOE token combine with reduce scatter + """ + expert_out = MOEScatter.apply(expert_out, scatter_index) + expert_out = expert_out * paddle.reshape(combine_weights, shape=[-1, 1]) + expert_out = MOEScatter.apply(expert_out, gather_scatter_mask, global_shape) + combine_out = expert_out + shared_out + combine_out = ReduceScatterOp.apply(combine_out) + return combine_out + + +def get_flatten_mesh(mesh): + + return dist.ProcessMesh(mesh.process_ids) + + +def get_mesh(pp_idx=0): + + mesh = fleet.auto.get_mesh() + if "pp" in mesh.dim_names: + mesh = mesh.get_mesh_with_dim("pp", pp_idx) + return mesh + + +def _reshard(tensor, mesh, placements): + + dst_tensor = dist.auto_parallel.moe_utils._dist_reshape( + tensor, tensor.shape, mesh, placements + ) + return dst_tensor diff --git a/examples/pre-training/models/moe/moe_utils_auto.py b/examples/pre-training/models/moe/moe_utils_auto.py new file mode 100644 index 00000000..546d1c8e --- /dev/null +++ b/examples/pre-training/models/moe/moe_utils_auto.py @@ -0,0 +1,229 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" moe utils for allgather dispatcher """ +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet +import paddle.nn.functional as F +from paddle import nn +from paddle.autograd import PyLayer + +from models.sequence_parallel_utils_auto import ( + AllGatherOp, + ReduceScatterOp, +) + + +class MOEGather(PyLayer): + """ + MOE Gather + """ + + @staticmethod + def forward(ctx, input_, map_): + """ + MOE Gather forward + """ + ctx.input_shape = input_.shape + ctx.map = map_ + return paddle.take_along_axis(input_, map_, 0) + + @staticmethod + def backward(ctx, grad_output): + """ + MOE Gather backward + """ + input_shape = ctx.input_shape + map_ = ctx.map + + output = paddle.zeros(input_shape, dtype=grad_output.dtype) + return paddle.put_along_axis(output, map_, grad_output, 0), None + + +class MOEScatter(PyLayer): + """ + MOE Scatter + """ + + @staticmethod + def forward(ctx, input_, map_, output_size=None): + """ + MOE Scatter forward + """ + ctx.map = map_ + + if output_size is not None: + output = paddle.zeros(output_size, dtype=input_.dtype) + else: + output = paddle.zeros_like(input_) + + return paddle.put_along_axis(output, map_, input_, 0) + + @staticmethod + def backward(ctx, grad_output): + """ + MOE Scatter backward + """ + map_ = ctx.map + return paddle.take_along_axis(grad_output, map_, 0), None + + +class AllgatherDispatcherReturn(object): + """ + MOE 
allgather dispatcher return value + """ + + def __init__( + self, + global_hidden_states, + dispatched_input, + combine_weights, + scatter_index, + gather_scatter_mask, + dispatch_mask, + tokens_per_expert, + ): + self.global_hidden_states = global_hidden_states + self.dispatched_input = dispatched_input + self.combine_weights = combine_weights + self.scatter_index = scatter_index + self.gather_scatter_mask = gather_scatter_mask + self.dispatch_mask = dispatch_mask + self.tokens_per_expert = tokens_per_expert + + +class MOEAllGatherDispatcher(nn.Layer): + """ + MOE with allgather dispatcher. + Contains two static methods. + MOEAllGatherDispatcher.token_dispatcher + MOEAllGatherDispatcher.token_combine + """ + + @staticmethod + def token_dispatcher( + hidden_states, + local_gate_logits, + top_k, + local_expert_indices, + num_moe_experts, + num_local_experts, + ): + """ + MOE token dispatcher with allgather + """ + seq_len = local_gate_logits.shape[0] + num_experts = local_gate_logits.shape[-1] + prob = F.softmax(local_gate_logits.reshape([seq_len, top_k, -1]), axis=-1) + max_prob = prob.max(-1, keepdim=True) + prob /= max_prob + prob = prob.reshape([-1, num_experts]) + + probs, scatter_index = paddle.topk(prob, top_k, axis=-1) + dispatch_mask = paddle.cumsum( + paddle.histogram(scatter_index.flatten(), bins=num_experts) + ) + + # dispatch + with paddle.no_grad(): + global_indices = AllGatherOp.apply(scatter_index) + global_local_mask = (global_indices >= local_expert_indices[0]) & ( + global_indices <= local_expert_indices[-1] + ) + local_indices = global_indices.masked_select(global_local_mask) + + global_hidden_states = AllGatherOp.apply(hidden_states) + global_probs = AllGatherOp.apply(probs) + + # get local hidden states + combine_weights = global_probs.masked_select(global_local_mask).cast( + dtype=hidden_states.dtype + ) + gather_scatter_mask = global_local_mask.nonzero()[:, 0] + gather_scatter_mask = paddle.reshape(gather_scatter_mask, shape=[-1, 1]) +
gather_scatter_mask = paddle.expand( + gather_scatter_mask, shape=[-1, hidden_states.shape[-1]] + ) + local_hidden_states = MOEGather.apply(global_hidden_states, gather_scatter_mask) + + with paddle.no_grad(): + # The indices of local_indices that give its sorted order along dim 0. + scatter_index = paddle.argsort(local_indices, axis=0) + tokens_per_expert = paddle.bincount( + paddle.reshape(local_indices, [-1]), minlength=num_moe_experts + ) + if num_local_experts < num_moe_experts: + start = local_expert_indices[0] + end = local_expert_indices[-1] + 1 + tokens_per_expert = tokens_per_expert[start:end] + + scatter_index = paddle.reshape(scatter_index, shape=[-1, 1]) + scatter_index = paddle.expand( + scatter_index, shape=[-1, hidden_states.shape[-1]] + ) + + dispatched_input = MOEGather.apply(local_hidden_states, scatter_index) + + return AllgatherDispatcherReturn( + global_hidden_states, + dispatched_input, + combine_weights, + scatter_index, + gather_scatter_mask, + dispatch_mask, + tokens_per_expert, + ) + + @staticmethod + def token_combine( + expert_out, + shared_out, + combine_weights, + scatter_index, + gather_scatter_mask, + global_shape, + ): + """ + MOE token combine with reduce scatter + """ + expert_out = MOEScatter.apply(expert_out, scatter_index) + expert_out = expert_out * paddle.reshape(combine_weights, shape=[-1, 1]) + expert_out = MOEScatter.apply(expert_out, gather_scatter_mask, global_shape) + combine_out = expert_out + shared_out + combine_out = ReduceScatterOp.apply(combine_out) + return combine_out + + +def get_flatten_mesh(mesh): + + return dist.ProcessMesh(mesh.process_ids) + + +def get_mesh(pp_idx=0): + + mesh = fleet.auto.get_mesh() + if "pp" in mesh.dim_names: + mesh = mesh.get_mesh_with_dim("pp", pp_idx) + return mesh + + +def _reshard(tensor, mesh, placements): + + dst_tensor = dist.auto_parallel.moe_utils._dist_reshape( + tensor, tensor.shape, mesh, placements + ) + return dst_tensor diff --git 
a/examples/pre-training/models/moe/top2_gate_auto.py b/examples/pre-training/models/moe/top2_gate_auto.py new file mode 100644 index 00000000..79c2a3aa --- /dev/null +++ b/examples/pre-training/models/moe/top2_gate_auto.py @@ -0,0 +1,77 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +top2gate +""" + + +from typing import Tuple +import logging +from paddle import Tensor +import paddle.distributed as dist + +# import paddle.nn.functional as F + +logger = logging.getLogger(__name__) + +from models.moe.top2_gate_auto_auto import TopKGateFused +from models.moe.moe_utils_auto import get_mesh, get_flatten_mesh + + +class TopKGateFusedAuto(TopKGateFused): + """doc""" + + def __init__(self, config, layer_idx: int, group, gate_weight=None, ipp=0) -> None: + super().__init__(config, layer_idx, group, gate_weight) + self.ipp = ipp + self.weight = dist.shard_tensor( + self.weight, get_flatten_mesh(get_mesh(self.ipp)), [dist.Replicate()] + ) + + def forward( + self, + input: Tensor, + token_type_ids=None, + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: # type: ignore + """ + Args: + input: paddle.Tensor, hidden-states of layer + Returns: + paddle.Tensor [Seq, Expert, Capacity]: float32, combine weights + paddle.Tensor [Seq, Expert, Capacity]: bool, dispatch mask + Tuple[paddle.Tensor]: `GateOutput` + """ + num_experts = ( + sum(self.num_experts) + if self.config.multimodel_experts +
else self.num_experts + ) + if self.training: + cap = self.cap[0] + elif input.shape[0] < num_experts: # seqlen < num_expert + cap = self.cap[2] + else: + cap = self.cap[1] + num_tokens = input.shape[0] + # capacity = 2S/E + global_capacity = int(cap * num_tokens // num_experts) + local_num_tokens = input._local_shape[0] + local_capacity = int(cap * local_num_tokens // num_experts) + + logits, _, router_loss = super().forward(input, token_type_ids) + + return logits, global_capacity, router_loss, local_capacity diff --git a/examples/pre-training/models/moe/top2_gate_auto_auto.py b/examples/pre-training/models/moe/top2_gate_auto_auto.py new file mode 100644 index 00000000..6460c157 --- /dev/null +++ b/examples/pre-training/models/moe/top2_gate_auto_auto.py @@ -0,0 +1,1135 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +top2gate +""" + + +from typing import Tuple +from functools import partial +import logging +import numpy as np +import paddle +from paddle import Tensor +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import nn +from paddle.utils import unique_name +from paddle.nn.clip import _squared_l2_norm +from paddle.distributed import fleet +from paddleformers.utils.tools import get_env_device +from models.utils import global_training_logs_enabled + +try: + from src.utils.misc import global_training_logs +except ModuleNotFoundError: + global_training_logs = {} +try: + import moe_router_loss_ops +except ImportError: + moe_router_loss_ops = None + +try: + from custom_setup_ops import matmul_bwd +except ImportError: + matmul_bwd = None + +try: + from bincount_ops import int_bincount +except ImportError: + int_bincount = None + +logger = logging.getLogger(__name__) + + +class CalOrthogonalLossOptEachWeightFunctor(paddle.autograd.PyLayer): + """CalOrthogonalLossOptEachWeightFunctor""" + + @staticmethod + def forward(ctx, gate_weight, moe_k, use_group, eps=1e-12): + """forward""" + if gate_weight.dtype != paddle.float32: + gate_weight = gate_weight.astype(paddle.float32) + ( + orthogonal_loss, + wnorm, + weight_scale, + normed_weight, + weight_matmul, + ) = moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight( + gate_weight, moe_k, use_group, eps + ) + ctx.save_for_backward( + gate_weight, wnorm, weight_scale, normed_weight, weight_matmul + ) + ctx.moe_k = moe_k + ctx.use_group = use_group + ctx.eps = eps + return orthogonal_loss + + @staticmethod + def backward(ctx, out_grad): + """backward""" + gate_weight, wnorm, weight_scale, normed_weight, weight_matmul = ( + ctx.saved_tensor() + ) + if gate_weight.stop_gradient: + return None + moe_k = ctx.moe_k + use_group = ctx.use_group + eps = ctx.eps + return moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight_grad( + out_grad, + wnorm, + weight_scale, + normed_weight, + weight_matmul, + 
moe_k, + use_group, + eps, + ) + + +class CalZLossFunctor(paddle.autograd.PyLayer): + """CalZLossFunctor""" + + @staticmethod + def forward(ctx, logits, loss_mask=None, clip_min=1e-6): + """forward""" + if loss_mask is not None: + assert loss_mask.stop_gradient + loss, max_logits, safe_sumexp, logsumexp_per_token = ( + moe_router_loss_ops.cal_z_loss(logits, loss_mask, clip_min) + ) + ctx.save_for_backward( + logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token + ) + ctx.clip_min = clip_min + return loss + + @staticmethod + def backward(ctx, out_grad): + """backward""" + logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token = ( + ctx.saved_tensor() + ) + if logits.stop_gradient: + return None + clip_min = ctx.clip_min + return moe_router_loss_ops.cal_z_loss_grad( + out_grad, + logits, + loss_mask, + max_logits, + safe_sumexp, + logsumexp_per_token, + clip_min, + ) + + +class CalAuxLossFunctor(paddle.autograd.PyLayer): + """CalAuxLossFunctor""" + + @staticmethod + def forward( + ctx, + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + moe_k, + clip_min=1e-6, + ): + """forward""" + if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: + tokens_mask = tokens_mask.astype(gate_prob.dtype) + loss, seqlen_float, ce = paddle.incubate.nn.functional.cal_aux_loss( + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + moe_k, + clip_min, + ) + ctx.save_for_backward(gate_prob, seqlen_float, ce) + ctx.num_experts = num_experts + ctx.use_group = use_group + ctx.moe_k = moe_k + return loss + + @staticmethod + def backward(ctx, out_grad): + """backward""" + gate_prob, seqlen_float, ce = ctx.saved_tensor() + num_experts = ctx.num_experts + use_group = ctx.use_group + moe_k = ctx.moe_k + return paddle.incubate.nn.functional.cal_aux_loss_grad( + out_grad, gate_prob, seqlen_float, ce, num_experts, use_group, moe_k + ) + + +def 
cal_orthogonal_loss_opt_each_weight_func( + weight, moe_k, use_group, eps, xpu_matmul=None, training=True +): + """cal_orthogonal_loss_opt_each_weight_func""" + weight = weight.transpose([1, 0]).contiguous() # transpose weight here + wnorm = weight.norm(axis=1) + weight = weight / paddle.maximum(wnorm, eps).unsqueeze(1) + + if use_group: + weight = weight.reshape([moe_k, -1, weight.shape[1]]) # [K, E/K, H] + eye_matrix = paddle.eye(weight.shape[1], dtype=weight.dtype).unsqueeze(0) + else: + eye_matrix = paddle.eye(weight.shape[0], dtype=weight.dtype) + + if get_env_device() == "xpu" and xpu_matmul is not None: + weight_matmul = xpu_matmul(weight, weight, transpose_y=True, training=training) + else: + weight_matmul = paddle.matmul(weight, weight, transpose_y=True) + + orthogonal_loss = weight_matmul - eye_matrix + orthogonal_loss = _squared_l2_norm(orthogonal_loss) / orthogonal_loss.size + return orthogonal_loss + + +def cal_z_loss_func(logits, loss_mask): + """cal_z_loss_func""" + if loss_mask is not None: + loss_mask = loss_mask.astype(logits.dtype) + l_zloss = (logits.logsumexp(1).square() * loss_mask).sum() / paddle.clip( + loss_mask.sum(), min=1e-6 + ) + else: + l_zloss = logits.logsumexp(1).square().mean() + return l_zloss + + +def cal_aux_loss_func( + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + moe_k, + global_aux_loss=False, + rank=None, + group=None, +): + """cal_aux_loss_func""" + if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: + tokens_mask = tokens_mask.astype(gate_prob.dtype) + + scale = None + if dispatch_tokens_mask is not None: + seqlen_float = dispatch_tokens_mask.astype(gate_prob.dtype).sum() + if ( + tokens_mask is not None + and gate_prob.shape[0] != dispatch_tokens_mask.shape[0] + ): + scale = seqlen_float / paddle.clip(tokens_mask.sum(), min=1e-6) + elif tokens_mask is not None: + seqlen_float = tokens_mask.sum() + else: + seqlen_float = 
gate_prob.numel().astype(gate_prob.dtype) / num_experts + seqlen_float = paddle.clip(seqlen_float, min=1e-6) + + if len(dispatch_mask.shape) == 2: + dispatch_mask = dispatch_mask.sum(0) + ce = dispatch_mask.astype(gate_prob.dtype).detach() / seqlen_float + me = paddle.sum(gate_prob, axis=0) / seqlen_float + # me = paddle.mean(gate_prob, axis=0) + # ce = paddle.mean(dispatch_mask.cast("float32"), axis=0) + if global_aux_loss: + me_list, ce_list = [], [] + dist.all_gather(me_list, me, group=group) + dist.all_gather(ce_list, ce, group=group) + + me_list[rank] = me + ce_list[rank] = ce + me = paddle.stack(me_list).mean(0) + ce = paddle.stack(ce_list).mean(0) + + l_aux = paddle.sum(me * ce) * num_experts + if use_group: + l_aux = l_aux / moe_k + + if scale is not None: + l_aux = l_aux + (scale - 1) * l_aux.detach() + + return l_aux + + +def masked_fill(x, mask, value): + + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +@paddle.no_grad() +def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters: int = 10): + """ + Computes the optimal transport matrix and Sinkhorn distance using the + Sinkhorn-Knopp algorithm + + Inputs: + - M : cost matrix (n x m) + - r : vector of marginals (n, ) + - c : vector of marginals (m, ) + - lam : strength of the entropic regularization + - epsilon : convergence parameter + + Outputs: + - P : optimal transport matrix (n x m) + - dist : Sinkhorn distance + """ + n, _ = M.shape + # P = (- lam * M).exp() + # P /= P.sum() + P = F.softmax(-M / lam) + u = paddle.zeros(n, "float32") + # normalize this matrix + for _ in range(max_iters): + if (u - P.sum(1)).abs().max() < epsilon: + break + u = P.sum(1) + P *= (r / (u + 1e-8)).reshape((-1, 1)) + P *= (c / (P.sum(0) + 1e-8)).reshape((1, -1)) + P = paddle.where(~P.isnan(), P, paddle.zeros_like(P)) + return P, _ + + +def cast_if_needed(x, dtype): + """ + cast_if_needed + """ + return x.cast(dtype) if x.dtype != dtype else x + + +class
FusedGateDetachMatmul(paddle.autograd.PyLayer): + """ + FusedGateDetachMatmul + """ + + @staticmethod + def forward(ctx, x, w): + """ + forward + """ + ctx.dtype = paddle.float32 + ctx.save_for_backward(x, w) + return F.linear(cast_if_needed(x, ctx.dtype), cast_if_needed(w, ctx.dtype)) + + @staticmethod + def backward(ctx, y_grad): + """ + backward + """ + x, w = ctx.saved_tensor() + assert ctx.dtype == y_grad.dtype, "dtype not match" + x_g, w_g = matmul_bwd( + cast_if_needed(x, ctx.dtype), + cast_if_needed(w, ctx.dtype), + y_grad, + False, + False, + ) + return cast_if_needed(x_g, x.dtype), cast_if_needed(w_g, w.dtype) + + +def gate_detach_matmul(x, weight, use_fuse, use_fake_gate=False): + """ + gate_detach_matmul + """ + if use_fuse: + score = FusedGateDetachMatmul.apply(x, weight) + else: + x = cast_if_needed(x, paddle.float32) + score = F.linear(x, weight) + + if use_fake_gate: + score = paddle.randn(score.shape).astype(score.dtype) + score - score + return score + + +class Top2Gate(nn.Layer): + """Gate module which implements Top2Gating as described in Gshard_. + :: + + gate = Top2Gate(model_dim, num_experts) + l_aux, combine_weights, dispatch_mask = gate(input) + + .. 
Gshard_: https://arxiv.org/pdf/2006.16668.pdf + + Args: + model_dim (int): + size of model embedding dimension + num_experts (ints): + number of experts in model + """ + + def __init__(self, config, layer_idx: int, group, gate_weight=None) -> None: + + super().__init__() + if get_env_device() == "xpu": + try: + from paddle_xpu.layers.nn import xpu_matmul + + self.xpu_matmul = xpu_matmul() + except ImportError: + self.xpu_matmul = None + else: + self.xpu_matmul = None + self.config = config + self.fuse_gate_detach_matmul = config.fuse_gate_detach_matmul + if self.fuse_gate_detach_matmul: + assert matmul_bwd is not None, "matmul_bwd is not supported" + + self.use_fake_gate = config.use_fake_gate + if self.use_fake_gate: + logging.warning( + "You are use fake_gate, which is just for test, not for real training." + ) + + self.model_dim = config.hidden_size + self.num_experts = config.moe_num_experts + self.num_experts_tensor = ( + sum(config.moe_num_experts) + if config.multimodel_experts + else config.moe_num_experts + ) # paddle.to_tensor(config.moe_num_experts, dtype="float32").sum() + + self.cap = config.moe_capacity + self.group = group + + self.layer_idx = layer_idx + self.global_aux_loss = config.global_aux_loss + if self.global_aux_loss: + self.rank = dist.get_rank(self.group) + + self.sinkhorn_2gate = config.sinkhorn_2gate + self.sinkhorn_temp = config.sinkhorn_temp + self.use_token_type_bias = config.moe_use_token_type_bias + self.use_correction_bias = config.moe_use_aux_free + + if config.moe_gate_act == "softmax": + self.act = partial(F.softmax, axis=-1) # [S,E] + elif config.moe_gate_act == "sigmoid": + self.act = F.sigmoid + else: + raise ValueError(f"{config.moe_gate_act} is not supported.") + self.no_jitter = True + self.expert_drop = False + self.eye_matrix = None + self.eye_matrix_size = None + self.enable_logging = config.moe_logging + self.norm_gate_logits = config.moe_norm_gate_logits + self.one = paddle.ones([], dtype="float32") + + 
self.moe_aux_loss_lambda = paddle.to_tensor( + config.moe_aux_loss_lambda, dtype="float32" + ) + self.moe_z_loss_lambda = paddle.to_tensor( + config.moe_z_loss_lambda, dtype="float32" + ) + self.moe_orthogonal_loss_lambda = paddle.to_tensor( + config.moe_orthogonal_loss_lambda, dtype="float32" + ) + if self.moe_aux_loss_lambda.ndim == 0: + self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.unsqueeze(0) + if self.moe_z_loss_lambda.ndim == 0: + self.moe_z_loss_lambda = self.moe_z_loss_lambda.unsqueeze(0) + if self.moe_orthogonal_loss_lambda.ndim == 0: + self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.unsqueeze( + 0 + ) + + self.experts_type_ids = None + if config.moe_orthogonal_loss_lambda: + if hasattr(fleet.fleet, "_user_defined_strategy"): + strategy = fleet.fleet._user_defined_strategy + sharding_configs = strategy.hybrid_configs["sharding_configs"] + pp_config = strategy.hybrid_configs["pp_configs"] + assert ( + not sharding_configs.comm_overlap + and not pp_config.sharding_comm_overlap + ), "orthogonal loss will cause twice gradient accumulate, will break pp/sharding overlap" + + self.eps = paddle.to_tensor([1e-12], dtype="float32") + if config.multimodel_experts: + if config.moe_use_hard_gate: + self.num_experts_list = [] + self.experts_type_mask = [] + experts_ids = paddle.zeros( + [sum(self.num_experts)], dtype="int64" + ).reshape([config.moe_world_size, -1]) + offset = 0 + for i, expert_num in enumerate(self.num_experts): + experts_ids[ + :, offset : offset + expert_num // config.moe_world_size + ] = i + offset += expert_num // config.moe_world_size + self.experts_type_ids = experts_ids.reshape([-1]) + logger.info( + f"use moe_use_hard_gate, experts_ids: {self.experts_type_ids}" + ) + for i, expert_num in enumerate(self.num_experts): + self.experts_type_mask.append( + self.experts_type_ids == i, + ) + self.num_experts_list.append(expert_num) + else: + assert ( + not config.moe_group_experts + ), "group_experts must use hard_gate when 
multimodel_experts is True" + else: + self.num_experts_list = [self.num_experts] + if gate_weight is not None: + self.weight = gate_weight + assert ( + not self.config.moe_use_token_type_bias + ), "gate_weights is from outside, token_type_bias can't be used" + logger.info("moe use gate_weight from outside") + self._cast_to_low_precision = False + self._cast_to_low_precison = False + else: + self._create_gate_parameter() + logger.info( + f"{config.moe_gate}: w/ capacity: {self.cap} experts:{self.num_experts} " + f"use_token_type_bias:{self.use_token_type_bias} gate_act:{config.moe_gate_act} " + f"norm_gate_logits={self.norm_gate_logits} use_correction_bias={self.use_correction_bias}" + ) + + def _create_gate_parameter(self): + + if self.config.multimodel_experts: + # support setting lambda for each expert group + self.moe_z_loss_lambda = self.moe_z_loss_lambda.expand( + len(self.num_experts) + ) + self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.expand( + len(self.num_experts) + ) + self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.expand( + len(self.num_experts) + ) + + for i, num_experts in enumerate(self.num_experts): + if i == 1: + with paddle.utils.unique_name.guard(f"mm_gate_{self.layer_idx}_"): + p = self.create_parameter( + shape=[self.model_dim, num_experts], + dtype="float32", + attr=paddle.ParamAttr( + name=unique_name.generate("moe_gate") + ), + ) + else: + p = self.create_parameter( + shape=[self.model_dim, num_experts], + dtype="float32", + attr=paddle.ParamAttr(name=unique_name.generate("moe_gate")), + ) + p.expert_type = f"expert_type_{i}" + self.add_parameter( + ( + "weight" if i == 0 else f"weight_{i}" + ), + p, + ) + else: + self.weight = self.create_parameter( + shape=[self.model_dim, self.num_experts], + dtype="float32", + attr=paddle.ParamAttr( + name=unique_name.generate("moe_gate") + ), + ) + logger.info(f"moe-Gate, {self.weight}") + + if self.use_token_type_bias: + if self.config.multimodel_experts: + assert ( + not 
self.config.moe_use_hard_gate + ), "multimodel_experts with hard_gate is not support token_type_bias." + num_experts = ( + sum(self.num_experts) + if self.config.multimodel_experts + else self.num_experts + ) + bias_type_num = ( + len(self.num_experts) if self.config.multimodel_experts else 1 + ) + self.bias = self.create_parameter( + shape=[bias_type_num, num_experts], + dtype="float32", + attr=paddle.ParamAttr( + name=unique_name.generate("moe_gate_bias"), + initializer=paddle.nn.initializer.Assign( + np.zeros([bias_type_num, num_experts]) + ), + ), + ) + logger.info(f"using token type bias, bias: {self.bias},") + self._cast_to_low_precision = False + self._cast_to_low_precison = False + + def get_gate_weight(self, transform_weight): + if not self.config.multimodel_experts: + return self.weight + if not transform_weight: + return paddle.concat( + [ + getattr(self, "weight" if i == 0 else f"weight_{i}") + for i in range(len(self.num_experts)) + ], + -1, + ) + weight = paddle.zeros( + [ + self.model_dim, + self.config.moe_world_size, + sum(self.num_experts) // self.config.moe_world_size, + ], + dtype="float32", + ) + offset = 0 + for i, num_experts in enumerate(self.num_experts): + weight[ + :, :, offset : offset + num_experts // self.config.moe_world_size + ] = getattr(self, "weight" if i == 0 else f"weight_{i}").reshape( + [self.model_dim, self.config.moe_world_size, -1] + ) + offset += num_experts // self.config.moe_world_size + weight = weight.reshape([self.model_dim, -1]) + + return weight + + def forward( + self, + input: Tensor, + token_type_ids: Tensor = None, + transform_weight: bool = True, # [seq] + correction_bias: Tensor = None, # [seq] + ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore + + orig_dtype = input.dtype + weight = self.get_gate_weight(transform_weight) + with paddle.amp.auto_cast(False): + if get_env_device() == "xpu" and self.xpu_matmul is not None: + assert not self.fuse_gate_detach_matmul, "not supported on XPU" + input_32 = 
def get_capacity(self, num_tokens, cap_factor=None):
    """Return the per-expert token capacity for ``num_tokens`` routed tokens.

    Args:
        num_tokens: number of tokens handled by this gate call.
        cap_factor: optional capacity factor. When given it overrides the
            configured ``self.cap`` entries.

    Returns:
        int: ``int(cap * num_tokens // num_experts)``.

    Raises:
        AssertionError: if the resulting capacity is not strictly positive.
    """
    if self.config.multimodel_experts:
        # One entry per expert group; capacity is shared over all of them.
        num_experts = sum(self.num_experts)
    else:
        num_experts = self.num_experts

    if cap_factor is not None:
        cap = cap_factor
    elif self.training:
        cap = self.cap[0]
    elif num_tokens < num_experts:  # fewer tokens than experts
        cap = self.cap[2]
    else:
        cap = self.cap[1]

    capacity = int(cap * num_tokens // num_experts)
    # The check is strict, so the message must say "> 0" (it used to say ">= 0").
    assert capacity > 0, (
        f"requires capacity to be > 0. cap={cap}, num_tokens={num_tokens}, "
        f"num_experts={num_experts}"
    )
    return capacity
indices2_s) + .cast(paddle.float32) + .mean() + .item(), + f"redispatch_acc_layer_{self.layer_idx}": ( + indices2_s_original == indices2_s + ) + .cast(paddle.float32) + .mean() + .item(), + } + ) + + mask2 = F.one_hot(indices2_s, num_classes=self.num_experts).cast(paddle.int64) + + # Compute locations in capacity buffer + locations1 = ( + paddle.cumsum(mask1, axis=0) - 1 + ) # [0,1,1,0,1,0,0] -> [0,0,0,0,1,1,1,] + locations2 = paddle.cumsum(mask2, axis=0) - 1 + # Update 2nd's location by accounting for locations of 1st + locations2 += paddle.sum(mask1, axis=0, keepdim=True) + + # Remove locations outside capacity from mask + mask1 *= (locations1 < capacity).cast(paddle.int64) # [0,1,1,0,0,0,0] + mask2 *= (locations2 < capacity).cast(paddle.int64) + + # Store the capacity location for each token + locations1_s = paddle.sum(locations1 * mask1, axis=1) + locations2_s = paddle.sum(locations2 * mask2, axis=1) + + # Normalize gate probabilities + mask1_float = mask1.cast(paddle.float32) + mask2_float = mask2.cast(paddle.float32) + gates1_s = (gates * mask1_float).sum(axis=-1) + gates2_s = (gates * mask2_float).sum(axis=-1) + # logger.info(f'gates1_s:{gates1_s} gates2_s:{gates2_s} logits:{logits}') + + if self.norm_gate_logits: + denom_s = gates1_s + gates2_s # [0.2, 0.3] + # Avoid divide-by-zero + denom_s = paddle.clip(denom_s, min=1e-6) + gates1_s /= denom_s + gates2_s /= denom_s + if self.training and self.expert_drop: + # log.debug(gates2_s) + gates2_s = paddle.where( + 2 * gates2_s < paddle.rand_like(gates2_s), + paddle.zeros_like(gates2_s), + gates2_s, + ) + + # Calculate combine_weights and dispatch_mask + gates1 = gates1_s.unsqueeze(1) * mask1_float + gates2 = gates2_s.unsqueeze(1) * mask2_float + + expert1_index = paddle.argmax(gates1, -1) + combine1_weight = paddle.max(gates1, -1, keepdim=True) + scatter1_index = expert1_index * capacity + locations1_s + scatter1_index = scatter1_index.cast("int64") + dispatch1_mask = combine1_weight.cast(paddle.bool).detach() + 
+ expert2_index = paddle.argmax(gates2, -1) + combine2_weight = paddle.max(gates2, -1, keepdim=True) + scatter2_index = expert2_index * capacity + locations2_s + scatter2_index = scatter2_index.cast("int64") + dispatch2_mask = combine2_weight.cast(paddle.bool).detach() + # logger.info(f'expert-id: {expert1_index} vs {expert2_index}, mask:{mask1_float} vs {mask2_float}') + if self.enable_logging and global_training_logs_enabled(): + global_training_logs.update( + **{ + "top1_gate": ( + combine1_weight.sum() + / (dispatch1_mask.cast("float32").sum() + 1e-9) + ).item(), + "top2_gate": ( + combine2_weight.sum() + / (dispatch2_mask.cast("float32").sum() + 1e-9) + ).item(), + f"top1_gate_layer_{self.layer_idx}": ( + combine1_weight.sum() + / (dispatch1_mask.cast("float32").sum() + 1e-9) + ).item(), + f"top2_gate_layer_{self.layer_idx}": ( + combine2_weight.sum() + / (dispatch2_mask.cast("float32").sum() + 1e-9) + ).item(), + } + ) + + seqlen = logits.shape[0] + top1_gate_experts_per_token = ( + paddle.cast(dispatch1_mask, dtype="float32").sum() / seqlen + ) + top2_gate_experts_per_token = ( + paddle.cast(dispatch2_mask, dtype="float32").sum() / seqlen + ) + leakage_experts_per_token = ( + paddle.cast( + (~dispatch1_mask) & (~dispatch2_mask), dtype="float32" + ).sum() + / seqlen + ) + + experts_per_token = ( + top1_gate_experts_per_token + top2_gate_experts_per_token + ) + _log = { + f"experts_per_token_layer_{self.layer_idx}": experts_per_token.item(), + f"top1_experts_per_token_layer_{self.layer_idx}": top1_gate_experts_per_token.item(), + f"top2_experts_per_token_layer_{self.layer_idx}": top2_gate_experts_per_token.item(), + f"leakage_experts_per_token_layer_{self.layer_idx}": leakage_experts_per_token.item(), + } + global_training_logs.update( + **_log, + **{ + k.replace(f"_layer_{self.layer_idx}", ""): v + for k, v in _log.items() + }, + ) + + return ( + capacity, + paddle.concat((dispatch1_mask, dispatch2_mask), 1), + paddle.concat((combine1_weight, 
combine2_weight), 1), + paddle.stack((scatter1_index, scatter2_index), 1), + l_aux, + l_zloss, + ) + + def _cal_aux_loss( + self, + gate_prob, + dispatch_mask, + num_experts=None, + use_group=None, + tokens_mask=None, + dispatch_tokens_mask=None, + ): + + if self.act is F.sigmoid: + gate_prob = gate_prob / gate_prob.sum(-1, keepdim=True) + + if self.use_correction_bias: + if tokens_mask is not None: + gate_prob_this_modality = gate_prob[tokens_mask.astype("bool")] + if gate_prob_this_modality.shape[0]: + _, top_idx = gate_prob_this_modality.topk( + k=self.config.moe_k, axis=-1 + ) + if int_bincount is not None: + dispatch_mask = int_bincount( + top_idx, 0, gate_prob.shape[-1], paddle.int64 + ) + else: + mask = paddle.zeros_like( + gate_prob_this_modality + ).put_along_axis(top_idx, paddle.to_tensor(1.0), axis=1) + dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) + else: + dispatch_mask = paddle.zeros(gate_prob.shape[-1], dtype="int64") + dist.stream.all_reduce( + dispatch_mask, + group=self.group, + use_calc_stream=True, + ) + else: + _, top_idx = gate_prob.topk(k=self.config.moe_k, axis=-1) + if int_bincount is not None: + dispatch_mask = int_bincount( + top_idx, 0, gate_prob.shape[-1], paddle.int64 + ) + else: + mask = paddle.zeros_like(gate_prob).put_along_axis( + top_idx, paddle.to_tensor(1.0), axis=1 + ) + dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) + + if num_experts is None: + num_experts = self.num_experts_tensor + if use_group is None: + use_group = self.config.moe_group_experts + + moe_router_loss_ops = None + if ( + moe_router_loss_ops is not None + and get_env_device() != "xpu" + and (tokens_mask is None or len(tokens_mask.shape) == 1) + and (tokens_mask is None or tokens_mask.shape[0] == gate_prob.shape[0]) + and (gate_prob.shape[0] >= gate_prob.shape[1]) + and (not self.global_aux_loss) + and (gate_prob.dtype == paddle.float32) + ): + return CalAuxLossFunctor.apply( + gate_prob, + dispatch_mask, + tokens_mask, + 
def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
    """Gate orthogonal loss for a single gate weight (optimized version).

    The weight is promoted to float32 first. The fused functor is used when
    ``moe_router_loss_ops`` is available and the device is not XPU; otherwise
    the plain implementation is called with the extra ``eps`` / ``xpu_matmul``
    / ``training`` arguments it needs.
    """
    weight = (
        weight if weight.dtype == paddle.float32 else weight.astype(paddle.float32)
    )

    fused_op_usable = (
        moe_router_loss_ops is not None
        and get_env_device() != "xpu"
        and weight.dtype == paddle.float32
    )
    if not fused_op_usable:
        return cal_orthogonal_loss_opt_each_weight_func(
            weight,
            self.config.moe_k,
            use_group,
            self.eps,
            self.xpu_matmul,
            self.training,
        )
    return CalOrthogonalLossOptEachWeightFunctor.apply(
        weight, self.config.moe_k, use_group
    )
use_group=False + ) + return orthogonal_loss + + +class TopKGateFused(Top2Gate): + """doc""" + + def forward( + self, + input: Tensor, + token_type_ids=None, + transform_weight=True, + ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore + """ + Args: + input: paddle.Tensor, hidden-states of layer + token_type_ids: paddle.Tensor[Seqw], token_type_ids of input + transform_weight: bool, when using multimodal experts, perform `self.get_gate_weight` if specified + Retruns: + paddle.Tensor [Seq, Expert, Capacity]: float32, combine weights + paddle.Tensor [Seq, Expert, Capacity]: bool, dispatch mask + Tuple[paddle.Tensor]: `GateOutput` + """ + capacity = self.get_capacity(input.shape[0]) + weight = self.get_gate_weight(transform_weight) + with paddle.amp.auto_cast(False): + if get_env_device() == "xpu" and self.xpu_matmul is not None: + assert not self.fuse_gate_detach_matmul, "not supported on XPU" + input_32 = input.cast("float32") + logits = self.xpu_matmul( + input_32, + weight, + training=self.training, + ) + else: + logits = gate_detach_matmul( + input, weight, self.fuse_gate_detach_matmul, self.use_fake_gate + ) + if self.use_token_type_bias: + assert token_type_ids is not None + assert ( + token_type_ids.max() < self.bias.shape[0] + ), f"token_type_ids {token_type_ids.max()} >= bias shape {self.bias.shape[0]}" + bias = self.bias[token_type_ids] # [seq] + logits = logits + bias + orthogonal_loss = None + router_loss = paddle.zeros([1], dtype="float32") + router_loss.stop_gradient = False + if ( + self.enable_logging + and global_training_logs_enabled() + and orthogonal_loss is not None + ): + _log = { + f"orthogonal_loss_layer_{self.layer_idx}": orthogonal_loss.item(), + } + global_training_logs.update( + **_log, + **{ + k.replace(f"_layer_{self.layer_idx}", ""): v + for k, v in _log.items() + }, + ) + + return logits, capacity, router_loss diff --git a/examples/pre-training/models/sequence_parallel_utils_auto.py 
b/examples/pre-training/models/sequence_parallel_utils_auto.py new file mode 100644 index 00000000..4b80ca3f --- /dev/null +++ b/examples/pre-training/models/sequence_parallel_utils_auto.py @@ -0,0 +1,910 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# !/usr/bin/env python3 + +import hashlib +import numpy as np +import logging + +import paddle +from paddle import distributed as dist +from paddle.nn import functional as F +from paddle.autograd import PyLayer +from paddle.nn.layer.layers import Layer +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients_with_group, +) +from paddle.incubate.tensor.manipulation import create_async_load + +from models.comm_utils import ( + scatter, + all_gather, + reduce_scatter, + mp_slice, + all_gather_varlen, +) +from paddleformers.utils.tools import get_env_device + + +from paddle.distributed import in_auto_parallel_align_mode + + +try: + from paddle.nn.functional import gemm_reduce_scatter, all_gather_gemm + import paddle.nn.functional.flux as flux +except ImportError: + gemm_reduce_scatter = None + all_gather_gemm = None + flux = None + +logger = logging.getLogger(__name__) + +if not hasattr(paddle.Tensor, "contiguous"): + + def contiguous(self): + """ + Make the tensor contiguous. 
def get_async_loader():
    """Return the shared async loader, creating it on first use.

    When fleet has a hybrid communicate group (``fleet.fleet._hcg`` exists)
    the loader is cached as an attribute on that group. Otherwise it is
    cached in the module-level ``async_loader`` variable.

    Raises:
        AssertionError: when the current device is XPU.
    """
    global async_loader
    assert get_env_device() != "xpu"

    if hasattr(fleet.fleet, "_hcg"):
        hcg = get_hcg()
        if not hasattr(hcg, "async_loader"):
            hcg.async_loader = create_async_load()
        return hcg.async_loader

    # No hybrid group yet: fall back to the module-level singleton.
    if async_loader is None:
        async_loader = create_async_load()
    return async_loader
class SliceVarlenOp(PyLayer):
    """Variable-length slice in forward, variable-length all-gather in backward.

    This is the mirror image of ``AllGatherVarlenOpV2``: forward keeps the
    local part of ``input`` via ``mp_slice`` and backward rebuilds the full
    gradient via ``all_gather_varlen``.
    """

    @staticmethod
    def forward(
        ctx,
        input,
        indices,
        group=None,
    ):
        """Slice ``input`` with ``mp_slice``.

        Args:
            ctx: PyLayer context; ``indices`` and ``group`` are stored on it.
            input: tensor to slice.
            indices: passed through to ``mp_slice`` and reused in backward.
            group: communication group forwarded to ``mp_slice``.
        """
        # Everything backward needs must be stored on ctx here.
        ctx.indices = indices
        ctx.group = group
        return mp_slice(input, indices, group=group)

    @staticmethod
    def backward(ctx, grad):
        """Gather the sliced gradients back with ``all_gather_varlen``."""
        # The previous code read `ctx.axis`, which forward never stores, and
        # omitted the `indices` argument that `all_gather_varlen` takes second.
        # NOTE(review): forward uses mp_slice's default axis, so backward uses
        # all_gather_varlen's default too; confirm both defaults agree.
        return all_gather_varlen(grad, ctx.indices, group=ctx.group)
class AllGatherOp(PyLayer):
    """All-gather in forward; all-reduce + scatter (reduce-scatter) in backward.

    input shape: [s/n, b, h], n is mp parallelism
    after forward shape: [s, b, h]
    Behaves like `AllGather`; the backward pass aggregates gradients, and the
    tensor is still in the MP-asynchronous state after the AllGather.
    """

    @staticmethod
    def forward(ctx, input, group=None):
        """Gather `input` across `group` and remember the group for backward."""
        ctx.group = group
        gathered = all_gather(input, group=group)
        return gathered

    @staticmethod
    def backward(ctx, grad):
        """Reduce the gradient across ranks and keep this rank's shard.

        grad shape: [s, b, h], n is mp parallelism
        after backward shape: [s/n, b, h]
        """
        if not in_auto_parallel_align_mode():
            return reduce_scatter(grad, group=ctx.group)

        # Align mode: explicit all-reduce followed by a local split along
        # axis 0 instead of the fused reduce-scatter.
        mp_group = (
            ctx.group
            if ctx.group is not None
            else get_hcg().get_model_parallel_group()
        )
        mp_group.process_group.allreduce(grad).wait()
        shards = paddle.split(grad, mp_group.nranks, axis=0)
        return shards[mp_group.rank]
class GemmReduceScatterOp(PyLayer):
    """overlap gemm and reduce scatter"""

    @staticmethod
    def forward(ctx, input, weight, group):
        """
        Args: input: Tensor[b * s, h / mp_size]
              weight: Tensor[h / mp_size, h'] or Tensor[h', h / mp_size]
              group: mp_group
        Returns: output: Tensor[b * s / mp_size, h']
        """
        ctx.save_for_backward(input, weight)
        ctx.group = group
        return gemm_reduce_scatter(input, weight, group)

    @staticmethod
    def backward(ctx, grad):
        """
        Args: grad: Tensor[b * s / mp_size, h']
        Returns: input_grad: Tensor[b * s, h / mp_size]
                 weight_grad: Tensor[h / mp_size, h'] or Tensor[h', h / mp_size]
        """
        input, weight = ctx.saved_tensor()
        group = ctx.group
        if input.stop_gradient and weight.stop_gradient:
            return None, None

        if input.stop_gradient:
            input_grad = None
            grad_parallel = None
        else:
            # all_gather_gemm also returns the gathered grad, which is reused
            # below for the weight gradient.
            input_grad, grad_parallel = all_gather_gemm(
                grad, weight, group, deepcopy_input_parallel=False
            )

        if weight.stop_gradient:
            weight_grad = None
        else:
            if grad_parallel is None:
                # Gather over the same group forward used. The previous call
                # omitted `group`, so a non-default group was ignored here.
                grad_parallel = all_gather(grad, group=group)
            weight_grad = paddle.matmul(input, grad_parallel, transpose_x=True)
        return input_grad, weight_grad
def create_fused_allreduce_gradient_hook(parameter_list, accumulation_steps):
    """Build a gradient hook that all-reduces `parameter_list` in one fused call.

    The returned hook counts its own invocations. Once it has been called
    ``accumulation_steps * len(parameter_list)`` times it resets the counter
    and runs ``fused_allreduce_gradients_with_group`` on the model-parallel
    group. The incoming gradient is always returned unchanged.
    """
    mp_group = get_hcg().get_model_parallel_group()
    # presumably the same hook is registered on every parameter, hence the
    # multiplication by the list length; verify against the caller.
    calls_per_sync = accumulation_steps * len(parameter_list)
    calls_seen = 0

    def __impl__(grad):
        nonlocal calls_seen
        calls_seen += 1
        if calls_seen == calls_per_sync:
            calls_seen = 0
            fused_allreduce_gradients_with_group(
                parameter_list, group=mp_group, scale=1.0
            )
        return grad

    return __impl__
def register_sequence_parallel_allreduce_hooks(
    model, fuse_sequence_parallel_allreduce=False
):
    """Register all-reduce backward hooks on sequence-parallel parameters.

    Does nothing when the model-parallel group has a single rank. Parameters
    are taken from ``model._layers`` and selected with
    ``is_sequence_parallel_parameter``; those with ``stop_gradient`` set get
    no hook.

    Raises:
        NotImplementedError: if ``fuse_sequence_parallel_allreduce`` is true.
    """
    logger.warning(
        "DO NOT use sphook unless your PyLayer does not trigger param backward hook"
    )
    if get_hcg().get_model_parallel_group().nranks <= 1:
        return

    sp_params = []
    for param_name, param in model._layers.named_parameters():
        if not is_sequence_parallel_parameter(param):
            continue
        logger.info(f"register bw hook for:{param_name}")
        sp_params.append(param)
    logger.info(f"#-sp-sync param:{len(sp_params)}")

    if fuse_sequence_parallel_allreduce:
        raise NotImplementedError()

    for param in sp_params:
        if param.stop_gradient:
            continue
        param._register_backward_hook(
            create_non_fused_allreduce_gradient_hook(param, model, verbose=False)
        )
class ColumnSequenceParallelLinear(Layer):
    """Column-parallel linear layer whose input is sharded along the sequence.

    The weight has shape ``[in_features, out_features / world_size]``. In
    forward the input is all-gathered over the model-parallel group (unless
    communication is disabled) and multiplied by the local weight shard.
    With ``use_tpsp_comm_overlap`` the gather and the GEMM are fused through
    ``AllGatherGemmOp`` when flux reports that it can implement the shapes.
    """

    def __init__(
        self,
        in_features,
        out_features,
        weight_attr=None,
        has_bias=None,
        gather_output=True,
        fuse_matmul_bias=False,
        mp_group=None,
        use_rr=False,
        name=None,
        use_comm=True,
        use_tpsp_comm_overlap=False,
    ):
        """
        Args:
            in_features: input feature size.
            out_features: total output feature size; must be divisible by the
                model-parallel world size.
            weight_attr: parameter attribute used for the weight.
            has_bias: create a zero-initialised bias when truthy.
            gather_output: must be ``False`` (asserted).
            fuse_matmul_bias: use ``fused_linear`` instead of ``F.linear``.
            mp_group: communication group; defaults to the hcg model-parallel
                group.
            use_rr: only checked against ``use_comm`` here.
            name: stored as ``self._name``.
            use_comm: when false the layer never issues communication.
            use_tpsp_comm_overlap: enable the fused all-gather + GEMM path.
        """
        super(ColumnSequenceParallelLinear, self).__init__()

        hcg = get_hcg()
        self.model_parallel_group = (
            hcg.get_model_parallel_group() if mp_group is None else mp_group
        )
        self.world_size = (
            hcg.get_model_parallel_group().nranks
            if mp_group is None
            else mp_group.nranks
        )
        self._name = name
        self.is_mp = self.world_size > 1
        self.use_comm = use_comm
        if not self.use_comm:
            assert not use_rr, "The moe allgather not compatibale with rr for now."
            logger.warning(
                "ColumnSequenceParallelLinear will NOT call ANY comm, "
                "this feature is only used for XPU moe allgather dispatcher. "
                "If this is not your purpose, please unset XPU_MOE_USE_ALLGATHER."
            )

        self.use_tpsp_comm_overlap = use_tpsp_comm_overlap
        if self.use_tpsp_comm_overlap:
            # Both are None when the optional fused kernels failed to import.
            assert all_gather_gemm is not None
            assert flux is not None

        assert (
            gather_output is False
        ), "If sequence_parallel is True, \
                        gather_output is False"

        self.gather_output = gather_output
        assert out_features % self.world_size == 0, (
            "Number of column of the weight for linear ({}) must be"
            " divisible by model parallel size ({})".format(
                out_features, self.world_size
            )
        )
        self.output_size_per_partition = out_features // self.world_size

        self._weight_attr = weight_attr
        self._dtype = self._helper.get_default_dtype()

        if self.is_mp and paddle.in_dynamic_mode():
            # Create the shard under the tracked RNG state.
            with get_rng_state_tracker().rng_state():
                self.weight = self.create_parameter(
                    shape=[in_features, self.output_size_per_partition],
                    attr=self._weight_attr,
                    dtype=self._dtype,
                    is_bias=False,
                )
        else:
            self.weight = self.create_parameter(
                shape=[in_features, self.output_size_per_partition],
                attr=self._weight_attr,
                dtype=self._dtype,
                is_bias=False,
            )

        self.weight.is_distributed = True if self.is_mp else False
        if self.weight.is_distributed:
            self.weight.split_axis = 1

        if has_bias:
            # initialize bias to zero like Megatron
            self.bias = self.create_parameter(
                shape=[self.output_size_per_partition],
                attr=paddle.nn.initializer.Constant(value=0.0),
                dtype=self._dtype,
                is_bias=True,
            )
            self.bias.is_distributed = True if self.is_mp else False
            if self.bias.is_distributed:
                self.bias.split_axis = 0
        else:
            self.bias = None

        self.linear = F.linear

        if self.use_tpsp_comm_overlap and self.is_mp and self.use_comm:
            self._rr_column_comm_ln = None

        self._rr_column_ln = None

        if fuse_matmul_bias:
            if not is_fused_matmul_bias_supported():
                raise NotImplementedError(
                    "You set fuse_matmul_bias=True in ColumnSequenceParallelLinear, "
                    "however, the paddle you are using not support this operation. "
                    "Please set fuse_matmul_bias=False or use paddle compiled "
                    "with cuda 11.6 or higher."
                )
            from paddle.incubate.nn.functional import fused_linear

            self.linear = fused_linear

    def forward(self, x, use_comm=True):
        """
        Args:
            x: Tensor:[seq/mp, dim]: input tensor:
            use_comm: bool, set to false to skip the all-gather
        """
        # sequence parallelism is same as model parallelism
        # if sequence parallel is true, input shape is [s, b, h]
        # else input shape is [b, s, h]
        if (
            self.use_tpsp_comm_overlap
            and self.is_mp
            and (use_comm and self.use_comm)
            and flux.all_gather_gemm_can_implement(
                x, self.weight, self.model_parallel_group
            )
        ):
            if self._rr_column_ln is not None and self.training:
                output = self._rr_column_comm_ln(
                    x=x, weight=self.weight, group=self.model_parallel_group
                )
            else:
                output = AllGatherGemmOp.apply(
                    x, self.weight, self.model_parallel_group
                )
            # The fused ops take no bias, so it is added afterwards.
            if self.bias is not None:
                output += self.bias
            return output
        else:
            if self.is_mp and (use_comm and self.use_comm):
                # Gather over this layer's own group. The previous call passed
                # no group, so a custom `mp_group` was ignored on this path.
                input_parallel = AllGatherOp.apply(x, self.model_parallel_group)
            else:
                input_parallel = x

            if self._rr_column_ln is not None and self.training:
                output = self._rr_column_ln(
                    self.linear, x=input_parallel, weight=self.weight, bias=self.bias
                )
            else:
                output = self.linear(input_parallel, self.weight, self.bias)
            return output
"If sequence_parallel is True, \ + input_is_parallel should be true." + + self.input_is_parallel = input_is_parallel + self._weight_attr = weight_attr + self._dtype = self._helper.get_default_dtype() + self._name = name + self.use_comm = use_comm + if not self.use_comm: + assert not use_rr, "The moe allgather not compatibale with rr for now." + logger.warning( + "RowSequenceParallelLinear will NOT call ANY comm, " + "this feature is only used for XPU moe allgather dispatcher. " + "If this is not your purpose, please unset XPU_MOE_USE_ALLGATHER." + ) + + self.use_tpsp_comm_overlap = use_tpsp_comm_overlap + if self.use_tpsp_comm_overlap: + assert gemm_reduce_scatter is not None + assert flux is not None + + if self.use_tpsp_comm_overlap and self.use_comm: + self._rr_rown_comm_ln = None + self._rr_rown_ln = None + + hcg = get_hcg() + self.model_parallel_group = ( + hcg.get_model_parallel_group() if mp_group is None else mp_group + ) + self.world_size = ( + hcg.get_model_parallel_group().nranks + if mp_group is None + else mp_group.nranks + ) + self.rank = ( + hcg.get_model_parallel_group().rank if mp_group is None else mp_group.rank + ) + + self.is_mp = self.world_size > 1 + assert in_features % self.world_size == 0, ( + "Number of row of the weight for linear ({}) must be" + " divisible by model parallel size ({})".format( + in_features, self.world_size + ) + ) + + self.input_size_per_partition = in_features // self.world_size + + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + else: + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + + self.weight.is_distributed = True if self.is_mp else False + if self.weight.is_distributed: + 
self.weight.split_axis = 0 + + # if sequence parallel is true, + # register hook to all_reduce gradient of weight and bias + if has_bias: + self.bias = self.create_parameter( + shape=[self.out_features], + attr=paddle.nn.initializer.Constant(value=0.0), + dtype=self._dtype, + is_bias=True, + ) + if self.is_mp: + mark_as_sequence_parallel_parameter(self.bias) + else: + self.bias = None + + self.linear = F.linear + self.mp_scale = None + + if fuse_matmul_bias: + if not is_fused_matmul_bias_supported(): + raise NotImplementedError( + "You set fuse_matmul_bias=True in RowParallelLinear, " + "however, the paddle you are using not support this operation. " + "Please set fuse_matmul_bias=False or use paddle compiled " + "with cuda 11.6 or higher." + ) + from paddle.incubate.nn.functional import fused_linear + + self.linear = fused_linear + + def forward(self, x): + input_parallel = x + if self.is_mp: + if self.mp_scale is not None: + bias = self.mp_scale(self.bias, self.world_size) + else: + bias = None + + if self._rr_rown_ln is not None and self.training: + if ( + self.use_tpsp_comm_overlap + and self.use_comm + and flux.gemm_reduce_scatter_can_implement( + x, self.weight, self.model_parallel_group + ) + ): + output_ = self._rr_rown_comm_ln( + x=input_parallel, + weight=self.weight, + group=self.model_parallel_group, + ) + if bias is not None: + output_ += bias + else: + output_ = self._rr_rown_ln( + self.linear, x=input_parallel, weight=self.weight, bias=bias + ) + else: + if ( + self.use_tpsp_comm_overlap + and self.use_comm + and flux.gemm_reduce_scatter_can_implement( + x, self.weight, self.model_parallel_group + ) + ): + output_ = GemmReduceScatterOp.apply( + x, self.weight, self.model_parallel_group + ) + if bias is not None: + output_ = output_ + bias + else: + output_parallel = self.linear(input_parallel, self.weight, bias) + if self.use_comm: + output_ = ReduceScatterOp.apply(output_parallel) + else: + output_ = output_parallel + + # if self.bias is not none, 
sequence parallel will use + # register_hook to all_reduce self.bias + if bias is None and self.bias is not None and self.use_comm: + output = output_ + self.bias + else: + output = output_ + else: + output = self.linear(input_parallel, self.weight, self.bias) + return output diff --git a/examples/pre-training/scripts/train_96_auto.sh b/examples/pre-training/scripts/train_96_auto.sh new file mode 100644 index 00000000..4deaf98d --- /dev/null +++ b/examples/pre-training/scripts/train_96_auto.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +export NNODES=1 +export PADDLE_TRAINERS_NUM=1 + +mpi_rank=${OMPI_COMM_WORLD_RANK:-0} +node_rank=$((mpi_rank+offset)) +mpi_node=${OMPI_COMM_WORLD_SIZE:-1} +echo "MPI status:${mpi_rank}/${mpi_node}" +nnode_train=${nnode_set:-${mpi_node}} +master_train=${master:-localhost} +# +echo "Distributed Training ${node_rank}/${nnode_train} master=${master_train}" +set -x + +export CUDA_MODULE_LOADING=LAZY +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_DEBUG=INFO +export PYTHONUNBUFFERED=1 +unset GLOG_vmodule GLOG_v +export PADDLE_DISABLE_CUDNN_FA=1 +export FLAGS_use_auto_growth_pinned_allocator=True +export FLAGS_pipeline_nccl_comm_init_option=1 +export FLAGS_sharding_v2_check_zero_padding=1 +export FLAGS_use_paddle_recall_error=0 +export FLAGS_tcp_max_syn_backlog=16384 +export FLAGS_call_stack_level=2 + + +# 屏蔽平台预设的环境变量,因为框架采用兼容升级,检测到这些配置会使用原方式启动 +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +nnodes=$PADDLE_TRAINERS_NUM +rank=$PADDLE_TRAINER_ID + + +export FLAGS_shard_use_reduce=1 +export FLAGS_shard_norm_align_dp=0 + +#加速pin memory save ckpt时间 +export FLAGS_use_auto_growth_pinned_allocator=True + +# export FLAGS_flash_attn_version=v1 +# 开启FA3 +SM=`nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1 | sed 's/\.//g'` +if [ $SM -eq 90 ] +then + export FLAGS_flash_attn_version=3 +else + export FLAGS_flash_attn_version=2 +fi + +# 保证集群稳定性的配置,跟性能无关 +export NCCL_IB_QPS_PER_CONNECTION=8 +export NCCL_IB_TIMEOUT=22 +export NCCL_IB_GID_INDEX=3 +# 开启AR功能 +export NCCL_IB_ADAPTIVE_ROUTING=1 + +# 集群hang检测 +export PADDLE_PG_TIMEOUT=150000 # 通信组超时时间,单位是ms,默认2分钟 +export FLAGS_enable_async_trace=False # True开启通信debug功能,False或不设置关闭,默认开启 +# export CUDA_MODULE_LOADING=LAZY + +export FLAGS_pipeline_nccl_comm_init_option=1 + +# 启动方式 +cuda_version=`nvidia-smi |grep "CUDA Version" |awk '{print $9}' |awk -F'.' 
'{print $1}'` +if [ ${cuda_version} != "12" ];then + export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH +fi + +master=`cat /root/paddlejob/workspace/hostfile | head -n 1 | awk '{print $1}'` +port=36677 + + +#自动并行相关 +export FLAGS_enable_fused_ffn_qkv_pass=1 +export FLAGS_enable_pir_api=1 +#export FLAGS_enable_sharding_stage1_tensor_fusion=1 +export FLAGS_enable_moe_utils=true + +#调试相关 +export FLAGS_call_stack_level=2 +#export GLOG_v=6 +#export FLAGS_print_ir=1 +#export FLAGS_benchmark=1 +#export CUDA_VISIBLE_DEVICES=0,1 + +export PYTHONPATH=$PYTHONPATH:./ernie + +LOG_DIR=output/paddle_distributed_logs + +rm -rf output +rm -rf core.* + +python -m paddle.distributed.launch \ + --log_dir $LOG_DIR \ + --run_mode=collective \ + ${script:-ernie/pretrain_auto.py} \ + --config yamls/pretrain_96_auto.yaml diff --git a/examples/pre-training/yamls/pretrain_96_auto.yaml b/examples/pre-training/yamls/pretrain_96_auto.yaml new file mode 100644 index 00000000..f2380abb --- /dev/null +++ b/examples/pre-training/yamls/pretrain_96_auto.yaml @@ -0,0 +1,162 @@ +# -----------环境变量----------------------# +env: + HOME: null + +# ---------------------------model args-------------------------------------------------# +model_args: + model_name_or_path: model_configs_auto/ + tokenizer_name: ./ernie/src/tokenizers/tokenizer_model + output_dir: ./output/ + data_filelist: conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 + data_weights: conf/ratio_eb45t_0321 + dev_data: ~/afs_ro/baihua.afs.baidu.com/user/sasd-score/rank-score-total/linjianhe/liuweixin/app/model/data/char-en-65536-v1/v4_corpus_wordseg/newgcc.dev.h5 + data_load_process_num: 40 + max_seq_length: 4096 + base_seq_length: 4096 + num_consecutive: 32 + sequence_parallel: 1 + + enable_global_training_logs: False + moe_use_aux_free_update_coef: 0.001 + global_logging_interval: 10 + model_config: + moe_logging: True + moe_use_aux_free: true + multi_token_pred_depth: 1 + + + +# ---------------------------trainer 
args-------------------------------------------------# +trainer_args: + loss_spike_settings: + enable_loss_spike_watcher: 1 + longjob_id: long-78f0ae68688b4659 + supervised_filename: output/paddle_distributed_logs/metrics_rank0.json + delimiter: "Loading configuration file" + watch_loss_spike_interval: 20 + loss_spike_restart_interval: 300 + params: + - data_type: null + data_type_human_read: "纯文" + max_loss_thr: 2.0 + max_tolerance_steps: 1 + allow_loss_fallback: 0 + start_check_step: 219700 + + use_sp_callback: true + moe_gate_lr_ratio: 0.01 + do_train: True + dataloader_num_workers: 8 + prefetch_factor: 32 + overwrite_output_dir: 1 + disable_tqdm: 1 + logging_steps: 1 + eval_steps: 1000 + eval_iters: -1 + save_steps: 3000 + max_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1e-8 + learning_rate: 2.2e-4 + min_lr: 2.2e-5 + + global_batch_size: 2 # 16660 + gradient_accumulation_steps: 1 # 8008: 14; + per_device_train_batch_size: 2 + per_device_eval_batch_size: 1 + + lr_scheduler: wsd:231084 + decay_function: 1-sqrt + max_grad_norm: 1.0 + + adaptive_norm_clip: 0 # 4350 step后,关闭 adaptive-norm-clip + adaptive_norm_clip_ratio: 1.2 + adaptive_norm_force_clear_state: 0 # 在切换分布式策略时, 开启强制刷新统计状态 + adaptive_norm_enable_record: 1 # 开启更详细的裁剪日志 + + use_async_save: True # enable asynchronize save to gain efficiency + + weight_decay: 0.1 + warmup_steps: 200 + save_total_limit: 5 + bf16: True + fp16_opt_level: "O2" + use_fp8: False + scale_loss: 4096 + seed: 666 + use_train_part_sharding: 1 + pre_alloc_memory: 60 + + # # N7 + # tensor_parallel_degree: 8 # N7:8, N4:8, N1:4 + # pipeline_parallel_degree: 7 # N7:7, N4:4, N1:2 + # virtual_pp_degree: 8 # N7:8, N4:8, N1:1 + + # # N4 + # tensor_parallel_degree: 8 # N7:8, N4:8, N1:4 + # pipeline_parallel_degree: 4 # N7:7, N4:4, N1:2 + # virtual_pp_degree: 8 # N7:8, N4:8, N1:1 + + # # N1 + # tensor_parallel_degree: 4 # N7:8, N4:8, N1:4 + # pipeline_parallel_degree: 2 # N7:7, N4:4, N1:2 + # virtual_pp_degree: 1 # N7:8, 
N4:8, N1:1 + + # N1 dynamic auto + tensor_parallel_degree: 4 # N7:8, N4:8, N1:4 + # pipeline_parallel_degree: 1 # N7:7, N4:4, N1:2 + pipeline_parallel_degree: 2 # N7:7, N4:4, N1:2 + n_microbatches: 2 + pipeline_schedule_mode: "VPP" + model_type: "ernie_pp" + virtual_pp_degree: 1 # N7:8, N4:8, N1:1 + + data_parallel_degree: 1 + sharding: "stage1" + sharding_degree: 1 # 170 + # sharding_degree: 170 # + amp_master_grad: 1 + pipeline_parallel_config: enable_delay_scale_loss #enable_dp_comm_overlap + # pipeline_parallel_config: enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler #enable_dp_comm_overlap + sharding_parallel_config: split_param enable_fuse_optimizer_states + sharding_comm_buffer_size_MB: 2048 + tensor_parallel_config: replace_with_parallel_cross_entropy + # tensor_parallel_config: sync_param sync_grad sync_moment + + + skip_profile_timer: False + + ignore_data_skip: 0 + shuffle_consecutive: True + + load_sharded_model: True + save_sharded_model: True + save_sharding_stage1_model_include_freeze_params: True + ignore_load_lr_and_optim: False + metrics_output_path: ./output/paddle_distributed_logs/ + + #TODO(@gexiao): move to longjob_args + pdc_download_ckpt: true + pdc_download_timeout: 300 + + # # Flash checkpoint settings + # enable_zero_cost_checkpoint: true + # save_tokenizer: false + # save_rng_states: false + # zcc_workers_num: 1 + # zcc_pipeline_hooks_capacity_usage: 0.8 + # flash_device_save_steps: 4 + # zcc_save_ema_coef: 0.9993 #exp((4/10000)*ln(1-0.9999)) + # zcc_ema_interval: 4 + + + use_moe: true + moe_with_send_router_loss: False + moe_group: mp + log_global_grad_norm: True + enable_optimizer_timer: False + gc_interval: 100000 + + enable_auto_parallel: 1 + to_static: 0 From 90aa758f750ec3793178734abfca1c8df66cc85e Mon Sep 17 00:00:00 2001 From: xuexixi Date: Wed, 13 Aug 2025 14:41:28 +0800 Subject: [PATCH 02/15] remove --- .../pre-training/models/fp8_linear_auto.py | 603 -------------- 
.../models/moe/moe_layer_auto_utils.py | 785 ++---------------- .../pre-training/models/moe/moe_utils_auto.py | 191 +---- .../pre-training/models/moe/top2_gate_auto.py | 1 - .../models/moe/top2_gate_auto_auto.py | 145 +--- .../models/sequence_parallel_utils_auto.py | 687 +-------------- 6 files changed, 103 insertions(+), 2309 deletions(-) delete mode 100644 examples/pre-training/models/fp8_linear_auto.py diff --git a/examples/pre-training/models/fp8_linear_auto.py b/examples/pre-training/models/fp8_linear_auto.py deleted file mode 100644 index 0d1ea29c..00000000 --- a/examples/pre-training/models/fp8_linear_auto.py +++ /dev/null @@ -1,603 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -FP8 Linear Layer Implementation for PaddlePaddle - -This module implements FP8 (8-bit floating point) linear layers using PaddlePaddle's -incubate APIs for low-precision training. Key features include: - -1. FP8 matrix multiplication with block-wise quantization -2. Memory-efficient forward/backward passes -3. 
PaddlePaddle-specific optimizations like: - - Using paddle.incubate.fp8 APIs - - Leveraging Paddle's automatic differentiation system - - Optimized for Paddle's tensor layout and memory management -""" - - -import numpy -import paddle -from paddle.incubate.fp8 import deep_gemm -from paddle.incubate.nn.functional import swiglu - -# Keep reference to original linear op for fallback if needed -original_linear = paddle.nn.functional.linear - - -# Expose only the main class to public API -__all__ = ["Fp8FusedMlp"] - - -def fp8_gemm( - x_fp8, - x_scale, - w_fp8, - w_scale, - is_a_1d_scaled, - is_b_1d_scaled, - out=None, - rtn_dtype=paddle.bfloat16, -): - """ - Performs FP8 matrix multiplication (GEMM) operation, using blockwise GEMM algorithm. - - Args: - x_fp8 (Tensor): Input tensor in FP8 format - x_scale (Tensor): Scaling factor for input tensor - w_fp8 (Tensor): Weight tensor in FP8 format - w_scale (Tensor): Scaling factor for weight tensor - is_a_1d_scaled (bool): Whether input tensor uses 1D scaling - is_b_1d_scaled (bool): Whether weight tensor uses 1D scaling - out (Tensor, optional): Output tensor for accumulation. Defaults to None - rtn_dtype (dtype, optional): Return data type. 
Defaults to paddle.bfloat16 - - Returns: - Tensor: Result of the matrix multiplication - """ - accumulate = out is not None - if numpy.prod(x_fp8.shape) != 0 and numpy.prod(w_fp8.shape) != 0: - # Using Paddle's blockwise FP8 GEMM with split accumulator for numerical stability - y = paddle.incubate.nn.functional.fp8_gemm_blockwise( - a=x_fp8, - a_decode_scale=x_scale, # Input scaling factors - b=w_fp8, - b_decode_scale=w_scale, # Weight scaling factors - out_dtype=rtn_dtype, # Output dtype (bfloat16) - out=out, # Optional output tensor for accumulation - accumulate=accumulate, # Whether to accumulate into out tensor - use_split_accumulator=True, # Paddle-specific optimization - is_a_1d_scaled=is_a_1d_scaled, # 1D scaling for input - is_b_1d_scaled=is_b_1d_scaled, # 1D scaling for weights - ) - else: - y = paddle.zeros([x_fp8.shape[0], w_fp8.shape[0]], rtn_dtype) - if out is not None: - out = out + y - return out - - return y - - -def padding(x, axis): - """ - Pads the input tensor along specified axis to make its size divisible by 512 or 128. - - Args: - x (Tensor): Input tensor to be padded - axis (int): Axis along which to pad (0 for rows, 1 for columns) - - Returns: - Tensor: Padded tensor - """ - if x.shape[axis] % 512 != 0: - if (x.shape[axis] + 128 - (x.shape[axis] % 128)) % 512 != 0: - padding_size = 512 - else: - padding_size = 128 - pad_size = padding_size - (x.shape[axis] % padding_size) - if axis == 0: - x = paddle.concat( - [x, paddle.zeros([pad_size, x.shape[-1]], dtype=x.dtype)], axis=0 - ) - else: - x = paddle.concat( - [x, paddle.zeros([x.shape[0], pad_size], dtype=x.dtype)], axis=-1 - ) - return x - - -class Fp8FusedMlpFunc(paddle.autograd.PyLayer): - """ - Custom PyLayer implementation of FP8 fused MLP operation. - - This class implements both forward and backward passes for a memory-efficient - FP8 (8-bit floating point) multi-layer perceptron using PaddlePaddle's - FP8 quantization APIs. 
- """ - - @staticmethod - def forward(ctx, x, w1, w2): - """ - Forward pass for FP8 fused multi-layer perceptron (MLP) operation. - - Args: - ctx (PyLayerContext): Context object to save tensors for backward pass - x (paddle.Tensor): Input tensor of shape [batch_size, hidden_size] - w1 (paddle.Tensor): First weight matrix of shape [hidden_size, intermediate_size*2] - w2 (paddle.Tensor): Second weight matrix of shape [intermediate_size, hidden_size] - - Returns: - paddle.Tensor: Output tensor of shape [batch_size, hidden_size] - - Note: - - Uses Paddle's FP8 quantization for memory efficiency - - Implements SWiGLU activation internally - - Handles tensor padding for optimal FP8 GEMM performance - """ - x_orig_shape = x.shape - x = x.reshape([-1, x_orig_shape[-1]]) - - if x.shape[0] % 512 != 0: - x_fp8, x_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - x, - quant_method="1x128", - input_transpose=False, - output_scale_transpose=True, - ) - x = padding(x, 0) - _, _, x_t_fp8, x_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - x, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - - else: - x_fp8, x_scale, x_t_fp8, x_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - x, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - - _, _, w1_fp8, w1_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w1, - quant_method="128x128", - input_transpose=True, - output_scale_transpose=False, - ) - o1 = paddle.empty([x_fp8.shape[0], w1_fp8.shape[0]], dtype=x.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale.T), (w1_fp8, w1_scale), o1) - - o2 = swiglu(o1) - o2_fp8, o2_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - o2, quant_method="1x128", input_transpose=False, output_scale_transpose=True - ) - - _, _, w2_t_fp8, w2_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w2, - quant_method="128x128", - 
input_transpose=True, - output_scale_transpose=False, - ) - ) - o3 = paddle.empty([o2_fp8.shape[0], w2_t_fp8.shape[0]], dtype=o1.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((o2_fp8, o2_scale.T), (w2_t_fp8, w2_t_scale), o3) - if len(x_orig_shape) > 2: - o3 = o3.reshape([x_orig_shape[0], -1, o3.shape[-1]]) - - ctx.save_for_backward( - x_t_fp8, - x_t_scale, - w1, - o1, - w2, - paddle.to_tensor(x_orig_shape, dtype="int64", place=paddle.CPUPlace()), - ) - return o3 - - @staticmethod - def backward(ctx, do3): - """ - Memory-efficient backward pass for FP8 fused MLP operation. - - Args: - ctx: Context object containing saved tensors from forward pass - do3 (Tensor): Gradient of the loss with respect to the output - - Returns: - Tuple[Tensor, Tensor, Tensor]: Gradients with respect to x, w1, and w2 - """ - do3_orig_shape = do3.shape - do3 = do3.reshape([-1, do3_orig_shape[-1]]) - - x_t_fp8, x_t_scale, w1, o1, w2, x_orig_shape = ctx.saved_tensor() - x_orig_shape = x_orig_shape.numpy() - - o2 = swiglu(o1) - if do3.shape[0] % 512 != 0: - do3_fp8, do3_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do3, - quant_method="1x128", - input_transpose=False, - output_scale_transpose=True, - ) - do3 = padding(do3, 0) - _, _, do3_t_fp8, do3_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do3, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - else: - do3_fp8, do3_scale, do3_t_fp8, do3_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do3, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - w2_fp8, w2_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w2, - quant_method="128x128", - input_transpose=False, - output_scale_transpose=False, - ) - do2 = paddle.empty([do3_fp8.shape[0], w2_fp8.shape[0]], do3.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((do3_fp8, do3_scale.T), (w2_fp8, w2_scale), do2) - - o2 = padding(o2, 0) - _, _, o2_t_fp8, o2_t_scale = 
( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - o2, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - - dw2 = fp8_gemm( - o2_t_fp8, - o2_t_scale, - do3_t_fp8, - do3_t_scale, - True, - True, - rtn_dtype=paddle.float32, - ) - - do1, _ = paddle._C_ops.swiglu_grad(o1, None, do2) - - if do1.shape[0] % 512 != 0: - do1_fp8, do1_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do1, - quant_method="1x128", - input_transpose=False, - output_scale_transpose=True, - ) - do1 = padding(do1, 0) - _, _, do1_t_fp8, do1_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do1, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - else: - do1_fp8, do1_scale, do1_t_fp8, do1_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do1, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - w1_fp8, w1_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w1, - quant_method="128x128", - input_transpose=False, - output_scale_transpose=False, - ) - dx = paddle.empty([do1_fp8.shape[0], w1_fp8.shape[0]], do1.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((do1_fp8, do1_scale.T), (w1_fp8, w1_scale), dx) - if len(x_orig_shape) > 2: - dx = dx.reshape([x_orig_shape[0], -1, dx.shape[-1]]) - - dw1 = fp8_gemm( - x_t_fp8, - x_t_scale, - do1_t_fp8, - do1_t_scale, - True, - True, - rtn_dtype=paddle.float32, - ) - return dx, dw1, dw2 - - -class MemEfficientFp8FusedMlpFunc(paddle.autograd.PyLayer): - """ - Memory-optimized version of FP8 fused MLP operation. - - This implementation reduces memory usage during training by: - - Avoiding redundant tensor storage in forward pass - - Recomputing intermediate values during backward pass - - Using optimized FP8 quantization strategies - - Inherits from paddle.autograd.PyLayer to implement custom backward pass. 
- """ - - @staticmethod - def forward(ctx, x, w1, w2): - """ - Memory-efficient forward pass for FP8 fused MLP operation. - - Args: - ctx (PyLayerContext): Context object to save minimal tensors for backward pass - x (paddle.Tensor): Input tensor of shape [batch_size, hidden_size] - w1 (paddle.Tensor): First weight matrix of shape [hidden_size, intermediate_size*2] - w2 (paddle.Tensor): Second weight matrix of shape [intermediate_size, hidden_size] - - Returns: - paddle.Tensor: Output tensor of shape [batch_size, hidden_size] - - Note: - - Saves only essential tensors for backward pass to reduce memory usage - - Uses recomputation strategy during backward pass - - Maintains same numerical accuracy as standard implementation - """ - x_orig_shape = x.shape - x = x.reshape([-1, x_orig_shape[-1]]) - - x_fp8, x_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - x, quant_method="1x128", input_transpose=False, output_scale_transpose=True - ) - - _, _, w1_fp8, w1_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w1, - quant_method="128x128", - input_transpose=True, - output_scale_transpose=False, - ) - o1 = paddle.empty([x_fp8.shape[0], w1_fp8.shape[0]], dtype=x.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale.T), (w1_fp8, w1_scale), o1) - - o2 = swiglu(o1) - o2_fp8, o2_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - o2, quant_method="1x128", input_transpose=False, output_scale_transpose=True - ) - - _, _, w2_t_fp8, w2_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w2, - quant_method="128x128", - input_transpose=True, - output_scale_transpose=False, - ) - ) - o3 = paddle.empty([o2_fp8.shape[0], w2_t_fp8.shape[0]], dtype=o1.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((o2_fp8, o2_scale.T), (w2_t_fp8, w2_t_scale), o3) - if len(x_orig_shape) > 2: - o3 = o3.reshape([x_orig_shape[0], -1, o3.shape[-1]]) - - ctx.save_for_backward( - x_fp8, - x_scale, - w1, - w2, - paddle.to_tensor(x_orig_shape, dtype="int64", 
place=paddle.CPUPlace()), - ) - return o3 - - @staticmethod - def backward(ctx, do3): - do3_orig_shape = do3.shape - do3 = do3.reshape([-1, do3_orig_shape[-1]]) - - x_fp8, x_scale, w1, w2, x_orig_shape = ctx.saved_tensor() - x_orig_shape = x_orig_shape.numpy() - - _, _, w1_fp8, w1_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w1, - quant_method="128x128", - input_transpose=True, - output_scale_transpose=False, - ) - o1 = paddle.empty([x_fp8.shape[0], w1_fp8.shape[0]], dtype=do3.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale.T), (w1_fp8, w1_scale), o1) - - x_dequant_fp16 = paddle.incubate.nn.functional.fused_act_dequant( - x_fp8, x_scale.T.contiguous() - ) - x_dequant_fp16 = padding(x_dequant_fp16, 0) - - _, _, x_t_fp8, x_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - x_dequant_fp16, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - - o2 = swiglu(o1) - - if do3.shape[0] % 512 != 0: - do3_fp8, do3_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do3, - quant_method="1x128", - input_transpose=False, - output_scale_transpose=True, - ) - do3 = padding(do3, 0) - _, _, do3_t_fp8, do3_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do3, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - else: - do3_fp8, do3_scale, do3_t_fp8, do3_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do3, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - w2_fp8, w2_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w2, - quant_method="128x128", - input_transpose=False, - output_scale_transpose=False, - ) - do2 = paddle.empty([do3_fp8.shape[0], w2_fp8.shape[0]], do3.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((do3_fp8, do3_scale.T), (w2_fp8, w2_scale), do2) - - o2 = padding(o2, 0) - _, _, o2_t_fp8, o2_t_scale = ( - 
paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - o2, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - - dw2 = fp8_gemm( - o2_t_fp8, - o2_t_scale, - do3_t_fp8, - do3_t_scale, - True, - True, - rtn_dtype=paddle.float32, - ) - - do1, _ = paddle._C_ops.swiglu_grad(o1, None, do2) - - if do1.shape[0] % 512 != 0: - do1_fp8, do1_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do1, - quant_method="1x128", - input_transpose=False, - output_scale_transpose=True, - ) - do1 = padding(do1, 0) - _, _, do1_t_fp8, do1_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do1, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - else: - do1_fp8, do1_scale, do1_t_fp8, do1_t_scale = ( - paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - do1, - quant_method="1x128", - input_transpose=True, - output_scale_transpose=True, - ) - ) - w1_fp8, w1_scale = paddle.incubate.nn.functional.fp8.fp8_quant_blockwise( - w1, - quant_method="128x128", - input_transpose=False, - output_scale_transpose=False, - ) - dx = paddle.empty([do1_fp8.shape[0], w1_fp8.shape[0]], do1.dtype) - deep_gemm.gemm_fp8_fp8_bf16_nt((do1_fp8, do1_scale.T), (w1_fp8, w1_scale), dx) - if len(x_orig_shape) > 2: - dx = dx.reshape([x_orig_shape[0], -1, dx.shape[-1]]) - - dw1 = fp8_gemm( - x_t_fp8, - x_t_scale, - do1_t_fp8, - do1_t_scale, - True, - True, - rtn_dtype=paddle.float32, - ) - return dx, dw1, dw2 - - -class Fp8FusedMlp(paddle.nn.Layer): - """ - PaddlePaddle Layer implementing FP8 fused multi-layer perceptron (MLP). - - This layer combines: - - FP8 precision matrix operations for improved performance - - Fused MLP architecture with SWiGLU activation - - Memory-efficient training through custom PyLayer implementation - - """ - - def __init__(self, config): - """ - Initializes the FP8 Fused MLP layer. 
- - Args: - config (object): Configuration object containing: - - hidden_size (int): Dimension of the input/output features - - intermediate_size (int): Dimension of the intermediate features - - Note: - - Weights are initialized using Paddle's create_parameter - - Uses bfloat16 precision for weight storage - - No bias terms are used in this implementation - """ - - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - - self.w1 = self.create_parameter( - shape=[self.hidden_size, self.intermediate_size * 2], - dtype="bfloat16", # Using Paddle's bfloat16 dtype - is_bias=False, # Paddle-specific parameter attribute - ) - self.w2 = self.create_parameter( - shape=[self.intermediate_size, self.hidden_size], - dtype="bfloat16", - is_bias=False, - ) - - def forward(self, x): - """ - Forward pass of the FP8 fused MLP layer. - - Args: - x (Tensor): Input tensor - - Returns: - Tensor: Output tensor after MLP transformation - """ - return Fp8FusedMlpFunc.apply(x, self.w1, self.w2) diff --git a/examples/pre-training/models/moe/moe_layer_auto_utils.py b/examples/pre-training/models/moe/moe_layer_auto_utils.py index ee36fad5..f9ad5995 100644 --- a/examples/pre-training/models/moe/moe_layer_auto_utils.py +++ b/examples/pre-training/models/moe/moe_layer_auto_utils.py @@ -14,15 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""_summary_ - -Returns: - _type_: _description_ -""" from typing import Tuple, List, Optional import logging from collections import namedtuple -from functools import partial import inspect import paddle @@ -39,7 +33,6 @@ import paddle.distributed as dist from paddle import Tensor -from paddleformers.utils.tools import get_env_device from models.moe.top2_gate_auto_auto import ( TopKGateFused, @@ -53,66 +46,42 @@ from models.comm_utils import profile -from models.moe.moe_utils import MOEAllGatherDispatcher - from paddle.incubate.nn.functional import ( moe_combine, ) -try: - from paddle.incubate.nn.functional import ( - moe_gate_dispatch_and_quant, - ) -except ImportError: - moe_gate_dispatch_and_quant = None try: from src.utils.misc import global_training_logs except ModuleNotFoundError: - global_training_logs = {} + global_training_logs = {} try: import moe_router_loss_ops except ImportError: moe_router_loss_ops = None +logger = logging.getLogger(__name__) + + try: - from paddle import scatter_add_ + import moe_ops except ImportError: - scatter_add_ = None + moe_ops = None + logger.warning( + "`moe-ops` not found, run " + "`python3 src/ernie_core/ops/moe/setup.py install` to install" + ) try: - from bincount_ops import int_bincount + import moe_ops_fp8 except ImportError: - int_bincount = None - -logger = logging.getLogger(__name__) - -if get_env_device() == "xpu": - try: - from paddle_xpu_nn import moe_gate_dispatch as xpu_moe_gate_dispatch - except ImportError: - xpu_moe_gate_dispatch = None - logger.warning("`xpu moe dispatch` not found") -else: - try: - import moe_ops - except ImportError: - moe_ops = None - logger.warning( - "`moe-ops` not found, run " - "`python3 src/ernie_core/ops/moe/setup.py install` to install" - ) - - try: - import moe_ops_fp8 - except ImportError: - moe_ops_fp8 = None - logger.warning( - "`moe-ops` not found, run " - "`python3 src/ernie_core/ops/moe/setup_fp8.py install` to install" - ) + moe_ops_fp8 = None + logger.warning( + 
"`moe-ops` not found, run " + "`python3 src/ernie_core/ops/moe/setup_fp8.py install` to install" + ) try: from moe_combine import moe_combine_no_weight @@ -145,18 +114,9 @@ class GateCombine_ori(PyLayer): - """GateCombine_ori""" @staticmethod def forward(ctx, x, combine_weights, scatter_index): - """ - Input: - x: [seqlen * k, hidden_size] - combine_weights: [seqlen, k] - scatter_index: [seqlen, k] - Output: - y: [seqlen, hidden_size] - """ ctx.x = x ctx.combine_weights = combine_weights ctx.scatter_index = scatter_index @@ -166,17 +126,6 @@ def forward(ctx, x, combine_weights, scatter_index): @staticmethod def backward(ctx, grad_y, *_): - """ - Input: - grad_y: [seqlen, hidden_size] - combine_weights: [seqlen, k] - scatter_index: [seqlen, k] - Output: - grad_x: [seqlen * k, hidden_size] - grad_combine_weight: [seqlen, k] - - """ - assert moe_combine is not None grad_x, grad_combine_weight_helper = moe_combine.moe_combine_bwd( ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y @@ -187,15 +136,7 @@ def backward(ctx, grad_y, *_): def combining_fused(x, combine_weights, scatter_index, hard_gate=False): - """ - Args: - x: Tensor[seq, dim] - combine_weights: [s, k] - scatter_index: ** [k, s] ** - - Returns: - y: Tensor[s, dim] - """ + if hard_gate: x_gatherd = F.embedding(scatter_index, x) # [s,k,dim] return x_gatherd.squeeze(-2) @@ -215,55 +156,6 @@ def recompute_fwd_gate_up_func(config, layer_idx): return False -class MoEStatics(nn.Layer): - - def __init__(self, config, layer_idx): - super().__init__() - self._cast_to_low_precision = False - self._cast_to_low_precison = False - num_experts = ( - config.moe_num_experts[0] - if config.multimodel_experts - else config.moe_num_experts - ) - if config.multimodel_experts: - assert ( - len(set(config.moe_num_experts)) == 1 - ), "assume expert group has same size, got: {config.moe_num_experts}" - - with paddle.utils.unique_name.guard(f"mm_layer_{layer_idx}_"): - num_experts_groups = ( - len(config.moe_num_experts) if 
config.multimodel_experts else 1 - ) - p = self.create_parameter( - shape=[num_experts_groups, num_experts], - dtype="float32", - is_bias=True, - attr=paddle.ParamAttr( - name=paddle.utils.unique_name.generate("corr_bias") - ), - ) - - p.stop_gradient = False - self.e_score_correction_bias = p - self.e_score_correction_bias.is_distributed = True - self.e_score_correction_bias.unused_param = True - if getattr(config, "build_skip_comm_buffer", False): - self.e_score_correction_bias.color = { - "color": "skip_comm", - "group": paddle.distributed.new_group( - [paddle.distributed.get_rank()] - ), - } - p = paddle.zeros( - shape=[num_experts_groups, num_experts], - dtype="int64", - ) - p.stop_gradient = True - self.expert_usage = p - # self.expert_usage.is_distributed = True - - def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): output = None @@ -310,7 +202,6 @@ def combining(x, combine_weights, scatter_index): def fuse_logging(gate_logits, combine_weights, token_type_ids): - """fuse_logging""" with paddle.no_grad(): gate_expert_per_token_type_0, gate_expert_per_token_type_1 = None, None gate_experts_per_token = None @@ -336,33 +227,12 @@ def fuse_logging(gate_logits, combine_weights, token_type_ids): ) -class GateCombine(PyLayer): - @staticmethod - def forward(ctx, x, combine_weights, scatter_index): - ctx.x = x - ctx.combine_weights = combine_weights - ctx.scatter_index = scatter_index - ret = moe_combine(x, combine_weights, scatter_index) - return ret - - @staticmethod - def backward(ctx, grad_y, *_): - # assert moe_combine is not None - grad_x, grad_combine_weight_helper = paddle._C_ops.moe_combine_grad( - ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y - ) - grad_combine_weight = grad_combine_weight_helper.sum(-1) - return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None - - class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer): - """Fp8MoeGateDispatchAndQuant""" @staticmethod def forward( ctx, x, gate_logtis, 
corr_bias, k, capacity, use_pad, use_pow2_scale=True ): - """forward""" ( out_fp8, scale, @@ -409,7 +279,6 @@ def forward( @staticmethod def backward(ctx, *grads): - """backward""" out_grad, combine_weights_grad = grads[0], grads[1] x_grad, gate_logits_grad = moe_ops.moe_gate_dispatch_bwd( ctx.combine_weights, @@ -428,15 +297,10 @@ def backward(ctx, *grads): class AlltoAll(PyLayer): - """ - AlltoAll w/ backward - """ @staticmethod def forward(ctx, x, group, sync_op=True): - """ - All-to-all communication in the group. - """ + ctx.group = group if dist.get_world_size(group) <= 1: return x @@ -452,20 +316,15 @@ def forward(ctx, x, group, sync_op=True): @staticmethod def backward(ctx, *dx): - """backward""" return AlltoAll.apply(*dx, group=ctx.group) class AlltoAllExpertOverlap(PyLayer): - """ - AlltoAllExpertOverlap w/ backward - """ @staticmethod def forward( ctx, input, group, num_local_experts, forward_func_dict, is_first_fwd=False ): - """forward""" assert ( dist.get_world_size(group) > 1 ), "AlltoAllExpertOverlap is not supported for a world size less than or equal to 1." @@ -502,7 +361,6 @@ def forward( @staticmethod def backward(ctx, out_grad): - """backward""" all2all_tasks = [] expert_outputs = [] @@ -524,24 +382,10 @@ def backward(ctx, out_grad): class AlltoAllAsync(PyLayer): - """ - AlltoAll async w/ backward - """ @staticmethod def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): - """ - All-to-all communication in the group. 
- Args: - x: Tensor - args: List[Any], argument(s) to `fn` - group: ProcessGroup - fn: callable, called while doing alltoall - is_first_fwd: if using recompute, don't record bacward when first forward - Returns: - x: Tensor - fn_out: List[Tensor] - """ + assert fn is not None, "use AlltoAll no async" ctx.group = group if dist.get_world_size(group) <= 1: @@ -563,7 +407,6 @@ def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): @staticmethod def backward(ctx, dx_out, *fn_out_grads): - """backward""" if dist.get_world_size(ctx.group) <= 1: fn_args_grads = ctx.bwf(*fn_out_grads) return (dx_out,) + fn_args_grads @@ -583,30 +426,10 @@ def backward(ctx, dx_out, *fn_out_grads): return (dx,) + fn_args_grads -def bpr_preprocess(input, logits, capacity, buffer): - """impletment bpr sorting""" - assert input.ndim == 2, input.shape - idx = paddle.argsort(logits.max(-1), axis=0, descending=True) - input = input[idx] - logits = logits[idx] - buffer["idx"] = idx - return input, logits - - -def bpr_postprocess(output, buffer): - """bpr sorting""" - idx = buffer.pop("idx") - rev_idx = paddle.argsort(idx) - output = output[rev_idx] - return output - - class FusedNormGateFunc(paddle.autograd.PyLayer): - """recompute of postnorm and gate""" @staticmethod def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps): - """doc""" ctx.dtype = paddle.float32 norm_output, invar = fused.fused_rms_norm(x, rms_norm_weight, eps) with paddle.amp.auto_cast(False): @@ -620,11 +443,8 @@ def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps): @staticmethod def backward(ctx, d_gate_logits, d_norm_output): - """doc""" x, rms_norm_weight, moe_gate_weight, eps = ctx.saved_tensor() - # recompute rmsnorm norm_output, invar = fused.fused_rms_norm(x, rms_norm_weight, eps) - # with paddle.amp.auto_cast(False): d_norm_output_linear, d_moe_gate_weight = matmul_bwd( cast_if_needed(norm_output, ctx.dtype), cast_if_needed(moe_gate_weight, ctx.dtype), @@ -643,54 +463,7 @@ def backward(ctx, 
d_gate_logits, d_norm_output): return dx, d_rms_norm_weight, d_moe_gate_weight -class FusedNormGateMoe(paddle.nn.Layer): - """recompute of postnorm and gate""" - - def __init__(self, gate, rms_norm_weight, eps) -> None: - """doc""" - super().__init__() - self.rms_norm_weight = rms_norm_weight - self.gate = gate - self.eps = eps - - def forward(self, x): - """doc""" - moe_gate_weight = self.gate.get_gate_weight(True) - capacity = self.gate.get_capacity(x.shape[0]) - - router_loss = paddle.zeros([1], dtype="float32") - router_loss.stop_gradient = False - - gate_logits, norm_output = FusedNormGateFunc.apply( - x, self.rms_norm_weight, moe_gate_weight, self.eps - ) - return gate_logits, capacity, router_loss, norm_output - - class MOELayer(nn.Layer): - """MOELayer module which implements MixtureOfExperts as described in Gshard_. - :: - - gate = Top2Gate(model_dim, num_experts) - - moe = MOELayer(gate, expert) - output = moe(input) - l_aux = moe.l_aux - - .. Gshard_: https://arxiv.org/pdf/2006.16668.pdf - - Args: - gate (paddle.nn.Layer): - gate network - expert (paddle.nn.LayerList): - expert network, LayerList 长度是 per_device 上的 expert 数。 - group (paddle.ProgressGroup) - recompute: 启用MOE内recomupte - Returns: - output - combine_weight - router-loss - """ def __init__( self, @@ -789,17 +562,7 @@ def __init__( assert 0, "no supported, checkout earylier code" assert self.num_local_experts == 1 - if enable_bpr: - logger.info("using BPR") - prepost_process_buffer = {} - self.input_preprocess = partial( - bpr_preprocess, buffer=prepost_process_buffer - ) - self.output_postprocess = partial( - bpr_postprocess, buffer=prepost_process_buffer - ) - else: - self.input_preprocess = self.output_postprocess = None + self.input_preprocess = self.output_postprocess = None self.group_experts = group_experts self.config = self.gate.config self.zero = paddle.to_tensor(0, dtype=paddle.float32) @@ -828,21 +591,8 @@ def __init__( p, "color", {"color": "moe_expert", "group": moe_grad_group} ) 
- def add_gate_recompute_func(self, post_norm_weight, post_norm_eps): - """Add FusedNormGateMoe recompute function""" - self.config.use_norm_gate_recompute = True - self.fused_norm_gate = FusedNormGateMoe( - self.gate, post_norm_weight, post_norm_eps - ) - def forward_experts(self, dispatched_input): - """ - call experts sequently - Args: - dispatched_input: Tensor[num_experts, capacity, dim] - Returns: - expert_output: Tensor[num_experts, capacity, dim] - """ + with profile("fwd-expert"): dispatched_input = dispatched_input.reshape( [ @@ -944,9 +694,7 @@ def fused_gate_logits_process( return prob, max_prob def gate_distpach_and_quant(self, input, token_type_ids): - """ - gate_distpach_and_quant - """ + assert isinstance(self.gate, (TopKGateFused)), "Only fused gate is supported." assert not self.config.use_ep_comm_overlap, "ep_comm_overlap is not supported" assert ( @@ -994,7 +742,6 @@ def gate_distpach_and_quant(self, input, token_type_ids): input, prob, corr_bias, k=k, capacity=capacity, use_pad=True ) - # TODO(zhangyuqin): 把这些代码封装起来, 增强代码复用 dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0))) if self.use_correction_bias: if self.gate.config.multimodel_experts: @@ -1009,7 +756,7 @@ def gate_distpach_and_quant(self, input, token_type_ids): scatter_index.stop_gradient = True dispatch_mask.stop_gradient = True - scatter_index = scatter_index.transpose([1, 0]) # [k,s] ->[s,k] + scatter_index = scatter_index.transpose([1, 0]) if self.group_experts: if max_prob is not None: if token_type_ids is not None: @@ -1057,18 +804,7 @@ def reshape_for_a2a(tensor): ) def gate_and_distpach(self, input, token_type_ids): - """ - calc gate and dispatch inputs (and do logging, optionaly) - Args: - input: Tensor[seq, dim], float - token_type_ids: Tensor[seq], int - Returns: - dispatched_input: Tensor[num_experts, capacity, dim] - combine_weights: [seq, k] - scatter_index: [seq, k] - router_loss: scalar - gate_logits: [seq, num_experts] - """ + seqlen, d_model = input.shape args = 
() if token_type_ids is not None: @@ -1120,77 +856,68 @@ def gate_and_distpach(self, input, token_type_ids): # capacity no use k = self.k prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids) - if get_env_device() == "xpu": - assert xpu_moe_gate_dispatch is not None - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - ) = xpu_moe_gate_dispatch(input, prob, k, capacity, True) - else: - assert moe_ops is not None - with profile("dispatch_op"): - if ( - "corr_bias" - in inspect.signature(moe_ops.moe_gate_dispatch).parameters - ): - if self.use_correction_bias: - compat_args = (self.moe_statics.e_score_correction_bias[0],) - else: - compat_args = (None,) - else: - assert ( - not self.use_correction_bias - ), "correction bias not supported, rebuild moe-ops" - compat_args = () - if not self.config.use_ep_comm_overlap: - if self._rr_moe_gate_dispatch is None: - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - ) = moe_ops.moe_gate_dispatch( - input, - prob, - *compat_args, - k=k, - capacity=capacity, - use_pad=True, - ) - else: - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - ) = self._rr_moe_gate_dispatch( - input, - prob, - compat_args, - k=k, - capacity=capacity, - use_pad=True, - ) + + assert moe_ops is not None + with profile("dispatch_op"): + if ( + "corr_bias" + in inspect.signature(moe_ops.moe_gate_dispatch).parameters + ): + if self.use_correction_bias: + compat_args = (self.moe_statics.e_score_correction_bias[0],) else: + compat_args = (None,) + else: + assert ( + not self.use_correction_bias + ), "correction bias not supported, rebuild moe-ops" + compat_args = () + if not self.config.use_ep_comm_overlap: + if self._rr_moe_gate_dispatch is None: ( dispatched_input, combine_weights_unnorm, scatter_index, dispatch_mask, _, - ) = moe_ops.moe_gate_dispatch_permute( + ) = moe_ops.moe_gate_dispatch( input, prob, *compat_args, 
k=k, capacity=capacity, - world_size=self.group.nranks, + use_pad=True, + ) + else: + ( + dispatched_input, + combine_weights_unnorm, + scatter_index, + dispatch_mask, + _, + ) = self._rr_moe_gate_dispatch( + input, + prob, + compat_args, + k=k, + capacity=capacity, + use_pad=True, ) + else: + ( + dispatched_input, + combine_weights_unnorm, + scatter_index, + dispatch_mask, + _, + ) = moe_ops.moe_gate_dispatch_permute( + input, + prob, + *compat_args, + k=k, + capacity=capacity, + world_size=self.group.nranks, + ) dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0))) if self.use_correction_bias and framework._dygraph_tracer()._has_grad: if self.gate.config.multimodel_experts: @@ -1525,17 +1252,8 @@ def calc_router_loss_and_logging( return router_loss def combine_expert_output(self, expert_output, combine_weights, scatter_index): - """ - Combine Expert output - Args: - expert_output: Tensor[num_experts, caapcity, dim] - combine_weights: - Returns: - combined_output: Tensor[seqlen, dim] - """ - expert_output = expert_output.reshape( - [-1, expert_output.shape[-1]] - ) # [e*1,c,m] + + expert_output = expert_output.reshape([-1, expert_output.shape[-1]]) use_fuse = isinstance(self.gate, (TopKGateFused)) combine_fn = combining_fused if use_fuse else combining combined_output = combine_fn(expert_output, combine_weights, scatter_index) @@ -1545,58 +1263,15 @@ def combine_expert_output(self, expert_output, combine_weights, scatter_index): return combined_output def forward_single_stage(self, dispatched_input, stage_id): - """forward_single_stage""" assert isinstance(self.experts, nn.LayerList) return self.experts[stage_id](dispatched_input) - def all2all_expert_overlap(self, x, group): - """all2all_expert_overlap""" - all2all_tasks = [] - all2all_ins = paddle.unbind(x, axis=0) - for stage_id in range(1): - stage_input = all2all_ins[stage_id] - x_out, task = AlltoAll.apply(stage_input, group=self.group, sync_op=False) - all2all_tasks.append((task, x_out)) - - 
expert_outputs = [] - for stage_id in range(self.num_local_experts): - if stage_id + 1 != self.num_local_experts: - stage_input = all2all_ins[stage_id + 1] - x_out, task = AlltoAll.apply( - stage_input, group=self.group, sync_op=False - ) - all2all_tasks.append((task, x_out)) - - task, dispatched_input = all2all_tasks[stage_id] - task.wait() - expert_outputs_cur_stage = ( - recompute(self.forward_single_stage, dispatched_input, stage_id) - if self.recompute and self.training - else self.forward_single_stage(dispatched_input, stage_id) - ) - expert_outputs.append(expert_outputs_cur_stage) - - expert_output = paddle.stack(expert_outputs, axis=1) - return expert_output - def forward( self, input: Tensor, token_type_ids=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: - """ - Args: - input (`Tensor`): The input data with shape ``(s, d)``. - Only one token is supported for now. - token_type_ids (`Tensor`) int64 tensor with shape (s), - if specified, rount tensor according to `token_type_ids`. - Returns: - output (`Tensor`): The final output tensor with shape ``(s, d)`` where ``m`` is the - size of model parameters. - combine_weights (`Tensor`, optional): A tensor with shape ``(s,)``, which represents weights - for each expert in MoE. - router_loss (`Tensor`, optional): A scalar tensor representing the loss of routing function. 
- """ + if input.ndim == 3: orig_shape = input.shape input = input.reshape([-1, input.shape[-1]]) @@ -1777,311 +1452,3 @@ def forward( orig_shape[:-1] + [combined_output.shape[-1]] ) return combined_output, combine_weights, router_loss2, gate_logits - - -class MOEInferLayer(nn.Layer): - - def __init__( - self, - gate: nn.Layer, - experts: List[nn.Layer], - group: Group = None, - recompute=False, - ) -> None: - - super().__init__() - self.gate = gate - self.recompute = recompute - logger.info(f"using infer moe recompute={recompute}") - for p in self.gate.parameters(): - p.is_gate = True - if type(experts) == nn.LayerList: - self.experts = experts - else: - self.experts = nn.LayerList([experts]) - self.group = group - for p in experts.parameters(): - p.expert = True # type: ignore - p.no_sync = True - - self.world_size = dist.get_world_size(self.group) - self.rank = dist.get_rank(self.group) - - if self.world_size < 1: - self.world_size = 1 - if self.rank < 0: - self.rank = 0 - self.num_local_experts = len(self.experts) - - def forward( - self, - input: Tensor, - token_type_ids=None, - ) -> Tensor: - """_summary_ - - Args: - input (Tensor): _description_ - - Returns: - Tensor: _description_ - """ - # assert len(input) == 1, "only single input Tensor supported" - if input.ndim == 3: - orig_shape = input.shape - input = input.reshape([-1, input.shape[-1]]) - else: - orig_shape = None - assert ( - len(input.shape) == 2 - ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}" - - # Implement Algorithm 2 from GShard paper. - seqlen, d_model = input.shape - - # Reshape into S tokens by dropping sequence dimension. 
- # reshaped_input = input.reshape(-1, d_model) - # assert reshaped_input.shape[0] % len(self.experts) == 0, - # f'num tokens must be order of number of local experts, {input[0].shape[0]} vs {len(self.experts)}' - def fwdfn(dispatched_input): - chunks = dispatched_input.unbind(1) - expert_outputs = [] - for chunk, expert in zip(chunks, self.experts): - expert_outputs += [expert(chunk)] - expert_output = paddle.stack(expert_outputs, axis=1) # [ecm] - return expert_output - - assert self.gate is not None - ( - capacity, - dispatch_mask, - combine_weights, - scatter_index, - router_loss, - ) = self.gate(input) - - dispatched_input = dispatching( - input, - dispatch_mask, - scatter_index, - num_experts=self.world_size * self.num_local_experts, - capacity=capacity, - ) - dispatched_input = dispatched_input.reshape( - [self.world_size * self.num_local_experts, capacity, d_model] - ) - # dispatched_input = _AllToAll.apply(dispatched_input, self.group) #[ecm] - dispatched_input = dispatched_input.reshape( - [self.world_size, self.num_local_experts, -1, d_model] - ) # [e,1,c,m] - dispatched_input = dispatched_input[ - self.rank : (self.rank + 1) - ] # [1, local_experts, c, m] - - expert_output = ( - recompute(fwdfn, dispatched_input) - if self.recompute and self.training - else fwdfn(dispatched_input) - ) - # expert_output = fwdfn(dispatched_input) - # expert_output = _AllToAll.apply(expert_output, self.group) #[ecm] - if self.world_size > 1: - tmp = [] - dist.all_gather(tmp, expert_output, group=self.group) - expert_output = paddle.concat(tmp, axis=0) - - expert_output = expert_output.reshape( - [self.world_size * self.num_local_experts * capacity, d_model] - ) # [e*1,c,m] - combined_output = combining(expert_output, combine_weights, scatter_index) - - # combined_output = paddle.einsum("sec,ecm->sm", combine_weights, expert_output) - if orig_shape: - combined_output = combined_output.reshape(orig_shape) - top1_gate_experts_per_token = ( - paddle.cast(dispatch_mask[0], 
dtype="float32").sum() / seqlen - ) - top2_gate_experts_per_token = ( - paddle.cast(dispatch_mask[1], dtype="float32").sum() / seqlen - ) - leakage_experts_per_token = ( - paddle.cast( - (~dispatch_mask[0]) & (~dispatch_mask[1]), dtype="float32" - ).sum() - / seqlen - ) - - experts_per_token = top1_gate_experts_per_token + top2_gate_experts_per_token - global_training_logs.update( - experts_per_token=experts_per_token.detach(), - top1_experts_per_token=top1_gate_experts_per_token.detach(), - top2_experts_per_token=top2_gate_experts_per_token.detach(), - leakage_experts_per_token=leakage_experts_per_token.detach(), - ) - return combined_output, combine_weights, router_loss, None - - -class MOELayerWithAllGatherDispatcher(MOELayer): - """ - MOELayer with allgather dispatcher. - """ - - def __init__( - self, - gate: nn.Layer, - experts: List[nn.Layer], - layer_idx, - shared_experts: Optional[List[nn.Layer]] = None, - group: Group = None, - recompute=False, - enable_logging: bool = False, - k=2, - enable_bpr: bool = False, - all_to_all_dropout=0, - group_experts=False, - ): - super(MOELayerWithAllGatherDispatcher, self).__init__( - gate=gate, - experts=experts, - layer_idx=layer_idx, - shared_experts=shared_experts, - group=group, - recompute=recompute, - enable_logging=enable_logging, - k=k, - enable_bpr=enable_bpr, - all_to_all_dropout=all_to_all_dropout, - group_experts=group_experts, - ) - logger.info("Using MOELayerWithAllGatherDispatcher") - assert get_env_device() == "xpu" - assert isinstance(self.gate, TopKGateFused) - assert self.shared_experts is not None - local_expert_indices_offset = self.rank * self.num_local_experts - self.expert_indices = [ - local_expert_indices_offset + i for i in range(self.num_local_experts) - ] - - def gate_and_distpach(self, input, token_type_ids): - """ - gate and dispatch - """ - args = () - - gate_logits, capacity, router_loss = self.gate(input, *args) - - if self.input_preprocess is not None: - input, gate_logits = 
self.input_preprocess(input, gate_logits, capacity) - - moe_allgather_dispatcher_return = MOEAllGatherDispatcher.token_dispatcher( - input, - gate_logits, - self.k, - self.expert_indices, - self.num_local_experts * self.world_size, - self.num_local_experts, - ) - global_hidden_states = moe_allgather_dispatcher_return.global_hidden_states - dispatched_input = moe_allgather_dispatcher_return.dispatched_input - combine_weights = moe_allgather_dispatcher_return.combine_weights - scatter_index = moe_allgather_dispatcher_return.scatter_index - gather_scatter_mask = moe_allgather_dispatcher_return.gather_scatter_mask - dispatch_mask = moe_allgather_dispatcher_return.dispatch_mask - tokens_per_expert = moe_allgather_dispatcher_return.tokens_per_expert - - dispatched_input.stop_gradient = False - combine_weights.stop_gradient = False - scatter_index.stop_gradient = True - gather_scatter_mask.stop_gradient = True - dispatch_mask.stop_gradient = True - - return ( - dispatched_input, - combine_weights, - gather_scatter_mask, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - global_hidden_states, - tokens_per_expert, - ) - - def forward_experts( - self, dispatched_input, global_hidden_states, tokens_per_expert - ): - """ - call moe experts and share experts - """ - tokens_per_expert_no_zero = list( - filter(lambda x: x != 0, tokens_per_expert.tolist()) - ) - chunks_per_expert = paddle.split( - dispatched_input, tokens_per_expert_no_zero, axis=0 - ) - assert len(chunks_per_expert) <= len(self.experts) - moe_output = [] - offset = 0 - for index, cur_tokens in enumerate(tokens_per_expert.tolist()): - if cur_tokens == 0: - offset += 1 - else: - cur_expert = self.experts[index] - cur_chunk = chunks_per_expert[index - offset] - moe_output.append(cur_expert(cur_chunk)) - hidden_states = paddle.concat(moe_output, axis=0) - shared_expert_out = self.shared_experts(global_hidden_states) - return hidden_states, shared_expert_out - - def forward(self, input, token_type_ids): 
- """ - forward function - """ - assert ( - len(input.shape) == 2 - ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}" - orig_shape = input.shape - global_shape = [orig_shape[0] * self.world_size, orig_shape[1]] - if token_type_ids is not None: - token_type_ids.stop_gradient = True - assert self.gate is not None - - ( - dispatched_input, - combine_weights, - gather_scatter_mask, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - global_hidden_states, - tokens_per_expert, - ) = self.gate_and_distpach(input, token_type_ids) - - expert_out, shared_out = ( - recompute( - self.forward_experts, - dispatched_input, - global_hidden_states, - tokens_per_expert, - ) - if self.recompute and self.training - else self.forward_experts( - dispatched_input, global_hidden_states, tokens_per_expert - ) - ) - combined_output = MOEAllGatherDispatcher.token_combine( - expert_out, - shared_out, - combine_weights, - scatter_index, - gather_scatter_mask, - global_shape, - ) - if self.shared_experts.down_proj.bias is not None: - combined_output = combined_output + self.shared_experts.down_proj.bias - router_loss2 = self.calc_router_loss_and_logging( - router_loss, combine_weights, dispatch_mask, gate_logits, token_type_ids - ) - - return combined_output, combine_weights, router_loss2, gate_logits diff --git a/examples/pre-training/models/moe/moe_utils_auto.py b/examples/pre-training/models/moe/moe_utils_auto.py index 546d1c8e..fbaba34f 100644 --- a/examples/pre-training/models/moe/moe_utils_auto.py +++ b/examples/pre-training/models/moe/moe_utils_auto.py @@ -14,198 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" moe utils for allgather dispatcher """ -import paddle + import paddle.distributed as dist from paddle.distributed import fleet -import paddle.nn.functional as F -from paddle import nn -from paddle.autograd import PyLayer - -from models.sequence_parallel_utils_auto import ( - AllGatherOp, - ReduceScatterOp, -) - - -class MOEGather(PyLayer): - """ - MOE Gather - """ - - @staticmethod - def forward(ctx, input_, map_): - """ - MOE Gather forward - """ - ctx.input_shape = input_.shape - ctx.map = map_ - return paddle.take_along_axis(input_, map_, 0) - - @staticmethod - def backward(ctx, grad_output): - """ - MOE Gather backward - """ - input_shape = ctx.input_shape - map_ = ctx.map - - output = paddle.zeros(input_shape, dtype=grad_output.dtype) - return paddle.put_along_axis(output, map_, grad_output, 0), None - - -class MOEScatter(PyLayer): - """ - MOE Scatter - """ - - @staticmethod - def forward(ctx, input_, map_, output_size=None): - """ - MOE Scatter forward - """ - ctx.map = map_ - - if output_size is not None: - output = paddle.zeros(output_size, dtype=input_.dtype) - else: - output = paddle.zeros_like(input_) - - return paddle.put_along_axis(output, map_, input_, 0) - - @staticmethod - def backward(ctx, grad_output): - """ - MOE Scatter backward - """ - map_ = ctx.map - return paddle.take_along_axis(grad_output, map_, 0), None - - -class AllgatherDispatcherReturn(object): - """ - MOE allgather dispatcher return value - """ - - def __init__( - self, - global_hidden_states, - dispatched_input, - combine_weights, - scatter_index, - gather_scatter_mask, - dispatch_mask, - tokens_per_expert, - ): - self.global_hidden_states = global_hidden_states - self.dispatched_input = dispatched_input - self.combine_weights = combine_weights - self.scatter_index = scatter_index - self.gather_scatter_mask = gather_scatter_mask - self.dispatch_mask = dispatch_mask - self.tokens_per_expert = tokens_per_expert - - -class MOEAllGatherDispatcher(nn.Layer): - """ - MOE with 
allgather dispatcher. - Contains two static methos. - MOEAllGatherDispatcher.token_dispatcher - MOEAllGatherDispatcher.token_combine - """ - - @staticmethod - def token_dispatcher( - hidden_states, - local_gate_logits, - top_k, - local_expert_indices, - num_moe_experts, - num_local_experts, - ): - """ - MOE token dispatcher with allgather - """ - seq_len = local_gate_logits.shape[0] - num_experts = local_gate_logits.shape[-1] - prob = F.softmax(local_gate_logits.reshape([seq_len, top_k, -1]), axis=-1) - max_prob = prob.max(-1, keepdim=True) - prob /= max_prob - prob = prob.reshape([-1, num_experts]) - - probs, scatter_index = paddle.topk(prob, top_k, axis=-1) - dispatch_mask = paddle.cumsum( - paddle.histogram(scatter_index.flatten(), bins=num_experts) - ) - - # dispatch - with paddle.no_grad(): - global_indices = AllGatherOp.apply(scatter_index) - global_local_mask = (global_indices >= local_expert_indices[0]) & ( - global_indices <= local_expert_indices[-1] - ) - local_indices = global_indices.masked_select(global_local_mask) - - global_hidden_states = AllGatherOp.apply(hidden_states) - global_probs = AllGatherOp.apply(probs) - - # get local hidden states - combine_weights = global_probs.masked_select(global_local_mask).cast( - dtype=hidden_states.dtype - ) - gather_scatter_mask = global_local_mask.nonzero()[:, 0] - gather_scatter_mask = paddle.reshape(gather_scatter_mask, shape=[-1, 1]) - gather_scatter_mask = paddle.expand( - gather_scatter_mask, shape=[-1, hidden_states.shape[-1]] - ) - local_hidden_states = MOEGather.apply(global_hidden_states, gather_scatter_mask) - - with paddle.no_grad(): - # The indices of local_indices that give its sorted order along dim 0. 
- scatter_index = paddle.argsort(local_indices, axis=0) - tokens_per_expert = paddle.bincount( - paddle.reshape(local_indices, [-1]), minlength=num_moe_experts - ) - if num_local_experts < num_moe_experts: - start = local_expert_indices[0] - end = local_expert_indices[-1] + 1 - tokens_per_expert = tokens_per_expert[start:end] - - scatter_index = paddle.reshape(scatter_index, shape=[-1, 1]) - scatter_index = paddle.expand( - scatter_index, shape=[-1, hidden_states.shape[-1]] - ) - - dispatched_input = MOEGather.apply(local_hidden_states, scatter_index) - - return AllgatherDispatcherReturn( - global_hidden_states, - dispatched_input, - combine_weights, - scatter_index, - gather_scatter_mask, - dispatch_mask, - tokens_per_expert, - ) - - @staticmethod - def token_combine( - expert_out, - shared_out, - combine_weights, - scatter_index, - gather_scatter_mask, - global_shape, - ): - """ - MOE token combine with reduce scatter - """ - expert_out = MOEScatter.apply(expert_out, scatter_index) - expert_out = expert_out * paddle.reshape(combine_weights, shape=[-1, 1]) - expert_out = MOEScatter.apply(expert_out, gather_scatter_mask, global_shape) - combine_out = expert_out + shared_out - combine_out = ReduceScatterOp.apply(combine_out) - return combine_out def get_flatten_mesh(mesh): diff --git a/examples/pre-training/models/moe/top2_gate_auto.py b/examples/pre-training/models/moe/top2_gate_auto.py index 79c2a3aa..a8aee34d 100644 --- a/examples/pre-training/models/moe/top2_gate_auto.py +++ b/examples/pre-training/models/moe/top2_gate_auto.py @@ -24,7 +24,6 @@ from paddle import Tensor import paddle.distributed as dist -# import paddle.nn.functional as F logger = logging.getLogger(__name__) diff --git a/examples/pre-training/models/moe/top2_gate_auto_auto.py b/examples/pre-training/models/moe/top2_gate_auto_auto.py index 6460c157..6ce094d2 100644 --- a/examples/pre-training/models/moe/top2_gate_auto_auto.py +++ b/examples/pre-training/models/moe/top2_gate_auto_auto.py @@ -14,10 
+14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -top2gate -""" - from typing import Tuple from functools import partial @@ -57,11 +53,9 @@ class CalOrthogonalLossOptEachWeightFunctor(paddle.autograd.PyLayer): - """CalOrthogonalLossOptEachWeightFunctor""" @staticmethod def forward(ctx, gate_weight, moe_k, use_group, eps=1e-12): - """forward""" if gate_weight.dtype != paddle.float32: gate_weight = gate_weight.astype(paddle.float32) ( @@ -83,7 +77,6 @@ def forward(ctx, gate_weight, moe_k, use_group, eps=1e-12): @staticmethod def backward(ctx, out_grad): - """backward""" gate_weight, wnorm, weight_scale, normed_weight, weight_matmul = ( ctx.saved_tensor() ) @@ -105,11 +98,9 @@ def backward(ctx, out_grad): class CalZLossFunctor(paddle.autograd.PyLayer): - """CalZLossFunctor""" @staticmethod def forward(ctx, logits, loss_mask=None, clip_min=1e-6): - """forward""" if loss_mask is not None: assert loss_mask.stop_gradient loss, max_logits, safe_sumexp, logsumexp_per_token = ( @@ -123,7 +114,6 @@ def forward(ctx, logits, loss_mask=None, clip_min=1e-6): @staticmethod def backward(ctx, out_grad): - """backward""" logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token = ( ctx.saved_tensor() ) @@ -142,7 +132,6 @@ def backward(ctx, out_grad): class CalAuxLossFunctor(paddle.autograd.PyLayer): - """CalAuxLossFunctor""" @staticmethod def forward( @@ -156,7 +145,6 @@ def forward( moe_k, clip_min=1e-6, ): - """forward""" if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: tokens_mask = tokens_mask.astype(gate_prob.dtype) loss, seqlen_float, ce = paddle.incubate.nn.functional.cal_aux_loss( @@ -177,7 +165,6 @@ def forward( @staticmethod def backward(ctx, out_grad): - """backward""" gate_prob, seqlen_float, ce = ctx.saved_tensor() num_experts = ctx.num_experts use_group = ctx.use_group @@ -190,7 +177,6 @@ def backward(ctx, out_grad): def cal_orthogonal_loss_opt_each_weight_func( weight, moe_k, 
use_group, eps, xpu_matmul=None, training=True ): - """cal_orthogonal_loss_opt_each_weight_func""" weight = weight.transpose([1, 0]).contiguous() # transpose weight here wnorm = weight.norm(axis=1) weight = weight / paddle.maximum(wnorm, eps).unsqueeze(1) @@ -212,7 +198,6 @@ def cal_orthogonal_loss_opt_each_weight_func( def cal_z_loss_func(logits, loss_mask): - """cal_z_loss_func""" if loss_mask is not None: loss_mask = loss_mask.astype(logits.dtype) l_zloss = (logits.logsumexp(1).square() * loss_mask).sum() / paddle.clip( @@ -235,7 +220,6 @@ def cal_aux_loss_func( rank=None, group=None, ): - """cal_aux_loss_func""" if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: tokens_mask = tokens_mask.astype(gate_prob.dtype) @@ -287,27 +271,10 @@ def masked_fill(x, mask, value): @paddle.no_grad() def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters: int = 10): - """ - Computes the optimal transport matrix and Slinkhorn distance using the - Sinkhorn-Knopp algorithm - - Inputs: - - M : cost matrix (n x m) - - r : vector of marginals (n, ) - - c : vector of marginals (m, ) - - lam : strength of the entropic regularization - - epsilon : convergence parameter - - Outputs: - - P : optimal transport matrix (n x m) - - dist : Sinkhorn distance - """ + n, _ = M.shape - # P = (- lam * M).exp() - # P /= P.sum() P = F.softmax(-M / lam) u = paddle.zeros(n, "float32") - # normalize this matrix for _ in range(max_iters): if (u - P.sum(1)).abs().max() < epsilon: break @@ -319,31 +286,22 @@ def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters: int = 1 def cast_if_needed(x, dtype): - """ - cast_if_needed - """ + return x.cast(dtype) if x.dtype != dtype else x class FusedGateDetachMatmul(paddle.autograd.PyLayer): - """ - FusedGateDetachMatmul - """ @staticmethod def forward(ctx, x, w): - """ - forward - """ + ctx.dtype = paddle.float32 ctx.save_for_backward(x, w) return F.linear(cast_if_needed(x, ctx.dtype), cast_if_needed(w, ctx.dtype)) 
@staticmethod def backward(ctx, y_grad): - """ - backward - """ + x, w = ctx.saved_tensor() assert ctx.dtype == y_grad.dtype, "dtype not match" x_g, w_g = matmul_bwd( @@ -357,9 +315,7 @@ def backward(ctx, y_grad): def gate_detach_matmul(x, weight, use_fuse, use_fake_gate=False): - """ - gate_detach_matmul - """ + if use_fuse: score = FusedGateDetachMatmul.apply(x, weight) else: @@ -372,20 +328,6 @@ def gate_detach_matmul(x, weight, use_fuse, use_fake_gate=False): class Top2Gate(nn.Layer): - """Gate module which implements Top2Gating as described in Gshard_. - :: - - gate = Top2Gate(model_dim, num_experts) - l_aux, combine_weights, dispatch_mask = gate(input) - - .. Gshard_: https://arxiv.org/pdf/2006.16668.pdf - - Args: - model_dim (int): - size of model embedding dimension - num_experts (ints): - number of experts in model - """ def __init__(self, config, layer_idx: int, group, gate_weight=None) -> None: @@ -551,18 +493,14 @@ def _create_gate_parameter(self): ) p.expert_type = f"expert_type_{i}" self.add_parameter( - ( - "weight" if i == 0 else f"weight_{i}" - ), + ("weight" if i == 0 else f"weight_{i}"), p, ) else: self.weight = self.create_parameter( shape=[self.model_dim, self.num_experts], dtype="float32", - attr=paddle.ParamAttr( - name=unique_name.generate("moe_gate") - ), + attr=paddle.ParamAttr(name=unique_name.generate("moe_gate")), ) logger.info(f"moe-Gate, {self.weight}") @@ -699,9 +637,7 @@ def forward( ) def get_capacity(self, num_tokens, cap_factor=None): - """ - return capcity - """ + num_experts = ( sum(self.num_experts) if self.config.multimodel_experts @@ -972,39 +908,18 @@ def _cal_aux_loss( if use_group is None: use_group = self.config.moe_group_experts - moe_router_loss_ops = None - if ( - moe_router_loss_ops is not None - and get_env_device() != "xpu" - and (tokens_mask is None or len(tokens_mask.shape) == 1) - and (tokens_mask is None or tokens_mask.shape[0] == gate_prob.shape[0]) - and (gate_prob.shape[0] >= gate_prob.shape[1]) - and (not 
self.global_aux_loss) - and (gate_prob.dtype == paddle.float32) - ): - return CalAuxLossFunctor.apply( - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - self.config.moe_k, - clip_min=1e-6, - ) - else: - return cal_aux_loss_func( - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - self.config.moe_k, - self.global_aux_loss, - self.rank if self.global_aux_loss else None, - self.group if self.global_aux_loss else None, - ) + return cal_aux_loss_func( + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + self.config.moe_k, + self.global_aux_loss, + self.rank if self.global_aux_loss else None, + self.group if self.global_aux_loss else None, + ) def _cal_z_loss(self, logits, loss_mask=None): @@ -1019,9 +934,7 @@ def _cal_z_loss(self, logits, loss_mask=None): return cal_z_loss_func(logits, loss_mask) def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): - """ - gate正交loss(优化版) - """ + if weight.dtype != paddle.float32: weight = weight.astype(paddle.float32) @@ -1044,9 +957,7 @@ def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): ) def _cal_orthogonal_loss(self, weight_id=None, use_group=None): - """ - gate正交Loss - """ + if use_group is None: use_group = ( self.config.moe_group_experts and self.config.moe_group_orthogonal_loss @@ -1073,7 +984,6 @@ def _cal_orthogonal_loss(self, weight_id=None, use_group=None): class TopKGateFused(Top2Gate): - """doc""" def forward( self, @@ -1081,16 +991,7 @@ def forward( token_type_ids=None, transform_weight=True, ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore - """ - Args: - input: paddle.Tensor, hidden-states of layer - token_type_ids: paddle.Tensor[Seqw], token_type_ids of input - transform_weight: bool, when using multimodal experts, perform `self.get_gate_weight` if specified - Retruns: - paddle.Tensor [Seq, Expert, Capacity]: float32, combine weights - 
paddle.Tensor [Seq, Expert, Capacity]: bool, dispatch mask - Tuple[paddle.Tensor]: `GateOutput` - """ + capacity = self.get_capacity(input.shape[0]) weight = self.get_gate_weight(transform_weight) with paddle.amp.auto_cast(False): diff --git a/examples/pre-training/models/sequence_parallel_utils_auto.py b/examples/pre-training/models/sequence_parallel_utils_auto.py index 4b80ca3f..408a7227 100644 --- a/examples/pre-training/models/sequence_parallel_utils_auto.py +++ b/examples/pre-training/models/sequence_parallel_utils_auto.py @@ -20,24 +20,15 @@ import paddle from paddle import distributed as dist -from paddle.nn import functional as F from paddle.autograd import PyLayer -from paddle.nn.layer.layers import Layer from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.distributed.fleet.utils.hybrid_parallel_util import ( - fused_allreduce_gradients_with_group, -) -from paddle.incubate.tensor.manipulation import create_async_load + from models.comm_utils import ( scatter, all_gather, reduce_scatter, - mp_slice, - all_gather_varlen, ) -from paddleformers.utils.tools import get_env_device from paddle.distributed import in_auto_parallel_align_mode @@ -45,7 +36,6 @@ try: from paddle.nn.functional import gemm_reduce_scatter, all_gather_gemm - import paddle.nn.functional.flux as flux except ImportError: gemm_reduce_scatter = None all_gather_gemm = None @@ -56,9 +46,7 @@ if not hasattr(paddle.Tensor, "contiguous"): def contiguous(self): - """ - Make the tensor contiguous. - """ + return self setattr(paddle.Tensor, "contiguous", contiguous) @@ -67,9 +55,6 @@ def contiguous(self): if not hasattr(paddle.Tensor, "_md5sum"): def _md5sum(self): - """ - Calculate the md5sum of the Tensor. 
- """ numpy_array = np.array(self) array_bytes = numpy_array.tobytes() return hashlib.md5(array_bytes).hexdigest() @@ -81,143 +66,10 @@ def get_hcg(): return fleet.get_hybrid_communicate_group() -async_loader = None - - -def get_async_loader(): - assert get_env_device() != "xpu" - global async_loader - """get_async_loader""" - if not hasattr(fleet.fleet, "_hcg"): - if async_loader is None: - async_loader = create_async_load() - return async_loader - - hcg = get_hcg() - if not hasattr(hcg, "async_loader"): - setattr(hcg, "async_loader", create_async_load()) - return hcg.async_loader - - -def hack_offload_wait(task): - """hack_offload_wait""" - task.cpu_wait() - - -def hack_reload_wait(task): - """hack_offload_wait""" - task.cuda_wait() - - -class _AllToAll(paddle.autograd.PyLayer): - @staticmethod - def forward( - ctx, - input, - group, - output_split_sizes=None, - input_split_sizes=None, - ): - """ - All-to-all communication in the group. - - Args: - ctx (Any): Context object. - input (Tensor): Input tensor. - group (Group): The group object. - - Returns: - Tensor: Output tensor. 
- """ - - ctx.group = group - ctx.input_split_sizes = input_split_sizes - ctx.output_split_sizes = output_split_sizes - # return input - if dist.get_world_size(group) <= 1: - return input - if input_split_sizes is None and output_split_sizes is None: - output = paddle.empty_like(input) - task = dist.stream.alltoall_single( - output, input, None, None, group, True, True - ) - task.wait() - else: - out_sizes = [sum(output_split_sizes)] - out_sizes.extend(input.shape[1:]) - output = paddle.empty(out_sizes, dtype=input.dtype) - task = dist.stream.alltoall_single( - output, - input, - output_split_sizes, - input_split_sizes, - group, - sync_op=False, - ) - task.wait() - return output - - @staticmethod - def backward(ctx, *grad_output): - """ - all-to-all backward - - """ - # return grad_output - if ctx.input_split_sizes is None and ctx.output_split_sizes is None: - return _AllToAll.apply(*grad_output, ctx.group) - else: - return _AllToAll.apply( - *grad_output, ctx.group, ctx.input_split_sizes, ctx.output_split_sizes - ) - - -class AllGatherVarlenOpV2(PyLayer): - - @staticmethod - def forward(ctx, input, indices, axis=0, group=None): - """fwd""" - ctx.axis = axis - ctx.group = group - ctx.indices = indices - return all_gather_varlen(input, indices, axis=axis, group=group) - - @staticmethod - def backward(ctx, grad): - """bwd""" - return mp_slice(grad, ctx.indices, axis=ctx.axis, group=ctx.group) - - -class SliceVarlenOp(PyLayer): - - @staticmethod - def forward( - ctx, - input, - indices, - group=None, - ): - """ - fwd - """ - ctx.indices = indices - ctx.group = group - ret = mp_slice(input, indices, group=ctx.group) - return ret - - @staticmethod - def backward(ctx, grad): - """ - bwd - """ - return all_gather_varlen(grad, axis=ctx.axis, group=ctx.group) - - class ScatterOp(PyLayer): @staticmethod def forward(ctx, input, axis=0, group=None): - """fwd""" ctx.axis = axis ctx.group = group return scatter(input, axis=axis, group=ctx.group) @@ -227,19 +79,10 @@ def 
backward(ctx, grad): return all_gather(grad, axis=ctx.axis, group=ctx.group) -SliceOp = ScatterOp # `ScatterOp` 的行为应该更像 Sclice - - class GatherOp(PyLayer): - """ - input shape: [s/n, b, h], n is mp parallelism - after forward shape: [s, b, h] - 行为类似`AllGather`,反向不会汇聚梯度,从MP 异步态,回到 MP 同步态。 - """ @staticmethod def forward(ctx, input, axis=0, group=None): - """fwd""" ctx.axis = axis ctx.group = group return all_gather(input, axis=axis, group=group) @@ -249,23 +92,13 @@ def backward(ctx, grad): return scatter(grad, axis=ctx.axis, group=ctx.group) -# All gather along the first dim during forward pass -# All reduce and scatter along the first dim during backward pass class AllGatherOp(PyLayer): - """ - input shape: [s/n, b, h], n is mp parallelism - after forward shape: [s, b, h] - 行为类似`AllGather`,反向会汇聚梯度,AllGather 完之后还是 MP 异步态。 - """ @staticmethod def forward(ctx, input, group=None): - """fwd""" ctx.group = group return all_gather(input, group=group) - # grad shape: [s, b, h], n is mp parallelism - # after forward shape: [s/n, b, h] @staticmethod def backward(ctx, grad): if in_auto_parallel_align_mode(): @@ -279,30 +112,21 @@ def backward(ctx, grad): return reduce_scatter(grad, group=ctx.group) -# All reduce and scatter along the first dim during forward pass -# All gather along the first dim during backward pass class ReduceScatterOp(PyLayer): - # input shape: [s, b, h], n is mp parallelism - # after forward shape: [s/n, b, h] @staticmethod def forward(ctx, input, group=None): - """fwd""" ctx.group = group return reduce_scatter(input, group=group) - # grad shape: [s/n, b, h], n is mp parallelism - # after forward shape: [s, b, h] @staticmethod def backward(ctx, grad): return all_gather(grad, group=ctx.group) class AllGatherVarlenOp(PyLayer): - """the shape of allgather can be not same for each rank""" @staticmethod def forward(ctx, input, group=None): - """ """ hcg = fleet.get_hybrid_communicate_group() if group is None: group = hcg.get_model_parallel_group() @@ -342,7 
+166,6 @@ def forward(ctx, input, group=None): @staticmethod def backward(ctx, grad): - """ """ input_shape = grad.shape input_shape[0] = ctx.max_shape0 * ctx.shape0_all.shape[0] output = paddle.zeros(shape=input_shape, dtype=grad.dtype) @@ -357,16 +180,10 @@ def backward(ctx, grad): class GemmReduceScatterOp(PyLayer): - """overlap gemm and reduce scatter""" @staticmethod def forward(ctx, input, weight, group): - """ - Args: input: Tensor[b * s, h / mp_size] - weight: Tensor[h / mp_size, h'] or Tensor[h', h / mp_size] - group: mp_group - Returns: output: Tensor[b * s / mp_size, h'] - """ + ctx.save_for_backward(input, weight) ctx.group = group output = gemm_reduce_scatter(input, weight, group) @@ -374,11 +191,6 @@ def forward(ctx, input, weight, group): @staticmethod def backward(ctx, grad): - """ - Args: grad: Tensor[b * s / mp_size, h'] - Returns: input_grad: Tensor[b * s, h / mp_size] - weight_grad: Tensor[h / mp_size, h'] or Tensor[h', h / mp_size] - """ input, weight = ctx.saved_tensor() group = ctx.group if input.stop_gradient and weight.stop_gradient: @@ -401,50 +213,7 @@ def backward(ctx, grad): return input_grad, weight_grad -class AllGatherGemmOp(PyLayer): - """overlap all gather and gemm""" - - @staticmethod - def forward(ctx, input, weight, group): - """ - Args: input: Tensor[b * s / mp_size, h] - weight: Tensor[h, h' / mp_size] or Tensor[h' / mp_size, h] - group: mp_group - Returns: output: Tensor[b * s, h' / mp_size] - """ - output, input_parallel = all_gather_gemm( - input, weight, group, deepcopy_input_parallel=True - ) - ctx.save_for_backward(input_parallel, weight) - ctx.group = group - ctx.input_stop_gradient = input.stop_gradient - return output - - @staticmethod - def backward(ctx, grad): - """ - Args: grad: Tensor[b * s, h' / mp_size] - Returns: input_grad: Tensor[b * s / mp_size, h] - weight_grad: Tensor[h, h' / mp_size] or Tensor[h' / mp_size, h] - """ - input_parallel, weight = ctx.saved_tensor() - group = ctx.group - if 
ctx.input_stop_gradient and weight.stop_gradient: - return None, None - if ctx.input_stop_gradient: - input_grad = None - else: - input_grad = gemm_reduce_scatter(grad, weight, group) - if weight.stop_gradient: - weight_grad = None - else: - weight_grad = paddle.matmul(input_parallel, grad, transpose_x=True) - - return input_grad, weight_grad - - def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): - """allgather sparse label and return sparse idx""" hcg = fleet.get_hybrid_communicate_group() group = hcg.get_model_parallel_group() labels = labels.flatten() @@ -458,453 +227,3 @@ def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): labels_local_gather = paddle.take_along_axis(labels_local, tgt_index, axis=0) labels_all_gather = AllGatherVarlenOp.apply(labels_local_gather) return labels_all_gather, tgt_index.reshape([-1, 1]) - - -################################################### -# # -# Modified Parallel Linear Operator # -# # -################################################### - - -def mark_as_sequence_parallel_parameter(parameter): - setattr(parameter, "sequence_parallel", True) - - -def is_sequence_parallel_parameter(parameter): - return getattr(parameter, "sequence_parallel", False) - - -def create_fused_allreduce_gradient_hook(parameter_list, accumulation_steps): - hcg = get_hcg() - group = hcg.get_model_parallel_group() - - step = [0] - accumulation_steps *= len(parameter_list) - - def __impl__(grad): - step[0] += 1 - if step[0] == accumulation_steps: - step[0] = 0 - fused_allreduce_gradients_with_group(parameter_list, group=group, scale=1.0) - return grad - - return __impl__ - - -def create_non_fused_allreduce_gradient_hook(param, model, verbose=False): - - hcg = get_hcg() - pg = hcg.get_model_parallel_group().process_group - step = [0] - - @paddle.autograd.no_grad() - def __impl__(): - step[0] += 1 - # if accumulation_steps is None: - accumulation_steps = model.accumulate_steps - if verbose: - logger.info( - f'hook called: 
acc-step={step[0]}/{accumulation_steps}, use_main_grad={hasattr(param, "main_grad")}' - ) - if (step[0] % accumulation_steps) == 0: - step[0] = 0 - if hasattr(param, "main_grad"): - pg.allreduce(param.main_grad).wait() - else: - pg.allreduce(param.grad).wait() - - return __impl__ - - -def register_sequence_parallel_allreduce_hooks( - model, fuse_sequence_parallel_allreduce=False -): - logger.warning( - "DO NOT use sphook unless your PyLayer does not trigger param backward hook" - ) - mp_group = get_hcg().get_model_parallel_group() - if mp_group.nranks <= 1: - return - - params = [] - for n, p in model._layers.named_parameters(): - if is_sequence_parallel_parameter(p): - logger.info(f"register bw hook for:{n}") - params.append(p) - logger.info(f"#-sp-sync param:{len(params)}") - - if fuse_sequence_parallel_allreduce: - raise NotImplementedError() - else: - for i, p in enumerate(params): - if p.stop_gradient: - continue - hook = create_non_fused_allreduce_gradient_hook(p, model, verbose=False) - p._register_backward_hook(hook) - - -def is_fused_matmul_bias_supported(): - if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): - import_module_error = False - try: - from paddle.base import core - except ModuleNotFoundError: - logger.warning( - "Unable to import paddle.base, are you using paddle latest build?" - ) - import_module_error = True - - if import_module_error: - try: - from paddle.fluid import core - except ModuleNotFoundError: - logger.warning( - "Unable to import paddle.fluid, are you using paddle latest build?" 
- ) - return False - return hasattr(core.eager.ops.legacy, "fused_gemm_epilogue") - else: - return False - - -class ColumnSequenceParallelLinear(Layer): - def __init__( - self, - in_features, - out_features, - weight_attr=None, - has_bias=None, - gather_output=True, - fuse_matmul_bias=False, - mp_group=None, - use_rr=False, - name=None, - use_comm=True, - use_tpsp_comm_overlap=False, - ): - super(ColumnSequenceParallelLinear, self).__init__() - - hcg = get_hcg() - self.model_parallel_group = ( - hcg.get_model_parallel_group() if mp_group is None else mp_group - ) - self.world_size = ( - hcg.get_model_parallel_group().nranks - if mp_group is None - else mp_group.nranks - ) - self._name = name - self.is_mp = self.world_size > 1 - self.use_comm = use_comm - if not self.use_comm: - assert not use_rr, "The moe allgather not compatibale with rr for now." - logger.warning( - "ColumnSequenceParallelLinear will NOT call ANY comm, " - "this feature is only used for XPU moe allgather dispatcher. " - "If this is not your purpose, please unset XPU_MOE_USE_ALLGATHER." 
- ) - - self.use_tpsp_comm_overlap = use_tpsp_comm_overlap - if self.use_tpsp_comm_overlap: - assert all_gather_gemm is not None - assert flux is not None - - assert ( - gather_output is False - ), "If sequence_parallel is True, \ - gather_output is False" - - self.gather_output = gather_output - assert out_features % self.world_size == 0, ( - "Number of column of the weight for linear ({}) must be" - " divisible by model parallel size ({})".format( - out_features, self.world_size - ) - ) - self.output_size_per_partition = out_features // self.world_size - - self._weight_attr = weight_attr - self._dtype = self._helper.get_default_dtype() - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) - else: - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) - - self.weight.is_distributed = True if self.is_mp else False - if self.weight.is_distributed: - self.weight.split_axis = 1 - - if has_bias: - # initialize bias to zero like Megatron - self.bias = self.create_parameter( - shape=[self.output_size_per_partition], - attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype, - is_bias=True, - ) - self.bias.is_distributed = True if self.is_mp else False - if self.bias.is_distributed: - self.bias.split_axis = 0 - else: - self.bias = None - - self.linear = F.linear - - if self.use_tpsp_comm_overlap and self.is_mp and self.use_comm: - self._rr_column_comm_ln = None - - self._rr_column_ln = None - - if fuse_matmul_bias: - if not is_fused_matmul_bias_supported(): - raise NotImplementedError( - "You set fuse_matmul_bias=True in ColumnSequenceParallelLinear, " - "however, the paddle you are using not support this operation. 
" - "Please set fuse_matmul_bias=False or use paddle compiled " - "with cuda 11.6 or higher." - ) - from paddle.incubate.nn.functional import fused_linear - - self.linear = fused_linear - - def forward(self, x, use_comm=True): - """ - Args: - x: Tensor:[seq/mp, dim]: input tensor: - use_comm: bool, skip all gahther set to false - """ - # sequence parallelism is same as model parallelism - # if sequence parallel is true, input shape is [s, b, h] - # else input shape is [b, s, h] - if ( - self.use_tpsp_comm_overlap - and self.is_mp - and (use_comm and self.use_comm) - and flux.all_gather_gemm_can_implement( - x, self.weight, self.model_parallel_group - ) - ): - if self._rr_column_ln is not None and self.training: - output = self._rr_column_comm_ln( - x=x, weight=self.weight, group=self.model_parallel_group - ) - else: - output = AllGatherGemmOp.apply( - x, self.weight, self.model_parallel_group - ) - if self.bias is not None: - output += self.bias - return output - else: - if self.is_mp and (use_comm and self.use_comm): - input_parallel = AllGatherOp.apply(x) - else: - input_parallel = x - - if self._rr_column_ln is not None and self.training: - output = self._rr_column_ln( - self.linear, x=input_parallel, weight=self.weight, bias=self.bias - ) - else: - output = self.linear(input_parallel, self.weight, self.bias) - return output - - -class MPScale(PyLayer): - @staticmethod - def forward(ctx, x, mp_degree): - out = paddle.scale(x, 1.0 / mp_degree) - return out - - @staticmethod - def backward(ctx, dout): - return dout - - -class RowSequenceParallelLinear(Layer): - def __init__( - self, - in_features, - out_features, - weight_attr=None, - has_bias=True, - input_is_parallel=False, - fuse_matmul_bias=False, - use_rr=False, - mp_group=None, - name=None, - use_comm=True, - use_tpsp_comm_overlap=False, - ): - super(RowSequenceParallelLinear, self).__init__() - - self.in_features = in_features - self.out_features = out_features - assert ( - input_is_parallel is True - ), 
"If sequence_parallel is True, \ - input_is_parallel should be true." - - self.input_is_parallel = input_is_parallel - self._weight_attr = weight_attr - self._dtype = self._helper.get_default_dtype() - self._name = name - self.use_comm = use_comm - if not self.use_comm: - assert not use_rr, "The moe allgather not compatibale with rr for now." - logger.warning( - "RowSequenceParallelLinear will NOT call ANY comm, " - "this feature is only used for XPU moe allgather dispatcher. " - "If this is not your purpose, please unset XPU_MOE_USE_ALLGATHER." - ) - - self.use_tpsp_comm_overlap = use_tpsp_comm_overlap - if self.use_tpsp_comm_overlap: - assert gemm_reduce_scatter is not None - assert flux is not None - - if self.use_tpsp_comm_overlap and self.use_comm: - self._rr_rown_comm_ln = None - self._rr_rown_ln = None - - hcg = get_hcg() - self.model_parallel_group = ( - hcg.get_model_parallel_group() if mp_group is None else mp_group - ) - self.world_size = ( - hcg.get_model_parallel_group().nranks - if mp_group is None - else mp_group.nranks - ) - self.rank = ( - hcg.get_model_parallel_group().rank if mp_group is None else mp_group.rank - ) - - self.is_mp = self.world_size > 1 - assert in_features % self.world_size == 0, ( - "Number of row of the weight for linear ({}) must be" - " divisible by model parallel size ({})".format( - in_features, self.world_size - ) - ) - - self.input_size_per_partition = in_features // self.world_size - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) - else: - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) - - self.weight.is_distributed = True if self.is_mp else False - if self.weight.is_distributed: - 
self.weight.split_axis = 0 - - # if sequence parallel is true, - # register hook to all_reduce gradient of weight and bias - if has_bias: - self.bias = self.create_parameter( - shape=[self.out_features], - attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype, - is_bias=True, - ) - if self.is_mp: - mark_as_sequence_parallel_parameter(self.bias) - else: - self.bias = None - - self.linear = F.linear - self.mp_scale = None - - if fuse_matmul_bias: - if not is_fused_matmul_bias_supported(): - raise NotImplementedError( - "You set fuse_matmul_bias=True in RowParallelLinear, " - "however, the paddle you are using not support this operation. " - "Please set fuse_matmul_bias=False or use paddle compiled " - "with cuda 11.6 or higher." - ) - from paddle.incubate.nn.functional import fused_linear - - self.linear = fused_linear - - def forward(self, x): - input_parallel = x - if self.is_mp: - if self.mp_scale is not None: - bias = self.mp_scale(self.bias, self.world_size) - else: - bias = None - - if self._rr_rown_ln is not None and self.training: - if ( - self.use_tpsp_comm_overlap - and self.use_comm - and flux.gemm_reduce_scatter_can_implement( - x, self.weight, self.model_parallel_group - ) - ): - output_ = self._rr_rown_comm_ln( - x=input_parallel, - weight=self.weight, - group=self.model_parallel_group, - ) - if bias is not None: - output_ += bias - else: - output_ = self._rr_rown_ln( - self.linear, x=input_parallel, weight=self.weight, bias=bias - ) - else: - if ( - self.use_tpsp_comm_overlap - and self.use_comm - and flux.gemm_reduce_scatter_can_implement( - x, self.weight, self.model_parallel_group - ) - ): - output_ = GemmReduceScatterOp.apply( - x, self.weight, self.model_parallel_group - ) - if bias is not None: - output_ = output_ + bias - else: - output_parallel = self.linear(input_parallel, self.weight, bias) - if self.use_comm: - output_ = ReduceScatterOp.apply(output_parallel) - else: - output_ = output_parallel - - # if self.bias is not none, 
sequence parallel will use - # register_hook to all_reduce self.bias - if bias is None and self.bias is not None and self.use_comm: - output = output_ + self.bias - else: - output = output_ - else: - output = self.linear(input_parallel, self.weight, self.bias) - return output From f42cf3f3b3dc84a13bfeab483114f4e28219ba39 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Wed, 13 Aug 2025 16:19:41 +0800 Subject: [PATCH 03/15] adapt data loader --- ...list_ernie45turbo_tk_m100k_250321.txt.1000 | 1000 -------------- examples/pre-training/conf/ratio_eb45t_0321 | 1176 ----------------- examples/pre-training/ernie/pretrain_auto.py | 126 +- .../ernie/src/datasets/__init__.py | 1 - .../ernie/src/datasets/dist_data_loader.py | 84 +- .../ernie/src/datasets/pretrain_task.py | 788 ----------- .../src/trainers/pretraining_trainer_auto.py | 645 +-------- .../pre-training/ernie/src/utils/__init__.py | 1 - .../ernie/src/utils/data_utils.py | 218 --- .../ernie/src/utils/ipc_server.py | 265 ---- .../pre-training/yamls/pretrain_96_auto.yaml | 7 +- 11 files changed, 84 insertions(+), 4227 deletions(-) delete mode 100644 examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 delete mode 100644 examples/pre-training/conf/ratio_eb45t_0321 delete mode 100644 examples/pre-training/ernie/src/datasets/pretrain_task.py delete mode 100644 examples/pre-training/ernie/src/utils/data_utils.py delete mode 100644 examples/pre-training/ernie/src/utils/ipc_server.py diff --git a/examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 b/examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 deleted file mode 100644 index d3868f63..00000000 --- a/examples/pre-training/conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 +++ /dev/null @@ -1,1000 +0,0 @@ -2 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000000_0_0.h5 -2 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000000_0_0.h5 -3 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000001_0_0.h5 -3 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000001_0_0.h5 -4 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000002_0_0.h5 -4 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000002_0_0.h5 -5 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000003_0_0.h5 -5 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000003_0_0.h5 -6 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000004_0_0.h5 -6 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000004_0_0.h5 -7 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000005_0_0.h5 -7 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000005_0_0.h5 -8 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000006_0_0.h5 -8 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000006_0_0.h5 -9 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000007_0_0.h5 -9 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000007_0_0.h5 -10 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000008_0_0.h5 -10 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000008_0_0.h5 -11 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200046/out/attempt_20250123021212_586550_r001_000009_0_0.h5 -11 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200046/out/attempt_20250123021212_587109_r001_000009_0_0.h5 -12 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000000_0_0.h5 -12 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000000_0_0.h5 -13 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000001_0_0.h5 -13 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000001_0_0.h5 -14 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000002_0_0.h5 -14 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000002_0_0.h5 -15 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000003_0_0.h5 -15 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000003_0_0.h5 -16 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000004_0_0.h5 -16 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000004_0_0.h5 -17 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000005_0_0.h5 -17 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000005_0_0.h5 -18 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000006_0_0.h5 -18 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000006_0_0.h5 -19 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000007_0_0.h5 -19 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000007_0_0.h5 -20 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000008_0_0.h5 -20 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000008_0_0.h5 -21 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200047/out/attempt_20250123021212_586551_r001_000009_0_0.h5 -21 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200047/out/attempt_20250123021212_587110_r001_000009_0_0.h5 -22 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000000_0_0.h5 -22 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000000_0_0.h5 -23 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000001_0_0.h5 -23 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000001_0_0.h5 -24 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000002_0_0.h5 -24 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000002_0_0.h5 -25 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000003_0_0.h5 -25 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000003_0_0.h5 -26 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000004_0_0.h5 -26 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000004_0_0.h5 -27 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000005_0_0.h5 -27 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000005_0_0.h5 -28 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000006_0_0.h5 -28 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000006_0_0.h5 -29 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000007_0_0.h5 -29 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000007_0_0.h5 -30 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000008_0_0.h5 -30 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000008_0_0.h5 -31 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200048/out/attempt_20250123021212_586552_r001_000009_0_0.h5 -31 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200048/out/attempt_20250123021212_587111_r001_000009_0_0.h5 -32 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000000_0_0.h5 -32 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000000_0_0.h5 -33 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000001_0_0.h5 -33 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000001_0_0.h5 -34 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000002_0_0.h5 -34 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000002_0_0.h5 -35 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000003_0_0.h5 -35 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000003_0_0.h5 -36 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000004_0_0.h5 -36 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000004_0_0.h5 -37 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000005_0_0.h5 -37 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000005_0_0.h5 -38 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000006_0_0.h5 -38 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000006_0_0.h5 -39 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000007_0_0.h5 -39 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000007_0_0.h5 -40 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000008_0_0.h5 -40 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000008_0_0.h5 -41 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200049/out/attempt_20250123021212_586553_r001_000009_0_0.h5 -41 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200049/out/attempt_20250123021212_587112_r001_000009_0_0.h5 -42 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000000_0_0.h5 -42 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000000_0_0.h5 -43 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000001_0_0.h5 -43 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000001_0_0.h5 -44 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000002_0_0.h5 -44 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000002_0_0.h5 -45 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000003_0_0.h5 -45 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000003_0_0.h5 -46 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000004_0_0.h5 -46 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000004_0_0.h5 -47 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000005_0_0.h5 -47 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000005_0_0.h5 -48 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000006_0_0.h5 -48 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000006_0_0.h5 -49 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000007_0_0.h5 -49 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000007_0_0.h5 -50 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000008_0_0.h5 -50 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000008_0_0.h5 -51 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200050/out/attempt_20250123021212_586554_r001_000009_0_0.h5 -51 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200050/out/attempt_20250123021212_587113_r001_000009_0_0.h5 -52 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000000_0_0.h5 -52 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000000_0_0.h5 -53 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000001_0_0.h5 -53 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000001_0_0.h5 -54 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000002_0_0.h5 -54 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000002_0_0.h5 -55 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000003_0_0.h5 -55 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000003_0_0.h5 -56 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000004_0_0.h5 -56 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000004_0_0.h5 -57 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000005_0_0.h5 -57 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000005_0_0.h5 -58 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000006_0_0.h5 -58 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000006_0_0.h5 -59 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000007_0_0.h5 -59 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000007_0_0.h5 -60 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000008_0_0.h5 -60 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000008_0_0.h5 -61 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200051/out/attempt_20250123021212_586555_r001_000009_0_0.h5 -61 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200051/out/attempt_20250123021212_587114_r001_000009_0_0.h5 -62 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000000_0_0.h5 -62 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000000_0_0.h5 -63 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000001_0_0.h5 -63 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000001_0_0.h5 -64 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000002_0_0.h5 -64 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000002_0_0.h5 -65 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000003_0_0.h5 -65 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000003_0_0.h5 -66 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000004_0_0.h5 -66 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000004_0_0.h5 -67 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000005_0_0.h5 -67 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000005_0_0.h5 -68 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000006_0_0.h5 -68 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000006_0_0.h5 -69 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000007_0_0.h5 -69 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000007_0_0.h5 -70 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000008_0_0.h5 -70 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000008_0_0.h5 -71 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200052/out/attempt_20250123021212_586556_r001_000009_0_0.h5 -71 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200052/out/attempt_20250123021212_587115_r001_000009_0_0.h5 -72 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000000_0_0.h5 -72 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000000_0_0.h5 -73 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000001_0_0.h5 -73 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000001_0_0.h5 -74 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000002_0_0.h5 -74 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000002_0_0.h5 -75 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000003_0_0.h5 -75 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000003_0_0.h5 -76 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000004_0_0.h5 -76 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000004_0_0.h5 -77 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000005_0_0.h5 -77 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000005_0_0.h5 -78 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000006_0_0.h5 -78 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000006_0_0.h5 -79 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000007_0_0.h5 -79 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000007_0_0.h5 -80 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000008_0_0.h5 -80 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000008_0_0.h5 -81 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200053/out/attempt_20250123021212_586557_r001_000009_0_0.h5 -81 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200053/out/attempt_20250123021212_587116_r001_000009_0_0.h5 -82 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000000_0_0.h5 -82 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000000_0_0.h5 -83 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000001_0_0.h5 -83 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000001_0_0.h5 -84 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000002_0_0.h5 -84 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000002_0_0.h5 -85 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000003_0_0.h5 -85 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000003_0_0.h5 -86 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000004_0_0.h5 -86 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000004_0_0.h5 -87 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000005_0_0.h5 -87 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000005_0_0.h5 -88 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000006_0_0.h5 -88 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000006_0_0.h5 -89 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000007_0_0.h5 -89 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000007_0_0.h5 -90 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000008_0_0.h5 -90 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000008_0_0.h5 -91 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200054/out/attempt_20250123021212_586558_r001_000009_0_0.h5 -91 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200054/out/attempt_20250123021212_587117_r001_000009_0_0.h5 -92 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000000_0_0.h5 -92 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000000_0_0.h5 -93 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000001_0_0.h5 -93 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000001_0_0.h5 -94 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000002_0_0.h5 -94 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000002_0_0.h5 -95 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000003_0_0.h5 -95 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000003_0_0.h5 -96 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000004_0_0.h5 -96 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000004_0_0.h5 -97 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000005_0_0.h5 -97 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000005_0_0.h5 -98 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000006_0_0.h5 -98 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000006_0_0.h5 -99 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000007_0_0.h5 -99 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000007_0_0.h5 -100 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000008_0_0.h5 -100 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000008_0_0.h5 -101 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200055/out/attempt_20250123021212_586559_r001_000009_0_0.h5 -101 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200055/out/attempt_20250123021212_587118_r001_000009_0_0.h5 -102 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000000_0_0.h5 -102 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000000_0_0.h5 -103 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000001_0_0.h5 -103 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000001_0_0.h5 -104 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000002_0_0.h5 -104 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000002_0_0.h5 -105 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000003_0_0.h5 -105 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000003_0_0.h5 -106 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000004_0_0.h5 -106 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000004_0_0.h5 -107 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000005_0_0.h5 -107 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000005_0_0.h5 -108 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000006_0_0.h5 -108 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000006_0_0.h5 -109 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000007_0_0.h5 -109 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000007_0_0.h5 -110 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000008_0_0.h5 -110 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000008_0_0.h5 -111 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200056/out/attempt_20250123021212_586562_r001_000009_0_0.h5 -111 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200056/out/attempt_20250123021212_587120_r001_000009_0_0.h5 -112 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000000_0_0.h5 -112 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000000_0_0.h5 -113 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000001_0_0.h5 -113 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000001_0_0.h5 -114 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000002_0_0.h5 -114 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000002_0_0.h5 -115 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000003_0_0.h5 -115 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000003_0_0.h5 -116 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000004_0_0.h5 -116 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000004_0_0.h5 -117 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000005_0_0.h5 -117 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000005_0_0.h5 -118 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000006_0_0.h5 -118 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000006_0_0.h5 -119 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000007_0_0.h5 -119 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000007_0_0.h5 -120 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000008_0_0.h5 -120 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000008_0_0.h5 -121 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200057/out/attempt_20250123021212_586563_r001_000009_0_0.h5 -121 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200057/out/attempt_20250123021212_587121_r001_000009_0_0.h5 -122 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000000_0_0.h5 -122 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000000_0_0.h5 -123 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000001_0_0.h5 -123 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000001_0_0.h5 -124 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000002_0_0.h5 -124 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000002_0_0.h5 -125 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000003_0_0.h5 -125 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000003_0_0.h5 -126 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000004_0_0.h5 -126 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000004_0_0.h5 -127 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000005_0_0.h5 -127 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000005_0_0.h5 -128 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000006_0_0.h5 -128 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000006_0_0.h5 -129 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000007_0_0.h5 -129 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000007_0_0.h5 -130 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000008_0_0.h5 -130 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000008_0_0.h5 -131 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200058/out/attempt_20250123021212_586565_r001_000009_0_0.h5 -131 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200058/out/attempt_20250123021212_587122_r001_000009_0_0.h5 -132 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000000_0_0.h5 -132 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000000_0_0.h5 -133 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000001_0_0.h5 -133 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000001_0_0.h5 -134 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000002_0_0.h5 -134 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000002_0_0.h5 -135 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000003_0_0.h5 -135 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000003_0_0.h5 -136 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000004_0_0.h5 -136 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000004_0_0.h5 -137 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000005_0_0.h5 -137 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000005_0_0.h5 -138 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000006_0_0.h5 -138 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000006_0_0.h5 -139 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000007_0_0.h5 -139 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000007_0_0.h5 -140 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000008_0_0.h5 -140 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000008_0_0.h5 -141 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200059/out/attempt_20250123021212_586566_r001_000009_0_0.h5 -141 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200059/out/attempt_20250123021212_587123_r001_000009_0_0.h5 -142 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000000_0_0.h5 -142 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000000_0_0.h5 -143 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000001_0_0.h5 -143 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000001_0_0.h5 -144 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000002_0_0.h5 -144 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000002_0_0.h5 -145 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000003_0_0.h5 -145 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000003_0_0.h5 -146 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000004_0_0.h5 -146 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000004_0_0.h5 -147 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000005_0_0.h5 -147 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000005_0_0.h5 -148 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000006_0_0.h5 -148 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000006_0_0.h5 -149 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000007_0_0.h5 -149 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000007_0_0.h5 -150 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000008_0_0.h5 -150 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000008_0_0.h5 -151 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200060/out/attempt_20250123021212_586567_r001_000009_0_0.h5 -151 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200060/out/attempt_20250123021212_587124_r001_000009_0_0.h5 -152 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000000_0_0.h5 -152 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000000_0_0.h5 -153 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000001_0_0.h5 -153 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000001_0_0.h5 -154 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000002_0_0.h5 -154 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000002_0_0.h5 -155 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000003_0_0.h5 -155 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000003_0_0.h5 -156 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000004_0_0.h5 -156 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000004_0_0.h5 -157 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000005_0_0.h5 -157 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000005_0_0.h5 -158 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000006_0_0.h5 -158 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000006_0_0.h5 -159 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000007_0_0.h5 -159 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000007_0_0.h5 -160 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000008_0_0.h5 -160 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000008_0_0.h5 -161 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200061/out/attempt_20250123021212_586568_r001_000009_0_0.h5 -161 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200061/out/attempt_20250123021212_587125_r001_000009_0_0.h5 -162 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000000_0_0.h5 -162 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000000_0_0.h5 -163 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000001_0_0.h5 -163 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000001_0_0.h5 -164 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000002_0_0.h5 -164 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000002_0_0.h5 -165 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000003_0_0.h5 -165 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000003_0_0.h5 -166 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000004_0_0.h5 -166 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000004_0_0.h5 -167 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000005_0_0.h5 -167 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000005_0_0.h5 -168 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000006_0_0.h5 -168 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000006_0_0.h5 -169 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000007_0_0.h5 -169 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000007_0_0.h5 -170 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000008_0_0.h5 -170 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000008_0_0.h5 -171 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200062/out/attempt_20250123021212_586570_r001_000009_0_0.h5 -171 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200062/out/attempt_20250123021212_587126_r001_000009_0_0.h5 -172 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000000_0_0.h5 -172 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000000_0_0.h5 -173 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000001_0_0.h5 -173 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000001_0_0.h5 -174 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000002_0_0.h5 -174 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000002_0_0.h5 -175 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000003_0_0.h5 -175 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000003_0_0.h5 -176 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000004_0_0.h5 -176 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000004_0_0.h5 -177 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000005_0_0.h5 -177 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000005_0_0.h5 -178 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000006_0_0.h5 -178 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000006_0_0.h5 -179 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000007_0_0.h5 -179 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000007_0_0.h5 -180 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000008_0_0.h5 -180 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000008_0_0.h5 -181 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200063/out/attempt_20250123021212_586571_r001_000009_0_0.h5 -181 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200063/out/attempt_20250123021212_587127_r001_000009_0_0.h5 -182 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000000_0_0.h5 -182 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000000_0_0.h5 -183 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000001_0_0.h5 -183 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000001_0_0.h5 -184 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000002_0_0.h5 -184 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000002_0_0.h5 -185 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000003_0_0.h5 -185 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000003_0_0.h5 -186 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000004_0_0.h5 -186 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000004_0_0.h5 -187 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000005_0_0.h5 -187 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000005_0_0.h5 -188 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000006_0_0.h5 -188 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000006_0_0.h5 -189 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000007_0_0.h5 -189 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000007_0_0.h5 -190 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000008_0_0.h5 -190 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000008_0_0.h5 -191 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200064/out/attempt_20250123021212_586572_r001_000009_0_0.h5 -191 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200064/out/attempt_20250123021212_587132_r001_000009_0_0.h5 -192 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000000_0_0.h5 -192 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000000_0_0.h5 -193 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000001_0_0.h5 -193 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000001_0_0.h5 -194 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000002_0_0.h5 -194 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000002_0_0.h5 -195 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000003_0_0.h5 -195 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000003_0_0.h5 -196 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000004_0_0.h5 -196 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000004_0_0.h5 -197 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000005_0_0.h5 -197 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000005_0_0.h5 -198 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000006_0_0.h5 -198 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000006_0_0.h5 -199 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000007_0_0.h5 -199 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000007_0_0.h5 -200 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000008_0_0.h5 -200 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000008_0_0.h5 -201 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200065/out/attempt_20250123021212_586575_r001_000009_0_0.h5 -201 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200065/out/attempt_20250123021212_587134_r001_000009_0_0.h5 -202 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000000_0_0.h5 -202 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000000_0_0.h5 -203 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000001_0_0.h5 -203 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000001_0_0.h5 -204 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000002_0_0.h5 -204 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000002_0_0.h5 -205 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000003_0_0.h5 -205 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000003_0_0.h5 -206 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000004_0_0.h5 -206 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000004_0_0.h5 -207 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000005_0_0.h5 -207 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000005_0_0.h5 -208 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000006_0_0.h5 -208 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000006_0_0.h5 -209 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000007_0_0.h5 -209 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000007_0_0.h5 -210 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000008_0_0.h5 -210 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000008_0_0.h5 -211 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200066/out/attempt_20250123021212_586590_r001_000009_0_0.h5 -211 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200066/out/attempt_20250123021212_587141_r001_000009_0_0.h5 -212 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000000_0_0.h5 -212 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000000_0_0.h5 -213 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000001_0_0.h5 -213 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000001_0_0.h5 -214 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000002_0_0.h5 -214 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000002_0_0.h5 -215 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000003_0_0.h5 -215 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000003_0_0.h5 -216 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000004_0_0.h5 -216 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000004_0_0.h5 -217 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000005_0_0.h5 -217 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000005_0_0.h5 -218 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000006_0_0.h5 -218 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000006_0_0.h5 -219 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000007_0_0.h5 -219 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000007_0_0.h5 -220 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000008_0_0.h5 -220 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000008_0_0.h5 -221 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200067/out/attempt_20250123021212_586577_r001_000009_0_0.h5 -221 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200067/out/attempt_20250123021212_587135_r001_000009_0_0.h5 -222 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000000_0_0.h5 -222 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000000_0_0.h5 -223 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000001_0_0.h5 -223 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000001_0_0.h5 -224 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000002_0_0.h5 -224 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000002_0_0.h5 -225 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000003_0_0.h5 -225 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000003_0_0.h5 -226 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000004_0_0.h5 -226 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000004_0_0.h5 -227 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000005_0_0.h5 -227 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000005_0_0.h5 -228 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000006_0_0.h5 -228 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000006_0_0.h5 -229 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000007_0_0.h5 -229 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000007_0_0.h5 -230 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000008_0_0.h5 -230 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000008_0_0.h5 -231 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200068/out/attempt_20250123021212_586581_r001_000009_0_0.h5 -231 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200068/out/attempt_20250123021212_587137_r001_000009_0_0.h5 -232 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000000_0_0.h5 -232 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000000_0_0.h5 -233 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000001_0_0.h5 -233 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000001_0_0.h5 -234 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000002_0_0.h5 -234 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000002_0_0.h5 -235 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000003_0_0.h5 -235 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000003_0_0.h5 -236 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000004_0_0.h5 -236 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000004_0_0.h5 -237 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000005_0_0.h5 -237 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000005_0_0.h5 -238 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000006_0_0.h5 -238 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000006_0_0.h5 -239 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000007_0_0.h5 -239 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000007_0_0.h5 -240 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000008_0_0.h5 -240 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000008_0_0.h5 -241 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200069/out/attempt_20250123021212_586582_r001_000009_0_0.h5 -241 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200069/out/attempt_20250123021212_587139_r001_000009_0_0.h5 -242 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000000_0_0.h5 -242 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000000_0_0.h5 -243 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000001_0_0.h5 -243 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000001_0_0.h5 -244 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000002_0_0.h5 -244 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000002_0_0.h5 -245 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000003_0_0.h5 -245 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000003_0_0.h5 -246 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000004_0_0.h5 -246 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000004_0_0.h5 -247 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000005_0_0.h5 -247 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000005_0_0.h5 -248 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000006_0_0.h5 -248 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000006_0_0.h5 -249 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000007_0_0.h5 -249 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000007_0_0.h5 -250 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000008_0_0.h5 -250 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000008_0_0.h5 -251 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200070/out/attempt_20250123021212_586589_r001_000009_0_0.h5 -251 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200070/out/attempt_20250123021212_587140_r001_000009_0_0.h5 -252 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000000_0_0.h5 -252 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000000_0_0.h5 -253 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000001_0_0.h5 -253 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000001_0_0.h5 -254 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000002_0_0.h5 -254 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000002_0_0.h5 -255 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000003_0_0.h5 -255 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000003_0_0.h5 -256 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000004_0_0.h5 -256 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000004_0_0.h5 -257 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000005_0_0.h5 -257 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000005_0_0.h5 -258 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000006_0_0.h5 -258 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000006_0_0.h5 -259 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000007_0_0.h5 -259 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000007_0_0.h5 -260 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000008_0_0.h5 -260 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000008_0_0.h5 -261 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/200624/out/attempt_20250123021212_586633_r001_000009_0_0.h5 -261 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/200624/out/attempt_20250123021212_587171_r001_000009_0_0.h5 -262 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000000_0_0.h5 -262 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000000_0_0.h5 -263 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000001_0_0.h5 -263 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000001_0_0.h5 -264 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000002_0_0.h5 -264 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000002_0_0.h5 -265 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000003_0_0.h5 -265 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000003_0_0.h5 -266 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000004_0_0.h5 -266 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000004_0_0.h5 -267 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000005_0_0.h5 -267 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000005_0_0.h5 -268 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000006_0_0.h5 -268 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000006_0_0.h5 -269 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000007_0_0.h5 -269 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000007_0_0.h5 -270 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000008_0_0.h5 -270 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000008_0_0.h5 -271 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000009_0_0.h5 -271 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000009_0_0.h5 -272 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000010_0_0.h5 -272 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000010_0_0.h5 -273 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000011_0_0.h5 -273 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000011_0_0.h5 -274 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000012_0_0.h5 -274 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000012_0_0.h5 -275 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000013_0_0.h5 -275 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000013_0_0.h5 -276 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000014_0_0.h5 -276 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000014_0_0.h5 -277 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000015_0_0.h5 -277 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000015_0_0.h5 -278 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000016_0_0.h5 -278 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000016_0_0.h5 -279 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000017_0_0.h5 -279 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000017_0_0.h5 -280 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000018_0_0.h5 -280 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000018_0_0.h5 -281 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000019_0_0.h5 -281 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000019_0_0.h5 -282 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000020_0_0.h5 -282 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000020_0_0.h5 -283 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000021_0_0.h5 -283 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000021_0_0.h5 -284 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000022_0_0.h5 -284 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000022_0_0.h5 -285 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000023_0_0.h5 -285 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000023_0_0.h5 -286 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000024_0_0.h5 -286 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000024_0_0.h5 -287 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000025_0_0.h5 -287 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000025_0_0.h5 -288 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000026_0_0.h5 -288 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000026_0_0.h5 -289 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000027_0_0.h5 -289 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000027_0_0.h5 -290 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000028_0_0.h5 -290 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000028_0_0.h5 -291 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000029_0_0.h5 -291 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000029_0_0.h5 -292 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000030_0_0.h5 -292 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000030_0_0.h5 -293 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000031_0_0.h5 -293 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000031_0_0.h5 -294 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000032_0_0.h5 -294 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000032_0_0.h5 -295 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000033_0_0.h5 -295 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000033_0_0.h5 -296 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000034_0_0.h5 -296 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000034_0_0.h5 -297 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000035_0_0.h5 -297 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000035_0_0.h5 -298 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000036_0_0.h5 -298 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000036_0_0.h5 -299 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000037_0_0.h5 -299 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000037_0_0.h5 -300 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000038_0_0.h5 -300 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000038_0_0.h5 -301 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000039_0_0.h5 -301 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000039_0_0.h5 -302 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000040_0_0.h5 -302 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000040_0_0.h5 -303 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000041_0_0.h5 -303 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000041_0_0.h5 -304 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000042_0_0.h5 -304 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000042_0_0.h5 -305 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000043_0_0.h5 -305 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000043_0_0.h5 -306 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000044_0_0.h5 -306 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000044_0_0.h5 -307 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000045_0_0.h5 -307 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000045_0_0.h5 -308 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000046_0_0.h5 -308 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000046_0_0.h5 -309 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000047_0_0.h5 -309 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000047_0_0.h5 -310 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000048_0_0.h5 -310 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000048_0_0.h5 -311 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000049_0_0.h5 -311 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000049_0_0.h5 -312 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000050_0_0.h5 -312 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000050_0_0.h5 -313 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000051_0_0.h5 -313 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000051_0_0.h5 -314 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000052_0_0.h5 -314 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000052_0_0.h5 -315 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000053_0_0.h5 -315 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000053_0_0.h5 -316 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000054_0_0.h5 -316 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000054_0_0.h5 -317 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000055_0_0.h5 -317 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000055_0_0.h5 -318 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000056_0_0.h5 -318 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000056_0_0.h5 -319 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000057_0_0.h5 -319 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000057_0_0.h5 -320 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000058_0_0.h5 -320 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000058_0_0.h5 -321 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000059_0_0.h5 -321 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000059_0_0.h5 -322 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300330/out/attempt_20250123021212_586542_r001_000060_0_0.h5 -322 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300330/out/attempt_20250123021212_587102_r001_000060_0_0.h5 -323 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000000_0_0.h5 -323 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000000_0_0.h5 -324 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000001_0_0.h5 -324 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000001_0_0.h5 -325 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000002_0_0.h5 -325 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000002_0_0.h5 -326 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000003_0_0.h5 -326 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000003_0_0.h5 -327 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000004_0_0.h5 -327 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000004_0_0.h5 -328 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000005_0_0.h5 -328 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000005_0_0.h5 -329 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000006_0_0.h5 -329 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000006_0_0.h5 -330 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000007_0_0.h5 -330 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000007_0_0.h5 -331 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000008_0_0.h5 -331 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000008_0_0.h5 -332 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300333/out/attempt_20250123021212_586543_r001_000009_0_0.h5 -332 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch4_add1/300333/out/attempt_20250123021212_587104_r001_000009_0_0.h5 -333 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000000_0_0.h5 -334 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000001_0_0.h5 -335 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000002_0_0.h5 -336 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000003_0_0.h5 -337 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000004_0_0.h5 -338 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000005_0_0.h5 -339 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000006_0_0.h5 -340 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000007_0_0.h5 -341 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000008_0_0.h5 -342 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000009_0_0.h5 -343 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000010_0_0.h5 -344 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000011_0_0.h5 -345 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000012_0_0.h5 -346 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000013_0_0.h5 -347 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000014_0_0.h5 -348 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000015_0_0.h5 -349 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000016_0_0.h5 -350 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000017_0_0.h5 -351 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000018_0_0.h5 -352 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000019_0_0.h5 -353 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000020_0_0.h5 -354 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000021_0_0.h5 -355 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000022_0_0.h5 -356 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000023_0_0.h5 -357 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000024_0_0.h5 -358 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000025_0_0.h5 -359 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000026_0_0.h5 -360 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000027_0_0.h5 -361 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000028_0_0.h5 -362 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000029_0_0.h5 -363 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000030_0_0.h5 -364 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000031_0_0.h5 -365 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000032_0_0.h5 -366 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000033_0_0.h5 -367 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000034_0_0.h5 -368 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000035_0_0.h5 -369 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000036_0_0.h5 -370 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000037_0_0.h5 -371 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000038_0_0.h5 -372 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000039_0_0.h5 -373 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000040_0_0.h5 -374 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000041_0_0.h5 -375 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000042_0_0.h5 -376 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000043_0_0.h5 -377 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000044_0_0.h5 -378 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000045_0_0.h5 -379 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000046_0_0.h5 -380 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000047_0_0.h5 -381 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000048_0_0.h5 -382 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000049_0_0.h5 -383 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000050_0_0.h5 -384 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000051_0_0.h5 -385 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000052_0_0.h5 -386 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000053_0_0.h5 -387 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000054_0_0.h5 -388 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000055_0_0.h5 -389 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000056_0_0.h5 -390 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000057_0_0.h5 -391 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000058_0_0.h5 -392 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000059_0_0.h5 -393 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000060_0_0.h5 -394 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000061_0_0.h5 -395 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000062_0_0.h5 -396 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000063_0_0.h5 -397 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000064_0_0.h5 -398 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000065_0_0.h5 -399 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000066_0_0.h5 -400 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000067_0_0.h5 -401 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000068_0_0.h5 -402 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000069_0_0.h5 -403 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000070_0_0.h5 -404 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000071_0_0.h5 -405 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000072_0_0.h5 -406 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000073_0_0.h5 -407 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000074_0_0.h5 -408 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000075_0_0.h5 -409 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000076_0_0.h5 -410 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000077_0_0.h5 -411 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000078_0_0.h5 -412 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000079_0_0.h5 -413 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000080_0_0.h5 -414 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000081_0_0.h5 -415 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000082_0_0.h5 -416 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000083_0_0.h5 -417 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000084_0_0.h5 -418 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000085_0_0.h5 -419 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000086_0_0.h5 -420 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000087_0_0.h5 -421 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000088_0_0.h5 -422 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000089_0_0.h5 -423 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000090_0_0.h5 -424 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000091_0_0.h5 -425 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000092_0_0.h5 -426 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000093_0_0.h5 -427 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000094_0_0.h5 -428 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000095_0_0.h5 -429 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000096_0_0.h5 -430 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000097_0_0.h5 -431 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000098_0_0.h5 -432 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000099_0_0.h5 -433 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000100_0_0.h5 -434 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000101_0_0.h5 -435 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000102_0_0.h5 -436 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000103_0_0.h5 -437 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000104_0_0.h5 -438 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000105_0_0.h5 -439 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000106_0_0.h5 -440 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000107_0_0.h5 -441 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000108_0_0.h5 -442 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000109_0_0.h5 -443 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000110_0_0.h5 -444 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000111_0_0.h5 -445 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000112_0_0.h5 -446 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000113_0_0.h5 -447 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000114_0_0.h5 -448 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000115_0_0.h5 -449 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000116_0_0.h5 -450 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000117_0_0.h5 -451 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000118_0_0.h5 -452 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000119_0_0.h5 -453 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000120_0_0.h5 -454 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000121_0_0.h5 -455 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000122_0_0.h5 -456 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000123_0_0.h5 -457 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000124_0_0.h5 -458 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000125_0_0.h5 -459 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000126_0_0.h5 -460 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000127_0_0.h5 -461 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000128_0_0.h5 -462 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000129_0_0.h5 -463 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000130_0_0.h5 -464 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000131_0_0.h5 -465 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000132_0_0.h5 -466 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000133_0_0.h5 -467 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000134_0_0.h5 -468 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000135_0_0.h5 -469 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000136_0_0.h5 -470 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000137_0_0.h5 -471 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000138_0_0.h5 -472 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000139_0_0.h5 -473 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000140_0_0.h5 -474 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000141_0_0.h5 -475 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000142_0_0.h5 -476 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000143_0_0.h5 -477 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000144_0_0.h5 -478 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000145_0_0.h5 -479 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000146_0_0.h5 -480 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000147_0_0.h5 -481 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000148_0_0.h5 -482 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000149_0_0.h5 -483 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000150_0_0.h5 -484 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000151_0_0.h5 -485 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000152_0_0.h5 -486 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000153_0_0.h5 -487 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000154_0_0.h5 -488 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000155_0_0.h5 -489 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000156_0_0.h5 -490 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000157_0_0.h5 -491 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000158_0_0.h5 -492 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000159_0_0.h5 -493 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000160_0_0.h5 -494 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000161_0_0.h5 -495 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000162_0_0.h5 -496 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000163_0_0.h5 -497 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000164_0_0.h5 -498 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000165_0_0.h5 -499 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000166_0_0.h5 -500 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000167_0_0.h5 -501 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000168_0_0.h5 -502 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000169_0_0.h5 -503 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000170_0_0.h5 -504 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000171_0_0.h5 -505 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000172_0_0.h5 -506 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000173_0_0.h5 -507 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000174_0_0.h5 -508 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000175_0_0.h5 -509 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000176_0_0.h5 -510 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000177_0_0.h5 -511 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000178_0_0.h5 -512 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000179_0_0.h5 -513 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000180_0_0.h5 -514 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000181_0_0.h5 -515 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000182_0_0.h5 -516 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000183_0_0.h5 -517 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000184_0_0.h5 -518 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000185_0_0.h5 -519 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000186_0_0.h5 -520 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000187_0_0.h5 -521 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000188_0_0.h5 -522 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000189_0_0.h5 -523 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000190_0_0.h5 -524 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000191_0_0.h5 -525 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000192_0_0.h5 -526 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000193_0_0.h5 -527 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000194_0_0.h5 -528 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000195_0_0.h5 -529 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000196_0_0.h5 -530 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000197_0_0.h5 -531 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000198_0_0.h5 -532 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000199_0_0.h5 -533 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000200_0_0.h5 -534 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000201_0_0.h5 -535 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000202_0_0.h5 -536 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000203_0_0.h5 -537 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000204_0_0.h5 -538 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000205_0_0.h5 -539 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000206_0_0.h5 -540 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000207_0_0.h5 -541 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000208_0_0.h5 -542 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000209_0_0.h5 -543 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000210_0_0.h5 -544 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000211_0_0.h5 -545 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000212_0_0.h5 -546 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000213_0_0.h5 -547 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000214_0_0.h5 -548 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000215_0_0.h5 -549 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000216_0_0.h5 -550 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000217_0_0.h5 -551 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000218_0_0.h5 -552 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000219_0_0.h5 -553 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000220_0_0.h5 -554 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000221_0_0.h5 -555 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000222_0_0.h5 -556 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000223_0_0.h5 -557 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000224_0_0.h5 -558 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000225_0_0.h5 -559 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000226_0_0.h5 -560 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000227_0_0.h5 -561 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000228_0_0.h5 -562 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000229_0_0.h5 -563 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000230_0_0.h5 -564 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000231_0_0.h5 -565 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000232_0_0.h5 -566 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000233_0_0.h5 -567 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000234_0_0.h5 -568 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000235_0_0.h5 -569 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000236_0_0.h5 -570 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000237_0_0.h5 -571 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000238_0_0.h5 -572 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000239_0_0.h5 -573 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000240_0_0.h5 -574 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000241_0_0.h5 -575 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000242_0_0.h5 -576 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000243_0_0.h5 -577 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000244_0_0.h5 -578 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000245_0_0.h5 -579 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000246_0_0.h5 -580 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000247_0_0.h5 -581 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000248_0_0.h5 -582 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000249_0_0.h5 -583 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000250_0_0.h5 -584 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000251_0_0.h5 -585 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000252_0_0.h5 -586 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000253_0_0.h5 -587 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000254_0_0.h5 -588 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000255_0_0.h5 -589 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000256_0_0.h5 -590 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000257_0_0.h5 -591 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000258_0_0.h5 -592 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000259_0_0.h5 -593 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000260_0_0.h5 -594 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000261_0_0.h5 -595 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000262_0_0.h5 -596 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000263_0_0.h5 -597 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000264_0_0.h5 -598 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000265_0_0.h5 -599 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000266_0_0.h5 -600 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000267_0_0.h5 -601 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000268_0_0.h5 -602 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000269_0_0.h5 -603 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000270_0_0.h5 -604 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000271_0_0.h5 -605 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000272_0_0.h5 -606 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000273_0_0.h5 -607 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000274_0_0.h5 -608 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000275_0_0.h5 -609 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000276_0_0.h5 -610 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000277_0_0.h5 -611 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000278_0_0.h5 -612 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000279_0_0.h5 -613 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000280_0_0.h5 -614 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000281_0_0.h5 -615 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000282_0_0.h5 -616 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000283_0_0.h5 -617 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000284_0_0.h5 -618 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000285_0_0.h5 -619 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000286_0_0.h5 -620 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000287_0_0.h5 -621 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000288_0_0.h5 -622 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000289_0_0.h5 -623 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000290_0_0.h5 -624 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000291_0_0.h5 -625 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000292_0_0.h5 -626 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000293_0_0.h5 -627 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000294_0_0.h5 -628 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000295_0_0.h5 -629 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000296_0_0.h5 -630 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000297_0_0.h5 -631 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000298_0_0.h5 -632 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000299_0_0.h5 -633 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000300_0_0.h5 -634 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000301_0_0.h5 -635 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000302_0_0.h5 -636 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000303_0_0.h5 -637 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000304_0_0.h5 -638 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000305_0_0.h5 -639 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000306_0_0.h5 -640 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000307_0_0.h5 -641 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000308_0_0.h5 -642 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000309_0_0.h5 -643 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000310_0_0.h5 -644 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000311_0_0.h5 -645 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000312_0_0.h5 -646 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000313_0_0.h5 -647 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000314_0_0.h5 -648 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000315_0_0.h5 -649 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000316_0_0.h5 -650 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000317_0_0.h5 -651 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000318_0_0.h5 -652 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000319_0_0.h5 -653 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000320_0_0.h5 -654 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000321_0_0.h5 -655 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000322_0_0.h5 -656 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000323_0_0.h5 -657 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000324_0_0.h5 -658 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000325_0_0.h5 -659 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000326_0_0.h5 -660 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000327_0_0.h5 -661 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000328_0_0.h5 -662 
/root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000329_0_0.h5 -663 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000330_0_0.h5 -664 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000331_0_0.h5 -665 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000332_0_0.h5 -666 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000333_0_0.h5 -667 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000334_0_0.h5 -668 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000335_0_0.h5 -669 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000336_0_0.h5 -670 /root/afs_ro/xiangshan.afs.baidu.com/user/ernie_pretrain/eb45_turbo/text_h5/zoupengyu/ernie4_tk_m100k-250208_epoch1_add1/300336/out/attempt_20250123021212_582069_r001_000337_0_0.h5 diff --git a/examples/pre-training/conf/ratio_eb45t_0321 b/examples/pre-training/conf/ratio_eb45t_0321 deleted file mode 100644 index e1f9c4bb..00000000 --- a/examples/pre-training/conf/ratio_eb45t_0321 +++ /dev/null @@ -1,1176 +0,0 @@ -/300332/ 0.004202218 10 英文-网页-Chegg 数据集(一期、二期) -- 纯文部分 -/300018/ 0.000426423 10 
英文-问答-Chain-of-thought数据集 -/300031/ 0.000672956 10 英文(推理)-其他-专项预训练数据(推理cot数据) -/300079/ 0.004154126 10 英文(推理)-试题-AMPS数学题 -/300268/ 0.011664832 10 英文-问答-mathpile问答数据 -/301010/ 0.000621846 10 英文(推理)-试题-AMPS数学题-COT QA数据-英文 -/400048/ 0.000495729 10 中文-书籍-中小学k12教材_纯文本_P0 -/400073/ 0.000051698 10 中文-书籍-15本逻辑学cot数据 -/400079/ 0.000741630 10 中文(推理)-问答-专项预训练数据(推理cot数据) -/400384/ 0.017233671 10 中文-书籍-十二五高等教育教科书(纯文) -/400500/ 0.001329830 10 中文-书籍-35本大学数学教材与习题-理科专项(文本) -/400501/ 0.001060006 10 中文-书籍-92本各学科大学教材-理科专项提供书单(文本) -/400636/ 0.001132464 10 中文-合成-推理基础能力-合成COT数据(问答) -/400637/ 0.010130296 10 中文-合成-推理相关试题-合成COT数据 -/400640/ 0.038171000 10 中文-合成-合成数据_V34(百度教育) -/400641/ 0.030401655 10 中文-合成-K12百度教育题库&职业考试(法律、医疗)new -/400643/ 0.001701378 10 中文-合成-裁判猜谜合成数据 -/401003/ 0.000070062 10 中文-试题-公务员(网页) -/401004/ 0.000177366 10 中文(推理)-试题-公务员(sft) -/401008/ 0.024912935 10 中文(推理)-试题-数学专项思维链 -/401018/ 0.004168075 10 中文-试题-全品题舟初中试题 -/401019/ 0.002761711 10 中文-试题-组卷网高中试题 -/401020/ 0.002670994 10 中文(推理)-试题-全品题舟小学试题+组卷网初中试题 -/401025/ 0.030762161 10 中文-试题-百度教育253W处理数据 -/401027/ 0.005162020 10 中文-试题-全品题舟高中试题batch1 -/401036/ 0.001720236 10 中文-试题-考试酷-课后资料抓取 -/401041/ 0.010854240 10 中文-试题-学科竞赛资料试题 -/401042/ 0.005200987 10 中文-试题-火星搜题APP试题数据-纯文试题(全量) -/401047/ 0.000193406 10 中文-试题-33iq智力题第一批1万条-文本对 -/401049/ 0.022248625 10 中文-试题-华图公务员题库-推理专项-文本对 -/401137/ 0 10 中文(推理)-垂类-数学专项QA数据-中文 -/401138/ 0.004837967 10 中文(推理)-试题-小学数学-猿辅导train-COT QA数据-中文 -/300317/ 0.008617457 10 英文-书籍-【纯文】open textbook-各类书籍教材 -/300114/ 0.001816212 10 英文-网页-逻辑推理COT数据(英文-70w) -/400131/ 0.009835131 10 中文-网页-逻辑推理COT数据(中文-123w) -/301016/ 1.0998e-7 10 英文-试题-【纯文】mathsisfun网站-试题数据 -/300321/ 0.007123413 10 英文-网页-【数据集筛选】benchmark反查站点-英文数学数据 -/300037/ 1.084535795 50 英文(推理)-学术-open-web-math -/401037/ 0.042431255 10 中文-试题-数学线上 query 定向合成试题 -/401048/ 0.055261370 10 中文-试题-百度教育精品题库推理相关试题-推理专项-文本对 -/500009/ 0.000805158 10 任务数据-任务-MammoTH数据 -/300136/ 0.005368267 10 英文-百科-GoodWiki数据集 -/301027/ 0.000001196 10 
英文-试题-electrical-engineering电气工程问答数据-能源开源Benchmark -/300328/ 0.000014418 10 英文-网页-【生物数据】koreabiomed -/301025/ 0.000045433 10 英文-试题-fingpt-fiqa-qa问答数据-金融开源Benchmark -/301032/ 0.002056826 10 英文-网页-开源数学数据-SciInstruct-文本对 -/401053/ 0 10 中文-问答-信息处理能力退火数据-信息处理专项(问答)-文本对 -/400389/ 0.000668421 10 中文-书籍-10类小说剧本-PDD-文本 -/400388/ 0.000878760 10 中文-垂类-能源行业1.8万标准文档-ACG-文本 -/300326/ 0.000745006 10 英文-书籍-【ACG】电力行业第一批1330本专业书籍-文本 -/300327/ 0.043608102 10 英文-网页-CK12数学-纯文(网页) -/301026/ 0.000326669 10 英文-试题-Sujet-Finance-Instruct-177k英文评估-金融开源Benchmark -/301034/ 0.038548893 10 英文-网页-开源数学数据-MMIQC-文本对 -/301035/ 0.041540716 10 英文-网页-代码知识点合成QA数据(codeInstruct)-第二批 -/400392/ 0.000055332 10 中文-垂类-【生物数据】东方财富-文档转纯文 -/401052/ 0.015406246 10 中文-试题-百度教育第二批K12试题-人工定产高质-文本对 -/401054/ 0.000475462 10 中文-网页-牛客网 -/301030/ 0.086946662 10 英文-网页-代码知识点合成QA数据(python教科书)-第三批 -/400723/ 0.000651345 10 中文-书籍-【网络小说】ppd采购-2024Q3-第二批(文本) -/301033/ 0.000003762 10 英文-试题-puzzleprime谜题类试题问答数据(纯文) -/401050/ 9.3184e-7 10 中文-试题-FinEval试题数据-金融开源Benchmark -/100161/ 0.006921342 10 代码-代码-洛谷题库数据 -/100159/ 0.000024031 10 代码-代码-hackerearth代码数据 -/100160/ 7.8628125e-7 10 代码-代码-DevProjects By codementor -/100162/ 0.000511434 10 代码-代码-Lintcode题库数据 -/301017/ 0.000085270 10 英文-问答-investopedia-embedding-dataset问答数据-金融开源Benchmark -/301024/ 0.006414237 10 英文-问答-bartleby教育-书籍问答对-第一批 -/301022/ 0.000063948 10 英文-试题-finance-tasks AdaptLLM金融开源Benchmark -/301018/ 0.000004426 10 英文-试题-financial-qa-10K-金融开源Benchmark -/301020/ 0.000018547 10 英文-问答-BrainBashers谜题问答-纯文 -/301019/ 5.9093333e-8 10 英文-试题-NuclearQA核能数据-能源开源Benchmark -/301012/ 0.000056494 10 英文-试题-【纯文】khanacademy教学数据-试题问答 -/300023/ 0.031022331 23 英文(推理)-学术-DM Mathematics 数学问题数据(包括代数、算数、微积分、数论与概率论等数据) -/500007/ 0.000317181 10 任务数据-任务-flan_v2_math -/301006/ 0.556026240 52 英文-试题-AlgebraicStack数学数据 -/300085/ 0.034850577 13 英文(推理)-试题-WolframAlpha生成语料0629 -/500004/ 0.000085429 10 任务数据-任务-modelscope_v1.1 -/300019/ 0.156326843 110 英文(推理)-学术-PubMed Central 生物医学论文数据 -/301007/ 0.004619885 10 
英文-试题-Quizlet-flashcard -/300316/ 0.008069836 10 英文-网页-bartleby教育数据-文档文献 -/300324/ 0.002509667 10 英文-网页-弱势学科相关网页抓取数据-第一批 -/300299/ 0.003595824 10 英文-网页-IXL-AANki web数据-纯文 -/500011/ 0.010269707 10 任务数据-任务-huggingface_v2.3&v3任务数据 -/301005/ 0.001998200 10 英文-试题-Quizlet试题 -/401145/ 0.000015125 10 中文-试题-FinanceIQ-度小满金融开源Benchmark -/401146/ 1.75973333e-7 10 中文-试题-mmlu-electrical电气领域数据-能源开源Benchmark -/301031/ 0.289928856 27 英文-网页-开源数学数据-OpenMathInstruct-2-文本对 -/400010/ 0.141365042 10 中文(推理)-其他-专项预训练数据(数学) -/401022/ 0.013196836 10 中文-试题-DM Mathematics 数学问题数据(中文版) -/401028/ 0.1614 20 中文-试题-刷刷题定向试题增强-中文 -/500003/ 0.040748335 15 任务数据-任务-flan_v2_no_math -/401147/ 0.003231284 10 中文-网页-【说服力】强说服力数据挖掘-文本对 -/400383/ 0.001394032 10 中文-书籍-历史数据-3600个知识点检索得到的书籍清单(mask策略捞回数据) -/300156/ 0.014032425 10 英文-学术-pubscholar-9w -/700010/ 0.036641681 13 英文-试题-os_sft_15m_wo_g4 -/800002/ 0.034635151 12 英文-试题-opc -/700008/ 0 11 英文-试题-long_text_sft_data -/100146/ 0.154701220 24 代码-代码-opc-annealing-corpus -/300073/ 0.046649456 10 英文-书籍-redpjama_Books -/300320/ 0.557234266 52 英文-网页-【HuggingFace开源数据集】FineMath-4plus(数学推理) -/300101/ 0.002820591 10 英文-学术-地理数据-GAKG数据集 -/301009/ 0 10 英文(推理)-垂类-数学专项QA数据-英文 -/301004/ 0.005779125 10 英文-试题-生物分子数据Mol-Instructions -/300303/ 0.014933236 10 英文-网页-PPT文本数据(slideplayer +slideserve) -/400681/ 0.000239836 10 中文-垂类-交通行业书籍-ACG(纯文) -/500006/ 0.000024415 10 任务数据-任务-math + gsm8k -/401033/ 0.00514 10 中文-试题-百度教育-不挂科电力试题-知识点增强-中文【纯文】【内容】【中文】【合作】【自然】 -/200007/ 0.011311881 10 平行语料-平行语料-翻译-平行网页数据 -/300001/ 0.160349719 20 英文-百科-Wikipedia (en)英文维基百科 -/301011/ 0.000211221 10 英文-网页-researchgate纯文问答数据 -/100130/ 0.003521280 10 代码-代码-commitpackft 数据 -/100140/ 0.135185069 22 代码-代码-codeforces-v1.1 -/100157/ 0.006554617 10 代码-代码-代码知识点合成QA数据-第一批 -/400680/ 0.000256299 10 中文-垂类-电力行业标准文档(纯文) -/400667/ 0.032566909 11 中文-百科-百度百科2024年5月26日更新的全量数据(覆盖原先百科数据) -/401030/ 0.073292 10 中文-试题-刷刷题双向知识增强-单多选-中文 -/401029/ 0.055556 10 中文-试题-刷刷题双向知识增强-知识点-中文 -/300153/ 0.003525420 10 英文-对话-quora问答评论组成的多轮对话数据 
-/301002/ 0.000804454 10 英文(推理)-试题-英文数学定理证明(NaturalProofs_proofwiki) -/401013/ 0.034229525 10 中文(推理)-试题-百度文库试题 -/300081/ 0.073932628 10 英文(推理)-网页-redpajama_StackExchange过滤数学站点 -/401031/ 0.02362 10 中文-试题-百度教育-不挂科电力试题-中文 -/401035/ 0.000006635 10 中文-问答-劳动纠纷领域精编问答对【纯文字】【问答】【中文】【抓取】【自然】 -/401040/ 0.096686604 12 中文-试题-K12领域理科试题-理科专项(文本对) -/300139/ 0.018957618 10 英文-垂类-digitalcorpora文档 -/400671/ 0.007065981 10 中文-学术-spider-论文-1700w-第二批-中文 -/400391/ 0.000170387 10 中文-书籍-【说服力】心理学&情商648本书籍-文本 -/401032/ 0.08354 10 中文-试题-百度教育-百度题库-电力相关试题-中文 -/400390/ 0.015422963 11 中文-书籍-【说服力】说服力中文书籍第一批-文档 -/300075/ 0.49133022 92 英文-问答-redpjama_StackExchange -/300278/ 0.408453110 288 英文-学术-ArXiv 论文数据(LaTeX格式,主要由数学、计算机和物理学组成) -/300095/ 0.003341633 10 英文-百科-维基百科数学网页 -/300008/ 0.428304576 101 英文-书籍-Books3 长篇书籍(包含科幻与非科幻小说) -/300297/ 0.006656111 10 英文-书籍-用户上传打分分析数据-14000-pdf -/300276/ 0.002535074 10 英文-书籍-复旦书籍-2077-pdf -/300273/ 0.000041790 10 英文-书籍-行业数据-82-pdf-简单版式 -/300275/ 0.000080798 10 英文-书籍-剧本文案-86-pdf -/300021/ 0.055823765 20 英文-学术-USPTO Backgrounds 美国专利数据(仅包含被美国专利商标局批准的专利的背景章节) -/300289/ 0.012064707 10 英文-书籍-英文 Springer出版图书 -/300151/ 0.216530092 40 英文-对话-reddit英文对话(第一批) -/300022/ 0.029885555 21 英文-学术-PubMed Abstracts 生物医学论文摘要数据 -/200005/ 0.000032112 10 平行语料-平行语料-篇章级多语言平行数据-chinadaily中英频道 -/300017/ 0.252459481 47 英文-问答-Stack Exchange问答数据 -/300024/ 0.009099030 10 英文-学术-EuroParl 欧洲议会期刊数据 -/300025/ 0.005437775 10 英文-学术-PhilPapers 哲学论文数据 -/401005/ 0.001047805 10 中文(推理)-试题-小学数学-猿辅导train -/100138/ 0.012531748 10 代码-代码-code-instruction-v1.1(清洗后重新入仓) -/100142/ 0.324497172 51 代码-代码-general-code-v1.1(清洗后重新入仓) -/400150/ 0.042563868 10 中文-合成-代数与分析能力 -/401021/ 0.001276874 10 中文-试题-LaWGPT法律竞赛试题 -/200621/ 0.004428583 10 平行语料-网页-中英向量合成数据(源400116 -/300026/ 0.008649082 10 英文-学术-NIH ExPorter 生物医学专利摘要数据 -/400145/ 0.028199290 21 中文-对话-B站评论第一批 -/400685/ 0.000019176 10 中文-对话-DuConv主动聊天任务数据集(纯文) -/300084/ 0.001032644 10 英文-试题-toolbench工具调用 -/300010/ 0.007351746 10 英文-书籍-BookCorpus2 书籍数据(由未正式发表书籍组成) -/100154/ 
0.500997626 200 代码-代码-commitpack-v1 -/100155/ 0.063144033 25 代码-代码-github-issue代码数据 -/300072/ 0.608984059 131 英文-学术-redpjama_ArXiv -/400655/ 0.027240954 10 中文-对话-spider网页过滤多轮(高质量)更新 -/400373/ 0.010967084 10 中文-学术-spider-论文1700w-第一批-47w-pdf(中文) -/400105/ 0.174754822 16 中文(推理)-网页-数学网页vip库0608 -/300087/ 0.009795067 10 英文-网页-openreview论文摘要与评论 -/401034/ 0.13008 16 中文-试题-百度教育-百度题库-电力相关试题-知识点增强-中文【纯文】【内容】【中文】【合作】【自然】 -/300009/ 0.029606909 11 英文-书籍-Gutenberg (PG-19) 经典西方文学书籍数据 -/300143/ 0.471065105 110 英文-书籍-zlibary (epub解析) -/200012/ 0.239147163 29 平行语料-平行语料-翻译-中英文语料wmt -/300068/ 0.008538754 10 英文-书籍-亚马逊12w英文书单-第二批-约4000 亚马逊12w英文书单-理工科书籍-2.7w -/400120/ 0.165578080 59 中文-百科-百度百科2023年8月4日更新的全量数据(覆盖原先百科数据) -/300082/ 0.000029408 10 英文-试题-类BBH训练样本 -/400178/ 0.002457650 10 中文-垂类-中国证券报 -/300069/ 0.235950495 57 英文-数据库-PubChem,Reactome, NASA -/300113/ 0.010712575 10 英文-网页-readthedocs网页数据 -/300292/ 0.019369272 10 英文-合成-Algorithm Solver 1期&2期-英文 -/300126/ 0.007587147 10 英文-合成-低级数学运算 -/300291/ 0.035686748 13 英文-合成-Algorithm Solver 3.1期&3.2期 -/100139/ 0.003852292 10 代码-代码-code-problem-v1.1 -/100143/ 0.025920724 10 代码-代码-jupyter-v1.1(清洗后重新入仓) -/100144/ 0.023971742 10 代码-代码-jupyter-v2.1(清洗后重新入仓) -/100147/ 0.005426379 10 代码-代码-jupyter-v3.1(清洗后重新入仓) -/100148/ 0.001321260 10 代码-代码-leetcode-v1 -/100149/ 0.036570082 10 代码-代码-leetcode-v2 -/100150/ 0.045218142 10 代码-代码-codebench-v1代码数据 -/300173/ 0.155629214 23 英文-问答-里屋社区数据集:数学领域有关的问答数据 -/401139/ 0.011541056 10 中文-试题-刷刷题数据改造-多选题改单选题 -/401140/ 0.016042149 10 中文-试题-刷刷题数据改造-选择题选项顺序打乱&选择题改判断题&选择题改简答题 -/300329/ 0.305080876 51 英文-网页-【数学推理文本】dolmino-mix-1124-math-HuggingFace-退火数据 -/400724/ 0.193480222 72 中文-百科-百度百科2025年1月12日全量更新-百科团队-文本 -/400101/ 0.037382915 28 中文-文库-文库-法律(替代400055) -/200004/ 0.000053225 10 平行语料-平行语料-篇章级多语言平行数据-ft中英频道 -/400370/ 0.006267959 10 中文-专利-维普专利-9w-pdf -/300140/ 0.003650648 10 英文-网页-K12相关网页(字幕解析) -/300308/ 0.000334133 10 英文-学术-历史数据-pubscholar-9w(mask策略捞回数据) -/300314/ 0.008328762 10 英文-学术-历史数据-aminer-128w-简单版式(mask策略捞回数据) 
-/300318/ 0.274455412 34 英文-百科-【审核专项】wiki百科数据-条目(英文全量) -/300305/ 0.010346531 10 英文-网页-Goodreads评论 -/400623/ 0.014226065 10 中文-数据库-中文优质 caption -/300158/ 0.004283739 10 英文-问答-里屋社区数据集:wikihow英文问答 -/400665/ 0.018959938 10 中文-合成-Algorithm Solver 1期&2期-中文 -/400161/ 0.001164170 10 中文-对话-京东-淘宝电商对话 -/300323/ 0.000526813 10 英文-网页-【说服力】开源数据集DailyPersuasion -/300313/ 0.062847912 47 英文-网页-知识点试题改造合成数据集 -/300319/ 2.050215290 192 英文-网页-【HuggingFace开源数据集】FineMath-3plus(数学推理) -/300322/ 0.012923087 10 英文-网页-【说服力】说服力英文书籍第一批-文档 -/400181/ 0.003513481 10 中文-对话-电商客服数据 -/400656/ 0.007287521 10 中文-对话-spider网页过滤多轮(中质量) -/400660/ 0.056956065 15 中文-对话-spider-电商直播视频ASR转文本数据 -/400129/ 0.002704659 10 中文-书籍-中图简单版式4000本 -/300300/ 0.11802262 81 英文-学术-spider-论文1700w-第一批-178w-pdf(英文) -/300167/ 0.016092177 12 英文-数据库-英文优质 caption -/400024/ 0.430057396 80 中文-试题-百度教育作文/诗词 -/400387/ 0.001297218 10 中文-垂类-智慧职教(课程)(纯文) -/400381/ 0.011926724 10 中文-学术-历史数据-维普学位论文-0619(mask策略捞回数据) -/400386/ 0.000045197 10 中文-垂类-【acg-行业数据】金融行业第二批41本专业书籍-ACG(文档/文本(content)/理解&生成/通用) -/400718/ 0.000087185 10 中文-对话-小说转对话-多轮专项-文本对 -/400719/ 0.000078533 10 中文-对话-selfplay对话数据-多轮专项-文本对 -/400378/ 0.000029977 10 中文-垂类-电商直播文稿(纯文) -/400385/ 0.000011633 10 中文-垂类-【acg-行业数据】交通行业公路业务知识(纯文) -/100141/ 0.135998526 21 代码-代码-developer-community-v1.1(清洗后重新入仓) -/400646/ 0.005912905 10 中文-书籍-15w中文书籍-其他理工科书籍 -/200014/ 0.022128531 10 平行语料-平行语料-来自翻译团队的中英文平行语料 -/300096/ 0.000274601 10 英文-网页-从CC中筛选包含高等数学与科学相关公式的网页 -/300277/ 0.055427506 10 英文-网页-从commoncrawl中筛选包含高等数学与科学相关公式的网页 -/300290/ 0.049165793 10 英文-网页-数学网页专项九合一汇总 -/200620/ 0.009161019 13 平行语料-学术-中英向量合成数据(源400619 -/400239/ 0.014897824 10 中文-对话-微博-多轮数据 -/400712/ 0.000158719 10 中文-网页-【音频转纯文】得到-音视频流媒体(纯文) -/400707/ 0.013256509 10 中文-网页-问一问&播客对话数据-多轮专项 -/400716/ 0.022618291 16 中文-网页-【数据集筛选】【说服力】强说服力数据挖掘-文本 -/400710/ 0.000255895 10 中文-网页-【视频转纯文】【说服力】pdd-辩论赛比赛视频-文本(content) -/400717/ 0.001778207 10 中文-网页-【数据集筛选】【说服力】强说服力数据精加工-说服要素增强 -/400715/ 0.000123443 10 中文-网页-【音视频转纯文】【说服力】pdd-心理学音视频-文本(content) -/400713/ 
0.000076532 10 中文-网页-【音频转纯文】电商直播文稿-视频数据(文本) -/400711/ 0.000028231 10 中文-网页-【视频转纯文】【说服力】pdd奇葩说1-7季-文本(content) -/400709/ 0.000004578 10 中文-网页-【视频转纯文】【说服力】pdd-辩论赛培训视频-文本(content) -/400714/ 0.000002955 10 中文-网页-【音频转纯文】央6电影纪录片音频(文本) -/300003/ 0.002559880 10 英文-对话-Ubuntu IRCUbuntu 系统相关对话日志 -/300162/ 0.080042447 32 英文-合成-icl-ref -/300127/ 0.666819573 250 英文-对话-reddit对话数据 -/400085/ 0.026819219 11 中文-学术-10w理工科论文 -/400207/ 0.028018129 21 中文-合成-需求标签tag增强 -/400205/ 0.008679003 10 中文-合成-小红书主题tag增强 -/400626/ 0.087133313 65 中文-合成-条件约束生成数据(collie工具) -/201008/ 0.009847289 10 平行语料-试题-源401008 -/200111/ 0.010215286 10 平行语料-网页-源400111 -/400208/ 0.003940606 10 中文-合成-写作风格增强 -/400206/ 0.003422340 10 中文-合成-知乎主题tag增强 -/200106/ 0.01629529 10 平行语料-网页-源400106 -/200091/ 0.029744786 10 平行语料-网页-源400091 -/200113/ 0.061492316 13 平行语料-网页-源400113 -/200033/ 0.053508506 12 平行语料-网页-源400033 -/200010/ 0.035356092 10 平行语料-其他-源400010 -/200023/ 0.030241943 10 平行语料-网页-源400023 -/300088/ 0.011886271 10 英文-网页-维基百科nature&science词条引文外链站点数据 -/200093/ 0.083837447 18 平行语料-网页-源400093 -/200092/ 0.101052883 22 平行语料-网页-源400092 -/300302/ 0.188836192 155 英文-学术-spider-论文-1700w-第二批-英文 -/400084/ 0.003962362 10 中文-学术-文科论文解析(包含400083) -/200038/ 0 10 平行语料-问答-源400038 -/200045/ 0.018567822 10 平行语料-问答-源400045 -/200122/ 0.006794635 10 平行语料-专利-源400122 -/400382/ 0.004016988 10 中文-书籍-历史数据-中图简单版式4000本(mask策略捞回数据) -/200008/ 0.009533830 10 平行语料-试题-融合多个中文试题数据源 -/200116/ 0.002967857 10 平行语料-网页-源400116 -/200041/ 0.001845739 10 平行语料-问答-源400041 -/400222/ 0.058112990 23 中文-书籍-万话网络小说 -/400627/ 0.000476358 10 中文-合成-角色扮演数据 -/400191/ 0.000701442 10 中文-合成-角色对话 -/401026/ 0.196423801 24 中文-试题-百度教育文科题目 -/400036/ 0.213550949 81 中文-问答-百度知道 -/400151/ 0.181640498 70 中文-对话-小说&网文对话数据 -/400674/ 2.769272989 1106 中文-网页-微信公众号纯文2024.10.11存量数据 -/400675/ 0.000464246 10 中文-垂类-金融相关数据-研报&财报 -/300310/ 0.140917415 52 英文-网页-【new】benchmark网站反查-英文数据 -/300271/ 0.126941465 39 英文-网页-mathpile文本数据 -/300298/ 2.456243178 921 英文-网页-FineWeb-Edu 开源数据 -/400683/ 1.809877020 528 
中文-网页-benchmark反查站点-综合主站(6个) -/300067/ 0.052336458 39 英文-数据库-UniProt, OEIS, LIPID -/300304/ 0.364691495 136 英文-网页-Huggingface 弱势学科数据集 -/400103/ 0.161841677 70 中文-文库-文库-学前教育(覆盖400057) -/400204/ 0.022169518 16 中文-书籍-豆瓣 -/400368/ 0.056001816 42 中文-书籍-spider-书籍-52w-pdf -/400356/ 0.026642813 13 中文-书籍-复旦书籍-34504-pdf -/400192/ 0.025978778 10 中文-书籍-3.5w学科专项-简单版式 -/400364/ 0.003851156 10 中文-书籍-用户上传打分分析数据-24000-pdf -/400352/ 0.000595459 10 中文-书籍-剧本文案-3180-pdf -/400351/ 0.001016385 10 中文-书籍-通用书籍-675-pdf & 地震震例-90-pdf -/400247/ 0.000527025 10 中文-书籍-通用书籍-675-pdf -/200077/ 0.000089357 10 平行语料-网页-源100077(leetcode) -/401143/ 0.000013073 10 中文-垂类-【acg-行业数据】交通行业FAQ数据(纯文) -/401141/ 0.000002629 10 中文-垂类-【acg-行业数据】交通行业数据(专业考试数据)-ACG(纯文) -/401142/ 0.000044407 10 中文-垂类-【acg-行业数据】交通行业数据(开源数据集)(纯文) -/400697/ 0.000496174 10 中文-垂类-交通行业数据-ACG(纯文) -/401144/ 0.005225356 10 中文-垂类-【acg-行业数据】电力能源相关数据-电力问答 -/400379/ 0.007165436 10 中文-垂类-【acg-行业数据】金融行业数据-ACG(纯文) -/400698/ 0.000506602 10 中文-垂类-交通行业数据(国行标)-ACG(纯文) -/400705/ 0.000010460 10 中文-垂类-【acg-行业数据】交通法律法规(纯文) -/400375/ 0.000055653 10 中文-垂类-【acg-行业数据】交通行业数据(轨交书籍和国行标)-ACG(纯文) -/400377/ 0.000033962 10 中文-垂类-【acg-行业数据】交通行业文档(纯文) -/400376/ 0.000047017 10 中文-垂类-【acg-行业数据】交通行业数据(书籍)11.26新增(纯文) -/400699/ 0.000031349 10 中文-垂类-能源行业法规数据-国家能源局 -/400704/ 0.000730821 10 中文-对话-多领域中文多轮对话-多轮专项(文本对) -/400380/ 0.001839532 10 中文-学术-历史数据-spider-论文1700w-第一批-47w-pdf(mask策略捞回数据) -/400016/ 0.058636600 43 中文-书籍-法律 -/400678/ 1.490375738 408 中文-网页-benchmark网站反查-中文数据 -/400102/ 0.202163233 97 中文-文库-文库-行业资料(覆盖400056) -/300150/ 0.219513474 164 英文-网页-低速源核心数据第二期 -/401007/ 0.010259254 10 中文-试题-普通作文(非议论文)(web+百度文库) -/400673/ 0.012700600 10 中文-书籍-番茄小说数据 -/300098/ 0.000710771 10 英文-网页-paperswithcode站点所有methods及下面所有的方法和内容 -/100068/ 0.217081934 87 代码-starcoder-git-commits -/100069/ 0.222823771 89 代码-starcoder-github-issues -/100151/ 0.007411130 10 代码-代码-starcoder-jupyter-scripts -/100152/ 0.008740367 10 代码-代码-ee-code-v3 -/100153/ 0.000945145 10 代码-代码-ee-code-v2 -/300002/ 0.006581946 
10 英文-对话-OpenSubtitles 电影与电视节目的字幕数据 -/400657/ 0.031458149 30 中文-对话-电商直播数据 -/400371/ 0.174836572 101 中文-书籍-第二批离线数据集-书籍-txt -/6003350001/ 0.000949284 10 多语言-维基百科-Minnan -/400248/ 0.000601401 10 中文-问答-北京帕依提提财税问答数据 -/400014/ 0.019422007 14 中文-书籍-图书出版物 -/6000500096/ 0.005268196 10 多语言-CommonCrawl-布列塔尼语 -/6000690096/ 0.016805797 10 多语言-CommonCrawl-宗喀语 -/6000490096/ 0.007834570 10 多语言-CommonCrawl-波斯尼亚语 -/6000590096/ 0.004257278 10 多语言-CommonCrawl-楚瓦什语 -/300135/ 0.182460476 68 英文-网页-Clueweb22 Category B(推理高浓度) -/6001180096/ 0.015998604 10 多语言-CommonCrawl-马耳他语 -/6001050096/ 0.016079103 10 多语言-CommonCrawl-库尔德语 -/6000770095/ 0.000122776 10 多语言-维基百科-Galician -/6000820096/ 0.003101589 10 多语言-CommonCrawl-海地克里奥尔语 -/6000760096/ 0.004383107 10 多语言-CommonCrawl-苏格兰盖尔语 -/6001690096/ 0.005609511 10 多语言-CommonCrawl-土库曼语 -/6001460096/ 0.012187497 10 多语言-CommonCrawl-梵语 -/6000880096/ 0.000938697 10 多语言-CommonCrawl-伊多语 -/6001640096/ 0.017000598 10 多语言-CommonCrawl-藏语 -/6000940096/ 0.019475851 10 多语言-CommonCrawl-爱尔兰语 -/300281/ 1.112133548 417 英文-网页-RefinedWeb英文网站(推理高浓度) -/6000560096/ 0.002267842 10 多语言-CommonCrawl-车臣语 -/6001130096/ 0.009125619 10 多语言-CommonCrawl-卢森堡语 -/6001360096/ 0.001350078 10 多语言-CommonCrawl-奥塞梯语 -/6001780096/ 0.007382466 10 多语言-CommonCrawl-弗里斯兰语 -/6000450096/ 0.012277542 10 多语言-CommonCrawl-巴什基尔语 -/6001150096/ 0.006805956 10 多语言-CommonCrawl-马达加斯加语 -/6000170096/ 0.008275904 10 多语言-CommonCrawl-爪哇语 -/6001550096/ 0.011083386 10 多语言-CommonCrawl-索马里语 -/6001810096/ 0.007309711 10 多语言-CommonCrawl-意第绪语 -/6001710096/ 0.014157654 10 多语言-CommonCrawl-维吾尔语 -/6001510096/ 0.005953573 10 多语言-CommonCrawl-信德语 -/6001420096/ 0.001181368 10 多语言-CommonCrawl-罗曼什语 -/6001580096/ 0.003222900 10 多语言-CommonCrawl-巽他语 -/6001380096/ 0.017529166 10 多语言-CommonCrawl-普什图语 -/6000390096/ 0.010052251 10 多语言-CommonCrawl-阿萨姆语 -/6001340096/ 0.020935854 10 多语言-CommonCrawl-奥里亚语 -/6001750096/ 0.000748991 10 多语言-CommonCrawl-沃拉普克语 -/6001760096/ 0.000654264 10 多语言-CommonCrawl-瓦隆语 -/6001160095/ 0.000433849 10 
多语言-维基百科-Malay -/6000370096/ 0.000548524 10 多语言-CommonCrawl-阿拉贡语 -/6000660096/ 0.010642623 10 多语言-CommonCrawl-迪维希语 -/300005/ 0.004963501 10 英文-对话-YoutubeSubtitlesYoutube 字幕数据(多语言平行预料,由教育内容、流行文化与对话等数据组成) -/6000700095/ 0.000107328 10 多语言-维基百科-Estonian -/6001530095/ 0.000229382 10 多语言-维基百科-Slovak -/6000800095/ 0.000267213 10 多语言-维基百科-Greek -/6000900096/ 0.000528172 10 多语言-CommonCrawl-国际语 -/6000830095/ 0.000405922 10 多语言-维基百科-Hebrew -/6001540095/ 0.000122431 10 多语言-维基百科-Slovenian -/6000790095/ 0.000101249 10 多语言-维基百科-Georgian -/300170/ 0.031127557 23 英文-合成-条件约束生成数据(collie工具) -/300159/ 0.123790280 46 英文-问答-里屋社区数据集:StackExchange问答数据 -/6000080095/ 0.000194590 10 多语言-维基百科-Thai -/6000200095/ 0.000097229 10 多语言-维基百科-Urdu -/6000510095/ 0.000192587 10 多语言-维基百科-Bulgarian -/6001770095/ 0.000095230 10 多语言-维基百科-Welsh -/6000470095/ 0.000149749 10 多语言-维基百科-Belarusian -/6000400096/ 0.000413514 10 多语言-CommonCrawl-阿瓦尔语 -/6000630095/ 0.000127107 10 多语言-维基百科-Croatian -/6001020096/ 0.000285866 10 多语言-CommonCrawl-科米语 -/6001400096/ 0.000203648 10 多语言-CommonCrawl-克丘亚语 -/6001940001/ 0.000123159 10 多语言-维基百科-Asturian -/400632/ 0.000673756 10 中文-问答-里屋社区数据集:wikihow中文问答 -/6001090096/ 0.000299480 10 多语言-CommonCrawl-林堡语 -/6000980095/ 0.000088780 10 多语言-维基百科-Kazakh -/6000380095/ 0.000244474 10 多语言-维基百科-Armenian -/6000490095/ 0.000088780 10 多语言-维基百科-Bosnian -/6001730095/ 0.000121679 10 多语言-维基百科-Uzbek -/6001010096/ 0.021613542 10 多语言-CommonCrawl-柯尔克孜语 -/6001820096/ 0.000274791 10 多语言-CommonCrawl-约鲁巴语 -/6000810096/ 0.000289954 10 多语言-CommonCrawl-瓜拉尼语 -/6000250095/ 0.000119719 10 多语言-维基百科-Telugu -/6000230095/ 0.000084849 10 多语言-维基百科-Hindi -/400700/ 0.000071534 10 中文-书籍-小说剧本-pdd采购2024Q3 -/400374/ 0.000218060 10 中文-书籍-小说名著 -/6000690095/ 0.000153115 10 多语言-维基百科-Esperanto -/6000600096/ 0.000112903 10 多语言-CommonCrawl-康瓦尔语 -/400648/ 0.716732068 67 中文-网页-数学网页spider -/6002070001/ 0.000143369 10 多语言-维基百科-Bangla -/6001110095/ 0.000095701 10 多语言-维基百科-Lithuanian -/6001080095/ 0.000078525 10 多语言-维基百科-Latvian 
-/6000650095/ 0.000141432 10 多语言-维基百科-Danish -/6000430095/ 0.000107086 10 多语言-维基百科-Azerbaijani -/6000260095/ 0.000105641 10 多语言-维基百科-Tamil -/6001140095/ 0.000115618 10 多语言-维基百科-Macedonian -/6001170095/ 0.000068962 10 多语言-维基百科-Malayalam -/400041/ 0.201815565 151 中文-问答-新医疗问答数据 -/6003070001/ 0.000064261 10 多语言-维基百科-Simple English -/301001/ 0.000005174 10 英文-试题-英语四六级雅思(电子书) -/401012/ 0.000200636 10 中文-试题-议论文作文(web+百度教育+百度文库) -/400045/ 0.053851749 32 中文-问答-知乎 -/400702/ 0.073980175 10 中文-网页-【审核专项】wiki百科数据-条目(中文繁体全量) -/400703/ 0.062628071 10 中文-网页-【审核专项】wiki百科数据-条目(中文简体全量) -/6002030001/ 0.000054121 10 多语言-维基百科-Belarusian (Taraškievica orthography) -/6000330095/ 0.000053668 10 多语言-维基百科-Afrikaans -/6001310095/ 0.000049165 10 多语言-维基百科-Norwegian Nynorsk -/6001980001/ 0.000046495 10 多语言-维基百科-South Azerbaijani -/200003/ 0.01990948 10 平行语料-翻译-wmt / UN -/6000450095/ 0.000046053 10 多语言-维基百科-Bashkir -/6000290095/ 0.000044290 10 多语言-维基百科-Kannada -/300099/ 0.000020255 10 英文-网页-IUPAC Goldbook所有化学概念 -/400209/ 0.105112368 66 中文-合成-写作质量提升-precot -/400670/ 0.011606157 13 中文-垂类-智源纯文数据集-汽车 -/300070/ 0.009732606 10 英文-网页-reddit用户评论交流数据 -/100132/ 12.277663946 2000 代码-代码-github-v2-0415 -/400049/ 0.001149245 10 中文-书籍-计算机书籍(中文) -/300004/ 0.011594312 10 英文-对话-HackerNews 热点评论数据(由针对热点话题的用户评论组成,话题大多与计算机与企业家精神相关) -/6000520095/ 0.000037359 10 多语言-维基百科-Burmese -/6000350095/ 0.000037359 10 多语言-维基百科-Albanian -/6003360001/ 0.000032713 10 多语言-维基百科-Cantonese -/400210/ 0.053394477 35 中文-垂类-电商内部数据 -/6000300096/ 0.000037359 10 多语言-CommonCrawl-比哈尔语 -/400104/ 0.283824680 190 中文-文库-文库-others(替代400058) -/6000270095/ 0.000030058 10 多语言-维基百科-Punjabi -/6001070095/ 0.000029894 10 多语言-维基百科-Latin -/400067/ 0.003617295 10 中文-书籍-科学文库-中文理工科书籍 -/6001770096/ 0.022690344 10 多语言-CommonCrawl-威尔士语 -/6002770001/ 0.000029114 10 多语言-维基百科-Low German -/6000240095/ 0.000026230 10 多语言-维基百科-Marathi -/6002670001/ 0.000026230 10 多语言-维基百科-Minangkabau -/6000910096/ 0.000029155 10 多语言-CommonCrawl-西方国际语 -/400037/ 0.017456813 12 中文-问答-问一问 
-/6003000001/ 0.000025428 10 多语言-维基百科-Santali -/6001310096/ 0.022779158 10 多语言-CommonCrawl-新挪威语 -/6001780095/ 0.000025348 10 多语言-维基百科-Western Frisian -/6001320095/ 0.000024310 10 多语言-维基百科-Occitan -/6001150095/ 0.000024271 10 多语言-维基百科-Malagasy -/6000180095/ 0.000023993 10 多语言-维基百科-Tagalog -/6001190096/ 0.000025668 10 多语言-CommonCrawl-马恩岛语 -/6000360096/ 0.023071663 10 多语言-CommonCrawl-阿姆哈拉语 -/6002950001/ 0.000021050 10 多语言-维基百科-Western Punjabi -/6002590001/ 0.000021012 10 多语言-维基百科-Ladin -/6001620095/ 0.000020624 10 多语言-维基百科-Tajik -/6003050001/ 0.000020547 10 多语言-维基百科-Shan -/300100/ 0.000008498 10 英文-网页-NASA Exoplanet 新闻数据 -/6000390095/ 0.000020431 10 多语言-维基百科-Assamese -/400075/ 0.000779161 10 中文-书籍-3600个知识点检索得到的书籍清单 -/6000210095/ 0.000020161 10 多语言-维基百科-Hausa -/400047/ 0.015566506 11 中文-书籍-小说-行业top网文 -/6000370095/ 0.000018250 10 多语言-维基百科-Aragonese -/6000160096/ 0.022840342 10 多语言-CommonCrawl-斯瓦希里语 -/6001010095/ 0.000018213 10 多语言-维基百科-Kyrgyz -/6002150001/ 0.000017834 10 多语言-维基百科-Central Kurdish -/6001130095/ 0.000017082 10 多语言-维基百科-Luxembourgish -/6003030001/ 0.001420308 10 多语言-维基百科-Serbo-Croatian -/6001960001/ 0.000016633 10 多语言-维基百科-Kotava -/6000500095/ 0.000016372 10 多语言-维基百科-Breton -/6001630096/ 0.022942882 11 多语言-CommonCrawl-鞑靼语 -/6000870095/ 0.000016335 10 多语言-维基百科-Icelandic -/6000890095/ 0.000016261 10 多语言-维基百科-Igbo -/6001860001/ 0.000016224 10 多语言-维基百科-Alemannic -/300142/ 0.900666666 337 英文-网页-MMLU-难样本-英文 -/6001380095/ 0.000015631 10 多语言-维基百科-Pashto -/6001050095/ 0.000015153 10 多语言-维基百科-Kurdish -/400013/ 0.086267911 122 中文-书籍-龙源期刊 -/6001640095/ 0.000014713 10 多语言-维基百科-Tibetan -/6002810001/ 0.000002898 10 多语言-维基百科-N’Ko -/6001260095/ 0.000014494 10 多语言-维基百科-Nepali -/6000170095/ 0.000013803 10 多语言-维基百科-Javanese -/6003060001/ 0.000013659 10 多语言-维基百科-Sinhala -/100070/ 0.051859991 10 代码-starcoder-jupyter-structured -/400176/ 0.017917055 17 中文-合成-抽象符号推理-ascii_art -/6000940095/ 0.000013191 10 多语言-维基百科-Irish -/6000280095/ 0.000013083 10 多语言-维基百科-Gujarati -/500008/ 
0.000285964 10 任务数据-任务-汉语拆字&专有名词 -/6003250001/ 0.000012051 10 多语言-维基百科-Venetian -/6002850001/ 0.000011417 10 多语言-维基百科-Odia -/6000460095/ 0.001446829 10 多语言-维基百科-Basque -/6000590095/ 0.000011207 10 多语言-维基百科-Chuvash -/6002780001/ 0.000010720 10 多语言-维基百科-Newari -/6001070096/ 0.023282341 12 多语言-CommonCrawl-拉丁语 -/6000160095/ 0.000010065 10 多语言-维基百科-Swahili -/300145/ 0.000031086 10 英文-垂类-k12相关网页-pdf-简单版式 -/6001220095/ 0.000009723 10 多语言-维基百科-Mongolian -/300295/ 0.000133656 10 英文-对话-历史英文被过滤语料回捞数据-对话(reddit英文对话第一批) -/6002500001/ 0.000008977 10 多语言-维基百科-Khmer -/6001580095/ 0.000008943 10 多语言-维基百科-Sundanese -/6002410001/ 0.000008574 10 多语言-维基百科-Western Armenian -/6002000001/ 0.000008341 10 多语言-维基百科-Bavarian -/400155/ 0.036901387 13 中文-网页-百科优质词条外链站点数据(推理高浓度) -/300013/ 0.240575585 65 英文-网页-OpenWebText2 网页库数据(多语言) -/6003120001/ 0.000007713 10 多语言-维基百科-Silesian -/6002690001/ 0.000007680 10 多语言-维基百科-Mon -/6001060096/ 0.023799202 13 多语言-CommonCrawl-老挝语 -/6002060001/ 0.000002863 10 多语言-维基百科-Pa'O -/6003020001/ 0.000006835 10 多语言-维基百科-Scots -/6000820095/ 0.000006707 10 多语言-维基百科-Haitian Creole -/400188/ 0.054522614 40 中文-对话-贴吧v3-多轮对话-优质数据 -/6000880095/ 0.000006515 10 多语言-维基百科-Ido -/6001180095/ 0.00000642 10 多语言-维基百科-Maltese -/6001420095/ 0.000002548 10 多语言-维基百科-Romansh -/400040/ 0.001365487 10 中文-问答-新浪 -/6002200001/ 0.000006103 10 多语言-维基百科-Zazaki -/6002600001/ 0.000005914 10 多语言-维基百科-Lombard -/400012/ 0.047220190 35 中文-书籍-小说百度阅读出版物 -/6001990001/ 0.000005694 10 多语言-维基百科-Balinese -/6000280096/ 0.023643323 13 多语言-CommonCrawl-古吉拉特语 -/6000060095/ 0.002383376 10 多语言-维基百科-Indonesian -/400100/ 0.029084601 21 中文-网页-VIP库-微信公众号文本 -/6000270096/ 0.023662608 14 多语言-CommonCrawl-旁遮普语 -/6002940001/ 0.000004500 10 多语言-维基百科-Piedmontese -/6001400095/ 0.000004500 10 多语言-维基百科-Quechua -/300349/ 0.708291032 265 英文-网页-RefinedWeb英文网站(弱势学科)_minhash0.7网页全局&局部去重 -/400676/ 0.043443129 32 中文-垂类-智源纯文数据集-体育 -/6001510095/ 0.000004231 10 多语言-维基百科-Sindhi -/6002020001/ 0.000004172 10 多语言-维基百科-Central Bikol -/6001460095/ 
0.000004202 10 多语言-维基百科-Sanskrit -/6002990001/ 0.000004202 10 多语言-维基百科-Yakut -/6001920001/ 0.000004083 10 多语言-维基百科-Moroccan Arabic -/401014/ 0.000321003 10 中文-试题-BAAI Exam文科题 -/6002180001/ 0.000003790 10 多语言-维基百科-Dagbani -/6001820095/ 0.000003558 10 多语言-维基百科-Yoruba -/300311/ 0.474249004 355 英文-网页-英文知识分级退火数据 -/6001700095/ 0.000003443 10 多语言-维基百科-Twi -/6001090095/ 0.000003414 10 多语言-维基百科-Limburgish -/6003220001/ 0.000001334 10 多语言-维基百科-Tuvinian -/6003310001/ 0.000003328 10 多语言-维基百科-Mingrelian -/6001750095/ 0.000003214 10 多语言-维基百科-Volapük -/6002420001/ 0.000003271 10 多语言-维基百科-Iloko -/6000360095/ 0.000003158 10 多语言-维基百科-Amharic -/6000560095/ 0.001507632 10 多语言-维基百科-Chechen -/6001810095/ 0.000003073 10 多语言-维基百科-Yiddish -/6002400001/ 0.000002905 10 多语言-维基百科-Upper Sorbian -/6000900095/ 0.000002933 10 多语言-维基百科-Interlingua -/6002080001/ 0.000002905 10 多语言-维基百科-Bishnupriya -/6003280001/ 0.001523826 10 多语言-维基百科-Waray -/6000150095/ 0.002037323 10 多语言-维基百科-Turkish -/6001470096/ 0.00000285 10 多语言-CommonCrawl-萨丁尼亚语 -/400644/ 0.000856489 10 中文-合成-QA-style合成数据 -/6003330001/ 0.000001007 10 多语言-维基百科-Standard Moroccan Tamazight -/6003240001/ 0.000002492 10 多语言-维基百科-Uyghur -/6003150001/ 9.75e-7 10 多语言-维基百科-Tulu -/6001870001/ 9.72e-7 10 多语言-维基百科-Southern Altai -/400369/ 0.000127151 10 中文-期刊-维普期刊-1w-pdf -/400357/ 0.000013011 10 中文-对话-三联生活周刊、中国新闻周刊对话数据 -/400355/ 0.000098824 10 中文-对话-凤凰卫视-媒体逐字稿数据 -/400366/ 0.000008572 10 中文-对话-凤凰卫视媒体逐字稿-第二批 -/400367/ 0.000004555 10 中文-对话-南方人物周刊 -/400359/ 0.000002782 10 中文-对话-圆桌派视频字幕 -/6000610096/ 0.000002574 10 多语言-CommonCrawl-科西嘉语 -/6000330096/ 0.024599836 16 多语言-CommonCrawl-南非语 -/6001240095/ 0.000002249 10 多语言-维基百科-Navajo -/6002680001/ 0.000002329 10 多语言-维基百科-Manipuri -/6000660095/ 8.466e-7 10 多语言-维基百科-Divehi -/6002320001/ 0.000002061 10 多语言-维基百科-Scottish Gaelic -/6002630001/ 0.000002045 10 多语言-维基百科-Maithili -/6001730096/ 0.024639228 16 多语言-CommonCrawl-乌孜别克语 -/6002580001/ 0.000002005 10 多语言-维基百科-Ligurian -/6002100001/ 7.938e-7 10 多语言-维基百科-Russia Buriat 
-/6002860001/ 0.000001940 10 多语言-维基百科-Ossetic -/6001630095/ 0.001533951 10 多语言-维基百科-Tatar -/6003130001/ 7.728e-7 10 多语言-维基百科-Sakizaya -/300006/ 0.030760461 10 英文-对话-reddit -/6000720095/ 0.000001888 10 多语言-维基百科-Faroese -/6002720001/ 0.000001881 10 多语言-维基百科-Erzya -/6002710001/ 0.000001873 10 多语言-维基百科-Mirandese -/6002880001/ 0.000001798 10 多语言-维基百科-Pampanga -/6003010001/ 0.000001798 10 多语言-维基百科-Sicilian -/6002730001/ 0.000001756 10 多语言-维基百科-Mazanderani -/6002970001/ 0.000001716 10 多语言-维基百科-Tarantino -/6002160001/ 0.000001708 10 多语言-维基百科-Crimean Tatar -/6003320001/ 0.000001693 10 多语言-维基百科-Zeelandic -/6000310095/ 6.798e-7 10 多语言-维基百科-Abkhazian -/6003290001/ 0.000001685 10 多语言-维基百科-Wu -/6002660001/ 0.000001588 10 多语言-维基百科-Eastern Mari -/100134/ 5.251074464 2000 代码-代码-The Stack v2-train-full-ids(去除github-v2-0415已有repo) -/6002340001/ 0.000001571 10 多语言-维基百科-Goan Konkani -/6003080001/ 0.000001551 10 多语言-维基百科-Saraiki -/6001550095/ 0.000001538 10 多语言-维基百科-Somali -/6003260001/ 0.000001533 10 多语言-维基百科-Veps -/6002930001/ 6.192e-7 10 多语言-维基百科-Palatine German -/6003340001/ 0.000001513 10 多语言-维基百科-Literary Chinese -/6000970095/ 5.943e-7 10 多语言-维基百科-Kashmiri -/6003210001/ 0.000001473 10 多语言-维基百科-Tumbuka -/6003110001/ 5.598e-7 10 多语言-维基百科-Saterland Frisian -/6001760095/ 0.000001394 10 多语言-维基百科-Walloon -/6001320096/ 0.025684532 18 多语言-CommonCrawl-奥克语 -/6002270001/ 0.000001369 10 多语言-维基百科-Northern Frisian -/6000610095/ 0.000001332 10 多语言-维基百科-Corsican -/6003200001/ 5.352e-7 10 多语言-维基百科-Taroko -/6002040001/ 0.000001311 10 多语言-维基百科-Bhojpuri -/200623/ 0.065344722 48 平行语料-网页-中英向量合成数据(源400093 -/6001060095/ 0.000001296 10 多语言-维基百科-Lao -/6003270001/ 0.000001284 10 多语言-维基百科-West Flemish -/6001690095/ 0.000001264 10 多语言-维基百科-Turkmen -/6002760001/ 0.000001257 10 多语言-维基百科-Low Saxon -/6001900001/ 2.50537438e-7 10 多语言-维基百科-Angika -/6001000095/ 0.000001247 10 多语言-维基百科-Kinyarwanda -/6000810095/ 0.000001216 10 多语言-维基百科-Guarani -/6000780095/ 4.815e-7 10 多语言-维基百科-Ganda -/6001850001/ 0.000001197 10 
多语言-维基百科-Achinese -/300325/ 0.356531958 263 英文-网页-fineweb推理数据(第二批)(模糊去重) -/6002010001/ 0.000001163 10 多语言-维基百科-Samogitian -/6002210001/ 4.719e-7 10 多语言-维基百科-Lower Sorbian -/6002110001/ 4.623e-7 10 多语言-维基百科-Chavacano -/200002/ 0.044962196 33 平行语料-翻译-OPUS -/6002280001/ 4.551e-7 10 多语言-维基百科-Friulian -/6001470095/ 0.000001129 10 多语言-维基百科-Sardinian -/6000180096/ 0.025123174 19 多语言-CommonCrawl-他加禄语 -/6001410095/ 0.001561550 10 多语言-维基百科-Romanian -/6002510001/ 4.245e-7 10 多语言-维基百科-Komi-Permyak -/300086/ 0.066175197 25 英文-网页-RoBERTa-stories数据集 -/6000750095/ 2.075551707e-7 10 多语言-维基百科-Fula -/6002650001/ 0.000001018 10 多语言-维基百科-Moksha -/6002840001/ 4.032e-7 10 多语言-维基百科-Livvi-Karelian -/6002980001/ 9.954e-7 10 多语言-维基百科-Rusyn -/6001840001/ 9.858e-7 10 多语言-维基百科-Zulu -/6000140095/ 0.002091422 10 多语言-维基百科-Persian -/6002560001/ 9.672e-7 10 多语言-维基百科-Lezghian -/6003040001/ 3.801e-7 10 多语言-维基百科-Tachelhit -/6001190095/ 9.444e-7 10 多语言-维基百科-Manx -/6002220001/ 3.801e-7 10 多语言-维基百科-Doteli -/6002450001/ 3.525e-7 10 多语言-维基百科-Lojban -/400706/ 0.240076382 173 中文-网页-中文知识分级退火数据 -/6001490095/ 8.274e-7 10 多语言-维基百科-Shona -/6003090001/ 8.184e-7 10 多语言-维基百科-Inari Sami -/300312/ 0.180215637 135 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000540096/ 0.024792144 19 多语言-CommonCrawl-高棉语 -/6002750001/ 7.734e-7 10 多语言-维基百科-Neapolitan -/6002350001/ 7.602e-7 10 多语言-维基百科-Gorontalo -/6002830001/ 7.512e-7 10 多语言-维基百科-Northern Sotho -/6002790001/ 3.072e-7 10 多语言-维基百科-Nias -/6002460001/ 7.512e-7 10 多语言-维基百科-Kara-Kalpak -/300280/ 0.138689640 208 英文-专利-英文专利 -/6001020095/ 7.398e-7 10 多语言-维基百科-Komi -/6000290096/ 0.025091585 20 多语言-CommonCrawl-卡纳达语 -/6001680095/ 2.76269503e-7 10 多语言-维基百科-Tswana -/6002170001/ 2.784664743e-7 10 多语言-维基百科-Kashubian -/6001290095/ 0.00159324 10 多语言-维基百科-Norwegian -/6002050001/ 6.786e-7 10 多语言-维基百科-Banjar -/6001440095/ 1.34848023e-7 10 多语言-维基百科-Samoan -/6002120001/ 6.564e-7 10 多语言-维基百科-Mindong -/6002470001/ 6.432e-7 10 多语言-维基百科-Kabyle -/6001790095/ 2.544623304e-7 10 多语言-维基百科-Wolof -/6002900001/ 
6.324e-7 10 多语言-维基百科-Picard -/6002620001/ 1.242053655e-7 10 多语言-维基百科-Madurese -/300012/ 0.407028972 171 英文-网页-Pile-CC 网页库数据 -/400093/ 0.050580450 37 中文-网页-中文高质量网页库第二批(0616过滤) -/6000580095/ 1.231276377e-7 10 多语言-维基百科-Church Slavic -/6002570001/ 6.048e-7 10 多语言-维基百科-Lingua Franca Nova -/6000400095/ 6.066e-7 10 多语言-维基百科-Avaric -/6000910095/ 5.961e-7 10 多语言-维基百科-Interlingue -/6001280095/ 5.769e-7 10 多语言-维基百科-Northern Sami -/6002480001/ 2.28701057e-7 10 多语言-维基百科-Kabardian -/6002530001/ 2.35098237e-7 10 多语言-维基百科-Colognian -/6002190001/ 2.28701129e-7 10 多语言-维基百科-Dagaare -/6002640001/ 5.577e-7 10 多语言-维基百科-Basa Banyumasan -/6000150096/ 0.034764535 29 多语言-CommonCrawl-土耳其语 -/6002870001/ 4.338e-7 10 多语言-维基百科-Pangasinan -/6002490001/ 2.09690186e-7 10 多语言-维基百科-Kabiye -/6000600095/ 5.178e-7 10 多语言-维基百科-Cornish -/6003230001/ 5.073e-7 10 多语言-维基百科-Udmurt -/6001200095/ 5.052e-7 10 多语言-维基百科-Māori -/6000240096/ 0.025417773 22 多语言-CommonCrawl-马拉地语 -/6002370001/ 4.845e-7 10 多语言-维基百科-Hakka Chinese -/6001350095/ 1.888999455e-7 10 多语言-维基百科-Oromo -/300014/ 0.000758372 10 英文-网页-Enron Emails 邮件数据 -/6002440001/ 1.868410965e-7 10 多语言-维基百科-Jamaican Creole English -/400244/ 0.010880140 10 中文-网页-大搜—小红书数据 -/6000740095/ 0.001614875 10 多语言-维基百科-Finnish -/6002240001/ 4.392e-7 10 多语言-维基百科-Extremaduran -/6003300001/ 1.705085886e-7 10 多语言-维基百科-Kalmyk -/6002250001/ 4.191e-7 10 多语言-维基百科-Võro -/6002890001/ 4.212e-7 10 多语言-维基百科-Papiamento -/6001880001/ 1.66868598e-7 10 多语言-维基百科-Amis -/6000990095/ 8.291558e-8 10 多语言-维基百科-Kikuyu -/6000420095/ 4.068e-7 10 多语言-维基百科-Aymara -/6002960001/ 8.0616336e-8 10 多语言-维基百科-Aromanian -/6002380001/ 1.59828762e-7 10 多语言-维基百科-Hawaiian -/6002700001/ 3.969e-7 10 多语言-维基百科-Western Mari -/6002390001/ 3.828e-7 10 多语言-维基百科-Fiji Hindi -/6002260001/ 3.828e-7 10 多语言-维基百科-Arpitan -/6001620096/ 0.025905823 23 多语言-CommonCrawl-塔吉克语 -/6002330001/ 3.552e-7 10 多语言-维基百科-Gilaki -/400029/ 0.092360254 69 中文-网页-文案(作文、演讲稿等) -/6001520096/ 0.025990615 23 多语言-CommonCrawl-僧伽罗语 -/6002360001/ 1.33364553e-7 
10 多语言-维基百科-Gun -/300342/ 0.129748617 41 英文-网页-Clueweb22CategoryB(弱势学科)_minhash0.7网页全局&局部去重 -/6002540001/ 3.3e-7 10 多语言-维基百科-Ladino -/6002610001/ 1.318104054e-7 10 多语言-维基百科-Latgalian -/6003180001/ 1.275523281e-7 10 多语言-维基百科-Tongan -/6002910001/ 1.265878506e-7 10 多语言-维基百科-Nigerian Pidgin -/6000110095/ 0.002167557 10 多语言-维基百科-Vietnamese -/6001890001/ 3.048e-7 10 多语言-维基百科-Old English -/6002800001/ 1.81772176e-7 10 多语言-维基百科-Novial -/6002520001/ 2.992002771e-7 10 多语言-维基百科-Karachay-Balkar -/6001220096/ 0.025902340 24 多语言-CommonCrawl-蒙古语 -/6000120095/ 0.002226276 10 多语言-维基百科-Korean -/6003140001/ 2.80205471e-7 10 多语言-维基百科-Tayal -/6002310001/ 1.105726713e-7 10 多语言-维基百科-Guianan Creole -/6001100095/ 2.59556788e-7 10 多语言-维基百科-Lingala -/400701/ 0.121971089 91 中文-垂类-智源纯文数据集-文学 -/6003160001/ 9.2918323e-8 10 多语言-维基百科-Tetum -/6001800095/ 2.355035583e-7 10 多语言-维基百科-Xhosa -/6002430001/ 2.24536354e-7 10 多语言-维基百科-Ingush -/6002230001/ 2.245363452e-7 10 多语言-维基百科-Emiliano-Romagnolo -/400035/ 0.007915126 10 中文-网页-作文 -/6001970001/ 8.87233128e-8 10 多语言-维基百科-Awadhi -/6002300001/ 2.17273911e-7 10 多语言-维基百科-Gan -/100019/ 0.000063483 10 代码-网页-VBA编程相关网站+400054 emojiall -/100021/ 0.008660972 10 代码-markdown-各类带表格的markdown语料 -/301008/ 2.033326666 1524 英文(推理)-网页-fineweb英文推理数据 -/6002820001/ 2.06456002e-7 10 多语言-维基百科-Norman -/6002140001/ 8.29415673e-8 5 多语言-维基百科-Cherokee -/6002090001/ 2.028706863e-7 10 多语言-维基百科-Buginese -/6001480095/ 0.001647660 10 多语言-维基百科-Serbian -/400677/ 0.024186679 34 中文-垂类-金融相关数据-金融资讯 -/6003170001/ 1.92178418e-7 10 多语言-维基百科-Talysh -/6002290001/ 1.555048722e-7 10 多语言-维基百科-Gagauz -/200011/ 0.306725598 314 平行语料-平行语料-翻译-基于opus集翻译小语种数据集 -/6001910001/ 6.96567603e-8 10 多语言-维基百科-Aramaic -/400091/ 0.015889569 11 中文-网页-安全权威网页第三批并与第一批、第二批合并 -/6002920001/ 5.8982595e-8 10 多语言-维基百科-Pennsylvania German -/6001830001/ 5.8304299e-8 10 多语言-维基百科-Zhuang -/6001030095/ 2.90674728e-8 10 多语言-维基百科-Kongo -/6003100001/ 5.5772518e-8 10 多语言-维基百科-Sranan Tongo -/400654/ 0.334551889 946 中文-学术-维普论文 -/6002550001/ 
5.19275805e-8 10 多语言-维基百科-Lak -/400722/ 0.323560728 121 中文-网页-头条号截止2024年12月全量数据-Spider-文本 -/6003190001/ 4.78015863e-8 10 多语言-维基百科-Tok Pisin -/6002740001/ 1.16070562e-7 10 多语言-维基百科-Nāhuatl -/6001950001/ 4.60044882e-8 10 多语言-维基百科-Atikamekw -/6001160096/ 0.026818105 29 多语言-CommonCrawl-马来语 -/6000480095/ 2.259573e-8 10 多语言-维基百科-Bislama -/400152/ 0.023280952 39 中文-网页-百科优质词条外链站点数据(弱势学科) -/400650/ 0.143103541 107 中文-网页-vip网页库(推理高浓度) -/6000770096/ 0.027044638 30 多语言-CommonCrawl-加利西亚语 -/6000520096/ 0.026830243 30 多语言-CommonCrawl-缅甸语 -/6000730095/ 3.7514555e-8 10 多语言-维基百科-Fijian -/400144/ 0.000162475 10 中文-网页-医学-药品名和商品名 && 科学能力-xmol期刊资讯 -/400030/ 0.021575597 16 中文-网页-科技百家号 -/400026/ 0.005048637 10 中文-网页-综合新闻 -/6000030095/ 0.002705957 10 多语言-维基百科-Arabic -/6000640095/ 0.001697821 10 多语言-维基百科-Czech -/400198/ 0.036545755 27 中文-网页-低速源核心数据第二期 -/6000860095/ 0.001705734 10 多语言-维基百科-Hungarian -/400113/ 0.002695897 10 中文-网页-大学计算机数据46w -/400034/ 0.007476953 10 中文-网页-新闻 -/6000070095/ 0.002732014 10 多语言-维基百科-Portuguese -/6000460096/ 0.027628220 33 多语言-CommonCrawl-巴斯克语 -/6001930001/ 0.001712297 10 多语言-维基百科-Egyptian Arabic -/6001610095/ 2.2241634e-8 10 多语言-维基百科-Tahitian -/6001140096/ 0.027383413 34 多语言-CommonCrawl-马其顿语 -/400664/ 0.155170136 116 中文-书籍-中图豆瓣淘宝京东epub -/400645/ 0.057696071 43 中文-对话-健康医疗-多轮问诊对话 -/400112/ 0.004705202 10 中文-网页-LaWGPT法律开源数据 -/6000530095/ 0.001745842 10 多语言-维基百科-Catalan -/6000470096/ 0.027883990 36 多语言-CommonCrawl-白俄罗斯语 -/400022/ 0.000200493 10 中文-网页-党政 -/400189/ 0.048884509 36 中文-对话-贴吧v3-多轮对话-一般优质数据 -/300269/ 0.000001023 5 英文-对话-onepocket对话访谈 -/400666/ 0.003927789 10 中文-网页-剧本创作 -/300154/ 0.741774062 278 英文-学术-dolma_peS2o -/400183/ 0.000846748 10 中文-网页-健康医疗-医院、医生 -/400068/ 0.000668563 10 中文-书籍-中文古籍-3.59w -/400027/ 0.007461071 10 中文-网页-医疗 -/300015/ 0.085395505 59 英文-网页-CC-NEWS 网页库新闻数据 -/400060/ 0.027824244 20 中文-对话-贴吧v2 -/400730/ 0.012030753 90 中文-网页-vip网页库(弱势学科)_minhash0.7网页全局&局部去重 -/400149/ 0.010336823 15 中文-合成-符号替换symbol_substitution -/6000250096/ 0.027730615 39 
多语言-CommonCrawl-泰卢固语 -/400184/ 0.004399811 10 中文-网页-健康医疗-其他数据 -/400114/ 0.000069139 10 中文-网页-地理数据 -/400230/ 0.000112892 10 中文-网页-93歌词数据 -/400684/ 1.038841914 779 中文(推理)-网页-vip&大搜推理数据2023.11-2024-04 -/6000200096/ 0.028003724 40 多语言-CommonCrawl-乌尔都语 -/300274/ 0.078405061 58 英文-书籍-FreeLaw 法律数据 -/6000670095/ 0.001773496 10 多语言-维基百科-Dutch -/6001260096/ 0.027893926 41 多语言-CommonCrawl-尼泊尔语 -/400023/ 0.002560072 10 中文-网页-金融 -/6000870096/ 0.028706252 43 多语言-CommonCrawl-冰岛语 -/6001370095/ 1.63769256e-8 10 多语言-维基百科-Pali -/6001170096/ 0.028387901 44 多语言-CommonCrawl-马拉雅拉姆语 -/400033/ 0.003183366 10 中文-网页-科技 -/400233/ 0.000010543 10 中文-专业创作-保险产品条款-301-pdf -/400096/ 0.041686120 31 中文-网页-新浪及简书博客文章 -/400032/ 0.002755136 10 中文-网页-人民网 -/401002/ 1.28e-7 10 中文-试题-政治考研(web) -/300270/ 0.246624685 30 英文-网页-wiki-多语言-英语 -/6000030096/ 0.045033413 79 多语言-CommonCrawl-阿拉伯语 -/300301/ 3.306503875 2000 英文-网页-commoncrawl纯文推理数据2013-2023 -/6000380096/ 0.029266659 54 多语言-CommonCrawl-亚美尼亚语 -/401001/ 2.14666667e-7 10 中文-试题-中医考研(web) -/6000350096/ 0.029836300 56 多语言-CommonCrawl-阿尔巴尼亚语 -/400028/ 0.003871893 10 中文-网页-3c、旅游 -/200009/ 0.618969248 1196 平行语料-平行语料-翻译-opus数据集 -/400245/ 0.018641081 13 中文-网页-大搜—旅游数据 -/300347/ 0.542201090 255 英文-网页-RefinedWeb英文网站(推理中浓度)_minhash0.7网页全局&局部去重 -/6000980096/ 0.030103785 61 多语言-CommonCrawl-哈萨克语 -/400154/ 0.009107741 28 中文-网页-百科优质词条外链站点数据(推理中浓度) -/400220/ 0.000009362 10 中文-问答-金融财报人工精标问答数据 -/6001390095/ 0.001896546 10 多语言-维基百科-Polish -/400121/ 0.143819858 107 中文-网页-中文创作spider网页库 -/6001600095/ 0.001926623 10 多语言-维基百科-Swedish -/6000790096/ 0.030429894 65 多语言-CommonCrawl-格鲁吉亚语 -/400629/ 0.000176079 10 中文-网页-93歌词(第二批) -/400173/ 0.006682723 10 中文-合成-抽象符号推理-字符串处理-第一批 -/400020/ 0.020552609 15 中文-网页-小红书 -/400737/ 0.025183065 71 中文-网页-SE网页库9月例行(弱势学科)_minhash0.7网页全局&局部去重 -/6000060096/ 0.048800635 108 多语言-CommonCrawl-印度尼西亚语 -/6000430096/ 0.030468871 68 多语言-CommonCrawl-阿塞拜疆语 -/400159/ 0.040556361 30 中文-网页-SE网页库9月例行(推理高浓度) -/6000040095/ 0.003087481 10 多语言-维基百科-Spanish -/400177/ 
0.000012448 10 中文-网页-上海城市法规全书 -/6001720095/ 0.001927410 10 多语言-维基百科-Ukrainian -/100024/ 0.000308844 10 代码-代码-各语言编程网站 -/100027/ 0.000865627 10 代码-代码-github证明题代码 -/6000140096/ 0.042191683 111 多语言-CommonCrawl-波斯语 -/400133/ 0.056244973 228 中文-合成-中文棋类内容(围棋、中国象棋、国际象棋) -/400682/ 0.048176924 36 中文-垂类-党政政策解读、分析相关数据 -/6009990096/ 0.032153579 87 多语言-CommonCrawl-其他 -/6000050095/ 0.003165565 10 多语言-维基百科-Russian -/6000070096/ 0.051388692 142 多语言-CommonCrawl-葡萄牙语 -/6000190095/ 0.002014799 10 多语言-维基百科-Italian -/6000260096/ 0.031766319 91 多语言-CommonCrawl-泰米尔语 -/6000130095/ 0.002756493 10 多语言-维基百科-Japanese -/400215/ 0.001864484 10 中文-问答-里屋社区数据集:知乎问答 -/6000220096/ 0.032028172 96 多语言-CommonCrawl-孟加拉语 -/6000090095/ 0.002745445 10 多语言-维基百科-French -/300344/ 0.029037864 34 英文-网页-Clueweb22CategoryB(推理中浓度)_minhash0.7网页全局&局部去重 -/6001080096/ 0.032613506 110 多语言-CommonCrawl-拉脱维亚语 -/6000670096/ 0.033514756 115 多语言-CommonCrawl-荷兰语 -/6000700096/ 0.033573350 116 多语言-CommonCrawl-世界语 -/400164/ 0.000101043 10 中文-网页-电商-京东商品数据 -/300348/ 0.141539919 530 英文-网页-RefinedWeb英文网站(推理低浓度)_minhash0.7网页全局&局部去重 -/6000100095/ 0.00284676 10 多语言-维基百科-German -/300294/ 0.116943048 87 英文-网页-历史英文被过滤语料回捞数据-网页(Dolma CC、BAAI-MTP) -/300296/ 0.000004768 10 英文-网页-OpenNewsArchive 新闻数据集 -/6002130001/ 0.002161981 10 多语言-维基百科-Cebuano -/400741/ 0.021644925 107 中文-网页-悟道_minhash0.7网页全局&局部去重 -/400635/ 3.6982e-7 5 中文-对话-似是故人来第一季唱词10篇 -/400025/ 0.000283245 10 中文-网页-财经 -/6000090096/ 0.047730460 210 多语言-CommonCrawl-法语 -/6001540096/ 0.035161323 155 多语言-CommonCrawl-斯洛文尼亚语 -/400017/ 0.003022108 10 中文-网页-百度经验 -/6000230096/ 0.035196511 173 多语言-CommonCrawl-印地语 -/100023/ 0.002430262 10 代码-pytorch-使用pytorch框架的python代码 -/6000740096/ 0.036302397 190 多语言-CommonCrawl-芬兰语 -/400630/ 0.000366207 10 中文-网页-10万条药品说明书 -/400238/ 0.000706079 10 中文-网页-3个作文网 -/6001290096/ 0.036735728 202 多语言-CommonCrawl-挪威语 -/6000650096/ 0.036725799 203 多语言-CommonCrawl-丹麦语 -/400019/ 0.003125926 10 中文-网页-Job Description -/6001110096/ 0.036752247 208 多语言-CommonCrawl-立陶宛语 
-/400728/ 0.309798380 232 中文-网页-vip网页库(推理中浓度)_minhash0.7网页全局&局部去重 -/400099/ 0.002636629 10 中文-网页-歌词、笑话、菜谱数据集 -/6000080096/ 0.048579735 296 多语言-CommonCrawl-泰语 -/6001480096/ 0.037829073 246 多语言-CommonCrawl-塞尔维亚语 -/400237/ 0.000061661 10 中文-网页-健康-药品说明书 -/300336/ 1.307952901 2000 英文-网页-CC-MAIN-英语-合并-202405_minhash0.7网页全局&局部去重 -/400690/ 0.014820299 11 中文-垂类-智源纯文数据集-农业 -/400694/ 0.121218750 90 中文-垂类-智源纯文数据集-教育 -/400687/ 0.032787332 46 中文-垂类-智源纯文数据集-医疗 -/6000120096/ 0.051928047 359 多语言-CommonCrawl-朝鲜语 -/400696/ 0.000932791 10 中文-垂类-国家法律法规数据库 -/400691/ 0.001886995 10 中文-垂类-国家、行业、企业标准等相关数据 -/400688/ 0.148094313 111 中文-垂类-金融相关数据-ACG -/6000530096/ 0.038919130 278 多语言-CommonCrawl-加泰罗尼亚语 -/400689/ 0.000013803 10 中文-垂类-44本金融行业重点书籍-ACG(纯文) -/6000630096/ 0.038588745 279 多语言-CommonCrawl-克罗地亚语 -/300306/ 0.001757989 10 英文-网页-CNN-DailyMail-newspaper-新闻摘要 -/300337/ 1 2000 英文-网页-commoncrawl_minhash0.7网页全局&局部去重 -/6001530096/ 0.038953692 298 多语言-CommonCrawl-斯洛伐克语 -/6001600096/ 0.039239266 306 多语言-CommonCrawl-瑞典语 -/300343/ 0.010931487 35 英文-网页-Clueweb22CategoryB(推理低浓度)_minhash0.7网页全局&局部去重 -/400729/ 0.143491965 538 中文-网页-vip网页库(推理低浓度)_minhash0.7网页全局&局部去重 -/6000830096/ 0.039081741 315 多语言-CommonCrawl-希伯来语 -/6000510096/ 0.038850066 314 多语言-CommonCrawl-保加利亚语 -/400692/ 0.000476538 10 中文-书籍-【网络小说】ppd采购-2024Q3 -/400731/ 0.490618665 1051 中文-网页-VIP库例行生产_minhash0.7网页全局&局部去重 -/400634/ 0.000326967 10 中文-网页-1954年到2023年全国各省直辖市地级市政府报告 -/400735/ 0.464511214 1741 中文-网页-ext数据_minhash0.7网页全局&局部去重 -/400734/ 0.321270648 481 中文-网页-se数据_minhash0.7网页全局&局部去重 -/400732/ 0.243574094 182 中文-网页-VIP例行生产2024.03-04_minhash0.7网页全局&局部去重 -/400742/ 0.010588006 130 中文-网页-CC94份中文合并数据(简体)_minhash0.7网页全局&局部去重 -/400658/ 0.093877701 70 中文-网页-低速源核心数据第一期&第二期 -/400647/ 0.006078744 79 中文-网页-中文高点击网页库/高质量网页库(0616过滤) -/400668/ 0.005336899 69 中文-网页-CC94份中文合并数据(繁体) -/400362/ 0.013446240 19 中文-网页-OpenNewsArchive 新闻数据集 -/400663/ 0.017843859 13 中文-网页-中文网页-党政官媒类高质量站点抓取 -/400661/ 0.000288970 10 中文-网页-93歌词数据 -/400365/ 0.000031484 10 中文-网页-教案库数据 
-/400190/ 0.080211077 60 中文-对话-贴吧v3-多轮对话-中等质量数据 -/6000190096/ 0.039903901 350 多语言-CommonCrawl-意大利语 -/400721/ 0.121492969 911 中文-书籍-百度小说(全量) -/400649/ 0.953990031 715 中文-专利-中国专利 -/400153/ 0.002920912 44 中文-网页-百科优质词条外链站点数据(推理低浓度) -/400672/ 0.078886848 59 中文-网页-小红书纯文2024.10.10存量数据 -/400642/ 0.002630645 41 中文-合成-写作要求指令增强-中文 -/400240/ 4.3802708e-8 10 中文-网页-化妆品、三品一械相关法规条例 -/6001390096/ 0.041045574 436 多语言-CommonCrawl-波兰语 -/6001720096/ 0.041580866 484 多语言-CommonCrawl-乌克兰语 -/6000100096/ 0.056805958 672 多语言-CommonCrawl-德语 -/400740/ 0.541951715 2000 中文-网页-dadu库数据_minhash0.7网页全局&局部去重 -/6000860096/ 0.042586047 543 多语言-CommonCrawl-匈牙利语 -/300131/ 1.376364399 1028 英文-网页-Dolma CC( 2020–05~2023–06)(推理高浓度) -/400171/ 0.028050290 42 中文-合成-抽象符号推理-编解码(非COT) -/400199/ 0.016500530 352 中文-网页-裁判文书网全量数据(截止2021年) -/400065/ 0.000660962 10 中文-网页-cot-裁判文书(上海高院) -/300338/ 0.168918041 920 英文-网页-DolmaCC(2020–05~2023–06)(弱势学科)_minhash0.7网页全局&局部去重 -/400163/ 0.007696080 187 中文-合成-百度搜索行为数据 -/6001410096/ 0.044887275 716 多语言-CommonCrawl-罗马尼亚语 -/6000800096/ 0.044028267 726 多语言-CommonCrawl-希腊语(现代,1453–) -/400693/ 0.000461641 10 中文-网页-播客音频洗出对话数据-汉语 -/400695/ 0.001879579 49 中文-网页-里屋社区数据集MNBVC-CommonCrawl中清洗出来的通用文本数据 -/6000110096/ 0.06285045 1411 多语言-CommonCrawl-越南语 -/6000640096/ 0.047711892 1073 多语言-CommonCrawl-捷克语 -/400659/ 0.018200173 13 中文-书籍-zlibary (pdf解析)-简单版式 -/400739/ 0.001365763 45 中文-网页-SE网页库9月例行(推理中浓度)_minhash0.7网页全局&局部去重 -/400241/ 0.103746891 77 中文-网页-大搜-微信数据 -/6000050096/ 0.078562287 2000 多语言-CommonCrawl-俄语 -/6000040096/ 0.081746977 2000 多语言-CommonCrawl-西班牙语 -/400119/ 0.015833751 846 中文-网页-爱企查判决文书 -/6000130096/ 0.072749946 2000 多语言-CommonCrawl-日语 -/400242/ 0.140827638 105 中文-网页-大搜-知乎专栏数据 -/400132/ 0.011133470 727 中文-合成-中文牌类内容(斗地主、麻将、UNO) -/300288/ 0.212246929 79 英文-论文-mag-简单版式 -/400174/ 0.078907165 118 中文-合成-抽象符号推理-古典密码(非COT) -/400172/ 0.065238524 97 中文-合成-抽象符号推理-编解码(COT) -/400243/ 0.180635069 135 中文-网页-大搜—法律文书数据 -/400175/ 0.105579055 158 中文-合成-抽象符号推理-古典密码(COT) -/400738/ 0.000714383 71 
中文-网页-SE网页库9月例行(推理低浓度)_minhash0.7网页全局&局部去重 -/400160/ 0.000155801 49 中文-网页-电商-ugc数据 -/300340/ 0.016854611 741 英文-网页-DolmaCC(2020–05~2023–06)(推理中浓度)_minhash0.7网页全局&局部去重 -/300286/ 0.105916101 149 英文-书籍-zlibary (pdf解析)-简单版式 -/300287/ 0.200661735 150 英文-书籍-archive-pdf-简单版式 -/300163/ 0.212951405 79 英文-论文-aminer-128w-简单版式 -/300339/ 0.005421938 990 英文-网页-DolmaCC(2020–05~2023–06)(推理低浓度)_minhash0.7网页全局&局部去重 -/400733/ 0.720390371 1080 中文-网页-大搜—vip2.0数据_minhash0.7网页全局&局部去重 -/300124/ 0.000005793 571 英文-网页-BAAI-MTP英文语义向量模型 BGE-1.7亿条 -/400148/ 5.68345553e-7 216 中文-网页-BAAI-MTP中文语义向量模型 BGE-1.1亿条 -/301044/ 0.000322066 10 代码-代码-code-problem-v2-codecademy -/301045/ 0.000659759 10 代码-代码-code-problem-v2-programiz -/301040/ 0.001326477 10 代码-代码-benchmark-instruction-evo-241212-合成 -/301046/ 0.002573786 10 代码-代码-代码知识点合成QA数据(CodeBench)-第四批 -/code-1/ 0 42 代码-代码-code-log-synthetic-250106 -/code-2/ 0 12 代码-代码-code-log-synthetic-250207 -/301028/ 0.036818530 10 代码-代码-code-problem-v2-exercism -/301043/ 0.000911915 10 代码-代码-code-problem-v2-coderbyte -/100166/ 0.001392763 10 代码-代码-SVG代码理解数据集 -/100167/ 0.000290147 10 代码-代码-SVG代码HF合集-tu-berlin-svgs-纯文 -/300331/ 0.000180343 10 中文-网页-CK12数学-纯文(文档解析) -/400396/ 0.002094046 10 中文-网页-大学教材习题册171本-数学专项-文本 -/401056/ 0.004770410 10 中文-网页-华律网问答数据【文本对】【问答】【中文】【合作】【自然】 -/300334/ 0.000002737 10 中文-网页-CK12数学-纯文(视频数据纯文本改造) -/300335/ 0.009313413 10 中文-网页-CommonCrawl数学站点筛选【纯文】 -/401059/ 0.000778927 10 中文-网页-E考试网试题每日一练(资格考试为主)-文本对 -/301048/ 0.053228242 10 中文-试题-百度教育第一批精品试题格式优化-english-文本对 -/401060/ 0.003291972 10 中文-试题-百度教育第一批精品试题格式优化-politics-文本对 -/401061/ 0.005437544 10 中文-试题-百度教育第一批精品试题格式优化-geography-文本对 -/401062/ 0.082348446 10 中文-试题-百度教育第一批精品试题格式优化-chinese-文本对 -/401063/ 0.036789083 10 中文-试题-百度教育第一批精品试题格式优化-chemistry-文本对 -/401065/ 0.201537322 10 中文-试题-百度教育第一批精品试题格式优化-math-文本对 -/401066/ 0.017092634 10 中文-试题-百度教育第一批精品试题格式优化-history-文本对 -/401067/ 0.117661457 14 中文-试题-百度教育第一批精品试题格式优化- other-文本对 -/401068/ 0.015326292 10 中文-试题-百度教育第一批精品试题格式优化-politics2-文本对 -/401151/ 
0.000659135 10 中文-试题-百度教育第一批精品试题格式优化-大学-文本对 -/401064/ 0.028558718 10 中文-试题-百度教育第一批精品试题格式优化-physics-文本对 -/301029/ 0.011923442 10 中文-试题-math.stackexchange试题-文本对 -/301037/ 0.091007017 11 中文-试题-brainly教育问答&试题数据第一批 -/301038/ 0.180398182 22 中文-试题-brainly教育问答&试题数据第二批 -/401051/ 0.142497603 17 中文-试题-K12理科试题202412(20250107试题更新)-数学专项-文本对 -/2025021800000001/ 0.291133333 250 中文-网页-创作类CPT-part-1 -/2025021800000002/ 2.217066666 2000 中文-网页-创作类CPT-part-2 -/2025021800000003/ 0.825066666 960 中文-网页-创作类CPT-part-3 -/200046/ 0.000204372 10 多语言-CommonCrawl-中翻混合语种句对-汉语+印尼-1453544(zhongwen_yini) -/200047/ 0.000943932 10 多语言-CommonCrawl-中翻混合语种句对-汉语+ja-348w(zh_ja) -/200048/ 0.000022979 10 多语言-CommonCrawl-中翻混合语种句对-汉语+han-10w(zh_han) -/200049/ 0.000051249 10 多语言-CommonCrawl-中翻混合语种句对-jazh+jiongrong-9w -/200050/ 0.000445868 10 多语言-CommonCrawl-中翻混合语种句对-en2th-114905 -/200051/ 0.000254730 10 多语言-CommonCrawl-中翻混合语种句对-波斯语+汉语(bosi_zh) -/200052/ 0.000223686 10 多语言-CommonCrawl-中翻混合语种句对-波斯语+汉语-1500000(bosi_zh) -/200053/ 0.001152494 10 多语言-CommonCrawl-中翻混合语种句对-越南语+汉语-400w(ViZh_400w) -/200054/ 0.000158281 10 多语言-CommonCrawl-中翻混合语种句对-汉语+hi-61w(zh_hi) -/200055/ 0.001010838 10 多语言-CommonCrawl-中翻混合语种句对-汉语+韩语-507.4w(zhko) -/200056/ 0.005276874 10 多语言-CommonCrawl-中翻混合语种句对-汉语+meng-250w(han_meng) -/200057/ 0.000377929 10 多语言-CommonCrawl-中翻混合语种句对-缅甸语+汉语-120w(miandian_zhongwen) -/200058/ 0.000180456 10 多语言-CommonCrawl-中翻混合语种句对-汉语+高棉-50w(zhongwen_gaomian) -/200059/ 0.000575149 10 多语言-CommonCrawl-中翻混合语种句对-汉语+蒙-20w(zhong_meng) -/200060/ 0.000218231 10 多语言-CommonCrawl-中翻混合语种句对-汉语+老挝-61w(zhongwen_laowo) -/200061/ 0.001685358 10 多语言-CommonCrawl-中翻混合语种句对-汉语+日语(zh_jp) -/200062/ 0.000348243 10 多语言-CommonCrawl-中翻混合语种句对-汉语+蒙语(zh_waimeng) -/200063/ 0.000766032 10 多语言-CommonCrawl-中翻混合语种句对-汉语+韩语(zh_ko) -/200064/ 0.000110161 10 多语言-CommonCrawl-中翻混合语种句对-汉语+尼泊尔语(zh_nepal) -/200065/ 0.000260237 10 多语言-CommonCrawl-中翻混合语种句对-汉语+缅甸语(zh_mya) -/200067/ 0.000028358 10 多语言-CommonCrawl-中翻混合语种句对-汉语+印度尼西亚语(zh_ind) -/200068/ 0.000763076 10 
多语言-CommonCrawl-中翻混合语种句对-汉语+德语(zh_deu) -/200069/ 0.000656297 10 多语言-CommonCrawl-中翻混合语种句对-汉语+越南语(zh_vie) -/200070/ 0.000081524 10 多语言-CommonCrawl-中翻混合语种句对-汉语+朝鲜语(zh_kor) -/200066/ 0.000304197 10 多语言-CommonCrawl-中翻混合语种句对-汉语+英语(zh_eng) -/6000040097/ 0.004690451 35 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000100097/ 0.004524982 33 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/300307/ 9.617887232 1999 英文-网页-FineWeb全量数据集 -/6000070097/ 0.000816024 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000090097/ 0.001257192 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000190097/ 0.001938434 14 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000510097/ 0.000010060 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000640097/ 0.007286504 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000670097/ 0.000075349 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000700097/ 0.000463792 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000740097/ 0.000264480 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000800097/ 0.000007447 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000860097/ 0.001060364 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6001110097/ 0.000036506 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6001180097/ 0.000015436 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6001390097/ 0.008755500 13 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6001540097/ 0.000402673 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6001600097/ 0.001056753 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6001410097/ 0.001735976 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6000650098/ 0.000006720 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/6001530097/ 0.000255724 10 英文-垂类-【审核专项】legal-mc4-法律开源数据集 -/200624/ 0.000002979 10 中文-网页-时政党史专业翻译语料-天津外国语大学-文本对 -/400725/ 0.014728966 11 中文-对话-【音频转纯文】喜马拉雅ASR数据(纯文) -/400393/ 0.025928793 19 中文-垂类-电力行业采购的4万+专业书籍-ACG-文本 -/401055/ 0.000050154 10 中文-试题-AnHuiDianxinZhidao(纯文-5.6万条安徽电信问答数据) -/400394/ 0.008479758 12 中文-专利-维普期刊12月增量50万+期刊论文-维普(文本) -/400395/ 0.000202317 10 中文-网页-剧本段子文案数据-pdd-纯文 -/400726/ 0.011834500 10 中文-对话-【音频转纯文】爱奇艺ASR数据(文本) -/300330/ 0.081723645 61 英文-网页-【音频转纯文】Youtube ASR数据(文本) -/300333/ 0.000570876 10 英文-网页-STEAM游戏信息数据-纯文 -/401149/ 0.003821260 10 
中文-试题-百度教育第二批K12试题-教辅拆解、合作采买-文本对-有答案解析 -/401150/ 0.001438031 10 中文-试题-百度教育第二批K12试题-教辅拆解、合作采买-文本对-无答案解析 -/301041/ 0.000030120 10 英文-垂类-MedQuad-MedicalQnADataset医疗问答-医疗开源Benchmark -/301042/ 0.002053802 10 英文-问答-Math ematics数学问答数据 -/301047/ 0.078382249 10 英文-试题-K12理科专项合成试题20250210-英文-文本对 -/401057/ 0.532893024 66 中文-试题-K12理科专项合成试题20250210-中文-文本对 -/400744/ 0.020501561 10 中文-问答-数学书籍知识点合成数据-中文 -/10000100010011/ 0.005288875 10 英文-试题-chegg数学试题20250216第1.3批220万题-文本对 -/10000100010012/ 0.008434328 10 英文-试题-chegg数学试题20250216第1.2批350万题-文本对 -/10000100010013/ 0.004758764 10 英文-试题-chegg数学试题20250216第1.1批200万题-文本对 -/10000100010015/ 0.021962204 10 英文-试题-chegg数学试题20250221第二批1000万题-文本对 -/10000100010016/ 0.020773026 10 英文-试题-【多模转纯文】 chegg图文试题转纯文 -/10000100010021/ 0.049392972 10 英文-试题-chegg数学试题20250224第三批1000万题-文本对 -/10000100010025/ 0 10 英文-试题-chegg图文数学试题转纯文20250216、20250221、20250224共三批2800万题-文本对 -/10000100010020/ 0.031240055 10 英文-问答-Quora 1亿问答20250219第二批-文本对 -/10000100010017/ 0.015085636 10 英文-问答-Quora 1亿问答20250219-文本对 -/301050/ 0.001224881 10 英文-问答-medical-qa-datasets医疗问答-医疗开源Benchmark -/400746/ 0.000098480 10 中文-网页-【生物数据】prnasia 美通社 -/400750/ 0.000004216 10 中文-网页-【说服力】【视频转纯文】主持人大赛 -/401152/ 0.000002475 10 中文-问答-弱智吧-HuggingFace-文本对 -/10000100010032/ 0.211483136 79 英文-问答-Quora 1亿问答20250219第三批-文本对 -/301051/ 0.002882570 10 英文-问答-CUAD法律合同数据-法律开源Benchmark -/10000100000009/ 0.000216301 10 英文-网页-【生物数据】biospace -/300350/ 0.000080930 10 英文-网页-【生物数据】ascopub -/10000100000004/ 0.084811013 63 英文-问答-红石REDSTONE-Open Question Aswering -文本 -/10000100000002/ 0.000038184 10 英文-网页-【音频转纯文】TED纯文数据 -/10000200000003/ 5.76293333333333e-7 10 英文-问答-红石REDSTONE-Open Question Aswering -文本 -/301056/ 0.001646013 10 中文-问答-baike_qa2019(问答150 万个)数据 -/400743/ 0.000017539 10 中文-网页-【说服力】沟通课程与文案-文档 -/400748/ 0.000066221 10 中文-网页-金融政策数据20250214-ACG-文本 -/10000200000004/ 0.005043936 10 中文-对话-MEG电话客服数据转纯文角色对话数据 -/10000200000006/ 0.007739220 10 中文-对话 -MEG-CRM电话销售数据转纯文角色对话数据 -/10000200000008/ 0.019111735 12 
中文-网页-大搜金融站点数据20250214-ACG-wenda -/10000200000010/ 0.497842698 1066 中文-网页-百家号2025.02存量数据-文本 -/400234/ 0.000450864 10 中文-问答-NL2SQL_开源数据 -/400639/ 0.000307161 10 中文-合成-NL2SQL_合成数据 -/400745/ 0.079470070 119 中文-书籍-113万图书-中文在线(文本)-epub -/400747/ 0.010073886 15 中文-书籍-【音频转纯文】懒人听书数据第一批(ASR数据)(文本) -/400749/ 0.017416038 26 中文-书籍-113万图书-中文在线(文本)-txt -/401148/ 0.000324319 10 中文-问答-token字数训练语料-信息处理专项-文本 -/10000200000005/ 0.019837646 29 中文(推理)-网页-vip&大搜数学数据2023.11-2024-04【Loop0】 -/300351/ 0.002377099 10 英文-书籍-【说服力】说服力书籍第二批-pdf文档 -/400399/ 0.006603937 10 中文-书籍-【说服力】说服力书籍第二批-pdf文档 -/10000100010006/ 0.003533202 10 英文-问答-BBH专项提升数据-dyck_languages-文本对 -/10000100010028/ 0.001203027 10 英文(推理)-试题-BBH专项提升数据-word_sorting-文本对 -/10000200000007/ 0.002033263 10 英文-书籍-【说服力】说服力书籍第二批-epub文档 -/10000100000001/ 0.045815276 17 英文-网页-红石REDSTONE-MATH-文本 -/10000100010014/ 0.000665171 10 英文-试题-REDSTONE-MultiChoiceQuestion-文本对 -/10000200000001/ 0.000003050 10 英文(推理)-网页-红石REDSTONE-MATH-文本 -/10000100010024/ 0.034897821 13 英文(推理)-试题-英文数学问答题答案解析生成第一批 -/10000200020016/ 0.020399428 10 中文(推理)-试题-中文数学问答题答案解析生成第一批 -/10000100000005/ 0.007139503 10 英文(推理)-网页-fineweb数学网页数据【Loop0】 -/10000100010034/ 0.062262077 23 英文-试题-【benchmark反查】huggingface开源学科数据集 -/10000100010008/ 0 10 英文-问答-BBH专项提升数据-salient_translation_error_detection-文本对 -/10000200020010/ 0 10 英文-合成-BBH专项提升数据-logic_deduction-文本对 -/10000100010027/ 0 10 英文(推理)-试题-BBH专项提升数据-ruin_names-文本对 -/10000100010029/ 0.000125882 10 英文(推理)-试题-BBH专项提升数据-geometric_shapes-文本对 -/10000100020007/ 0 10 英文(推理)-试题-BBH专项提升数据-snarks-文本对 -/400398/ 0.002114105 10 英文-问答-K12&特殊教育文档-深圳教育云-纯文 -/401153/ 0.003500047 10 中文-试题-鑫创职业资格试题-文本对 -/401155/ 0.040426706 10 中文-试题-百度教育精品试题2月份例行更新-文本对 -/10000200020015/ 0.053691484 10 中文(推理)-试题-鑫创K12中文试题-文本对 -/10000100010022/ 0.00032586 10 英文(推理)-试题-aopsonline数学竞赛-文本对 -/10000100010023/ 0.000095751 10 英文(推理)-网页-benchmark反查网站覆盖:web2.0calc.com/questions/ -/10000100000013/ 0.000883960 10 英文-问答-gauthmath所有学科knowledge-文本 -/10000100010033/ 0.045247112 10 
英文-试题-凤凰智媒quizlet英文试题-文本对 -/10000100000014/ 0.005132933 10 英文-网页-原始Common Crawl数据数学站点筛选网页【2023第一批】 -/301049/ 0.000032657 10 英文-试题-AMC/AIME/BMO/IMO 试题-文本对 -/401058/ 0.000010356 10 中文-问答-脑筋急转弯1万题0212-推理专项-文本对 -/10000100010018/ 0 36 英文(推理)-试题-K12理科专项合成试题20250227-文本对(英文) -/10000100010019/ 0.017926272 10 英文(推理)-试题-brainly试题20250222-20250226期间增量-文本对 -/10000200020012/ 0.000032824 10 中文(推理)-试题-33iq智力题第二批-本文对 -/10000200020013/ 0.000030966 10 中文-试题-2025年考研数学真题-试题 -/numiamath_query/ 0.003930693 10 英文-试题-Numiamath query 改写试题数据第一批 -/10000100000006/ 0.003247139 10 英文-试题-弱势学科textbook增强合成数据-anatomy -/10000100000007/ 0.013114467 10 英文-试题-弱势学科textbook增强合成数据-professional_accounting -/10000100000008/ 0.009572875 10 英文-试题-弱势学科textbook增强合成数据-formal_logic -/10000100010030/ 0.042406978 10 英文-试题-brainly试题20250227-20250302期间增量-文本对 -/10000100010031/ 0.029642469 10 英文-试题-澳鹏homework.study英文试题-文本对 -/10000100000015/ 0.000394343 10 英文-书籍-【生物数据】business wire -/10000100000016/ 0.011167630 10 英文-书籍-【说服力】说服力书籍第三批-epub文档 -/10000100000017/ 0.036646001 10 英文-书籍-【说服力】说服力书籍第三批-pdf文档 -/10000100000018/ 0.85233422 213 英文-网页-【benchmark反查】huggingface.co math-ai/AutoMathText数据集 -/10000100010035/ 0.001839838 10 英文-问答-MedicalQA-医疗开源Benchmark -/10000100010036/ 0.011245594 10 英文-问答-【benchmark反查】huggingface.co math-ai/StackMathQA数据集 -/10000100010037/ 0.000048483 10 英文-问答-MedQA-USMLE-4-options医疗考试选择题-医疗开源Benchmark -/10000100010038/ 0.000573528 10 英文-问答-MedMCQA选择题-医疗开源Benchmark -/10000100010039/ 0.005300162 10 英文-问答-medical-question-answering-datasets医疗问答-医疗开源Benchmark -/10000100010040/ 0.000385748 10 英文-问答-PubMedQA-医疗开源Benchmark -/10000100010041/ 0.277890436 34 英文-试题-K12理科专项合成英文试题20250227(数据迭代更新)-文本对 -/10000200000011/ 0.000055442 10 中文-网页-【说服力】【音视频转纯文】销售培训课程 -/10000200000012/ 0.000007219 10 中文-网页-中国科普博览文物数据-纯文 -/10000200000013/ 0.000300687 10 中文-网页-【说服力】【音视频转纯文】沟通课程 -/10000200000014/ 0.000119291 10 中文-网页-咨询课程文档 -/10000200000015/ 0.004665145 10 中文-网页-【acg-行业数据】金融研报数据20250214-ACG-文档 -/10000200000016/ 0.007079626 10 
中文-网页-【说服力】说服力书籍第三批-epub文档 -/10000200000017/ 0.009367401 10 中文-网页-【说服力】说服力书籍第三批-pdf文档 -/10000100010043/ 0.015168132 10 英文-试题-brainly试题20250303-20250310期间美国站增量-文本对 -/10000100010044/ 0.028362376 10 英文-试题-chegg20250303第4.1批STEM试题706w题-文本对 -/10000100010045/ 0.047063267 10 英文-试题-chegg图文数学试题转纯文第二批图转文144w-文本对 -/10000100010046/ 0.016908574 10 英文-试题-chegg图文数学试题转纯文第一批50w数据-fix -/10000100010048/ 0.020887173 10 英文-试题-开源数学QA数据_英文_不带模板 -/10000100020008/ 0.000001064 10 英文-试题-【补】AMC/AIME/BMO/IMO 试题-文本对 -/10000200000020/ 0.001733574 10 中文-书籍-计算机书籍(中文)(源400049) -/10000200020019/ 0.018817498 10 中文-试题-开源数学QA数据_中文_不带模板 -/10000100000020/ 0.089288913 22 英文-网页-CommonCrawl数学站点筛选网页【纯文】 -/10000100000021/ 0.002020114 10 英文-网页-【说服力】kialo英文辩论网站 -/10000100000023/ 2.873354966 431 英文-网页-【benchmark反查】simpleQA反查长尾站点召回CC数据-文本 -/10000100000022/ 0.001690478 10 英文-网页-【benchmark反查】chem.libretexts.org-文档文本 -/10000100000024/ 0.311246095 77 英文-网页-InfiMM-WebMath-40B-文本 -/10000100000025/ 0.149771971 449 英文-网页-【benchmark反查】pmc.ncbi.nlm.nih.gov网站覆盖—纯文文本 -/10000100010049/ 0.125617411 10 英文-试题-chegg20250303第4.2批STEM试题-文本对 -/10000100010050/ 0.090095504 12 代码-代码-code-instruction-v2.1 -/10000100010051/ 0.018196497 10 英文-试题-brainly试题20250305-20250311期间印度站增量-文本对 -/10000100010052/ 0.069538147 10 英文-试题-20250312-g4o图转文第三批(包含多图)-文本对 -/10000200020020/ 0.013792681 10 中文-试题-【百度教育】多模转纯文(答案图转文)-数学(第一批fix2) -/10000200020021/ 0.000655053 10 中文-试题-【百度教育】多模转纯文(答案图转文)-生物(第一批fix) -/10000200020023/ 0.003159375 10 中文-试题-【百度教育】多模转纯文(答案图转文)-化学(第一批fix) -/10000200020024/ 0.011188260 10 中文-试题-【百度教育】多模转纯文(答案图转文)-物理(第一批fix) -/10000200020025/ 0 10 代码-代码-code-log-synthetic-250207-update-250313 -/10000200020026/ 0 21 代码-代码-code-log-synthetic-250106-update-250313 -/10000100000026/ 0.178253971 26 英文-网页-Finemath-e5模型筛选CommonCrawl数学数据【2023-50】 -/10000100000027/ 0.004469794 10 英文-网页-【benchmark反查】simpleQA评测集anwser来源url网页覆盖 -/10000100010053/ 0.153438946 10 英文-试题-chegg20250303第4.3批STEM试题-文本对 -/10000100010054/ 0.006074558 10 
英文-网页-【benchmark反查】socratic.org网站覆盖 -/10000100010055/ 0.036150889 10 英文-试题-20250312-g4o图转文第四批(包含多图)-文本对 -/10000200000023/ 0.000021881 10 中文-网页-【说服力】【音视频转纯文】pdd演讲与口才课程音视频 -/10000200000024/ 0.001989468 10 中文-网页-B站优质UP主视频字幕转纯文数据【第一批】 -/10000200000025/ 0.102165515 30 中文-论文-维普50w论文-2024Q3 -/10000200020027/ 0.102444928 10 代码-代码-code-log-synthetic-250207-update-250315 -/10000200020028/ 0.363919886 38 代码-代码-code-log-synthetic-250106-update-250315 -/10000200000021/ 0.043793891 10 中文-百科-抖音百科第一批存量词条-spider-文本 -/10000200000022/ 0.001558578 10 中文-网页-【说服力】【音视频转纯文】【电商】抖音强说服力口播文案-说服要素增强 -/2025031210001/ 0.002210112 10 英文-试题-AMPS query 改写试题数据第一批 -/10000100010056/ 0.000319968 10 英文-试题-crackap网页抓取数据-第一批 -/2025031190001/ 0.000335608 10 英文-试题-crackap网页抓取数据-第二批 -/bbh-fewshot-79/ 0.000003251 10 英文-试题-bbh-fewshot-79条 -/math-related-7/ 0.666666666 25 英文-网页-MATH相关7个开源数据集 -/chegg-g4o-part5/ 0.1 15 英文-试题-20250312-g4o图转文第五批(包含多图)-文本对 -/10000100000028/ 0.001087514 10 英文-网页-【benchmark反查】simpleQA评测集g4o+RAG来源url网页覆盖 diff --git a/examples/pre-training/ernie/pretrain_auto.py b/examples/pre-training/ernie/pretrain_auto.py index 1772797b..75388307 100644 --- a/examples/pre-training/ernie/pretrain_auto.py +++ b/examples/pre-training/ernie/pretrain_auto.py @@ -16,12 +16,10 @@ import time import json import numpy as np -from functools import partial import random import paddle import paddle.distributed.fleet as fleet from src.utils import logger -from paddleformers.datasets import MapDataset from paddleformers.trainer import ( PdArgumentParser, get_last_checkpoint, @@ -31,7 +29,6 @@ from omegaconf.dictconfig import DictConfig from src.callbacks import ( ProgreesiveBatchingCallback, - DataTraceCallbackAuto, GlobalRNGCallback, ) from models.ernie import ( @@ -42,18 +39,14 @@ ErnieConfig, ErnieMoEConfig, ) -from src.datasets import PretrainTask -from src.datasets.pretrain_task import parse_data_weight from src.trainers import AutoPretrainingTrainer, AutoPreTrainingArguments from src.utils import ( 
setup_logger_output_file, ) -from src.utils.data_utils import merge_fn_group_batch from src.utils.misc import global_training_logs +from pretrain import create_pretrained_dataset -# from pretrain import create_pretrained_dataset - from config import get_config try: @@ -61,7 +54,7 @@ except ImportError: def log_trainer_start(): - """print main process messgae""" + """Print main process messgae""" if "MAIN_PROCESS_STARTED" not in os.environ: start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) logger.info( @@ -86,19 +79,12 @@ def log_trainer_start(): def update_model_config_from_args(config: ErnieConfig, model_args: dict): - """update model config from args - - Args: - config (ErnieConfig): _description_ - model_args (dict): _description_ - - Returns: - _type_: _description_ - """ for k, v in model_args.items(): if hasattr(config, k): logger.info(f"update model config: {k} = {v}") setattr(config, k, v) + else: + logger.warning(f"model config key: {k} does not exist") return config @@ -109,7 +95,7 @@ def init_parameter(model): def main(): - """main function""" + """Main function""" config = get_config(verbose=True) os.makedirs(config.model_args.output_dir, exist_ok=True) parser = PdArgumentParser(AutoPreTrainingArguments) @@ -155,9 +141,9 @@ def formatv(v): i / sum(args.modality_ratio) for i in args.modality_ratio ] - # combine_batch = args.combine_batch // config.trainer_args.data_parallel_degree - # data_processor_args = {k: formatv(v) for k, v in dict(getattr(config, "data_processor_args", {})).items()} - # (args,) = parser.parse_dict(dict(**model_args, **trainer_args, **data_processor_args)) + args.eval_iters = 10 + args.test_iters = args.eval_iters * 10 + args.use_moe = dict(**dict(config.model_args), **dict(config.trainer_args)).get( "use_moe", False ) @@ -201,7 +187,7 @@ def formatv(v): ) logger.info("======monitor allreduce done!=======\n") except Exception as e: - logger.warning("fleet test unexcepted error! 
skip exception[{}]...".format(e)) + logger.warning(f"fleet test unexcepted error! skip exception[{e}]...") # Detecting last checkpoint. last_checkpoint = None @@ -264,9 +250,11 @@ def compute_metrics(p): logger.info(f"disable moe flag when using moe-group={args.moe_group}") args.use_moe = False + args.multi_token_pred_depth = model_config.get("multi_token_pred_depth", 0) + cfg = ErnieConfig.from_pretrained(args.model_name_or_path) - cfg = update_model_config_from_args(cfg, model_config) cfg.seqlen = args.max_seq_length + cfg.token_balance_seqlen = args.max_seq_length * args.per_device_train_batch_size cfg.fp16_opt_level = args.fp16_opt_level cfg.moe_group = args.moe_group cfg.dtype = dtype @@ -285,13 +273,15 @@ def compute_metrics(p): cfg.tensor_parallel_degree = 1 cfg.tensor_parallel_rank = 0 + cfg.micro_batch_size = args.per_device_train_batch_size tokenizer = ErnieBotTokenizer.from_pretrained(args.tokenizer_name) tokenizer.ignored_index = cfg.ignored_index logger.info( f"using tokenizer={type(tokenizer)}, bos:{tokenizer.bos_token_id} " f"eos:{tokenizer.eos_token_id} pad:{tokenizer.pad_token_id} " ) - image_preprocess = None # set if `vision_model_name_or_path is not None` + + cfg = update_model_config_from_args(cfg, model_config) if args.model_type == "ernie": model_class = ErnieForCausalLMAuto @@ -310,9 +300,6 @@ def compute_metrics(p): config=cfg, ) - if image_preprocess is not None: - model.add_image_preprocess(image_preprocess) - cfg = model.config logger.info(f"using model type:{type(model)}") paddle.set_default_dtype("float32") @@ -326,90 +313,11 @@ def compute_metrics(p): # data logger.info("loading data...") - train_file_list, data_weights = parse_data_weight( - args.data_weights, args.data_filelist + train_dataset, eval_dataset, test_dataset, data_collator = ( + create_pretrained_dataset(args) ) - # train_dataset, eval_dataset, test_dataset, data_collator = create_pretrained_dataset(args) - max_seq_length = args.max_seq_length - - if args.do_train: - 
assert ( - args.max_seq_length // args.base_seq_length >= 1 - and args.max_seq_length % args.base_seq_length == 0 - ) - if args.combine_batch > 1: - logger.info( - f"max seq length is larger than base_seq_length, use combine batch: {args.combine_batch}" - ) - assert ( - args.use_train_part_sharding - ), "not `use_train_part_sharding` is not supported when using `combine_batch`" - assert ( - args.num_consecutive // args.combine_batch >= 1 - and args.num_consecutive % args.combine_batch == 0 - ), "num_consecutive must be a multiple of max_seq_length / base_seq_length" - assert ( - args.data_weights - ), "no `data_weights` is not supported when using `combine_batch`" - max_seq_length = args.base_seq_length - if args.need_data: - if args.multimodal: - assert False, "Do not support multimodal!" - else: - pretrain_task = PretrainTask(train_file_list, tokenizer) - train_dataset = pretrain_task.train_data( - max_seq_length + 1, - stride=max_seq_length, - rng=random.Random(args.seed), - weights=data_weights, - evaluate=False, - seed=args.seed, - num_consecutive=args.num_consecutive, - shuffle=not args.no_part_shuffle, - combine_batch=args.combine_batch, - load_process_num=args.data_load_process_num, - ) - train_dataset.load( - use_shard=args.use_train_part_sharding, - dp_rank=args.reeao_dataset_rank, - dp_size=args.reeao_dataset_world_size, - ) - train_dataset = MapDataset(train_dataset) - else: - logger.info( - f"mp_{args.pipeline_parallel_rank}_pp{args.tensor_parallel_rank} no data needed, \ - skip init train_dataset" - ) - train_dataset = None - - if args.do_eval: - eval_dataset = PretrainTask( - [[args.dev_data]], - tokenizer, - max_seq_len=max_seq_length, - ).train_data( - max_seq_length + 1, - stride=max_seq_length, - overlap_len=32, - rng=random.Random(0), - evaluate=True, - shuffle=False, - ) - eval_dataset.load(False, dp_rank=0, dp_size=1) - eval_dataset = MapDataset(eval_dataset) - else: - eval_dataset = None - - data_collator = partial( - merge_fn_group_batch, - 
tokenizer, - pad_to_max_seqlen=args.max_seq_length, - combine_batch=args.combine_batch, - image_dtype="uint8", - ) callbacks = [] - callbacks = [DataTraceCallbackAuto()] if not args.use_dummy_dataset else [] callbacks += [GlobalRNGCallback()] if args.batch_size_warmup_steps: diff --git a/examples/pre-training/ernie/src/datasets/__init__.py b/examples/pre-training/ernie/src/datasets/__init__.py index 0b558806..b9c4df26 100644 --- a/examples/pre-training/ernie/src/datasets/__init__.py +++ b/examples/pre-training/ernie/src/datasets/__init__.py @@ -16,4 +16,3 @@ """ from .dist_data_loader import DistDataLoader, DistDataLoaderAuto -from .pretrain_task import ExampleSet, ExampleSetSingleDataSource, PretrainTask diff --git a/examples/pre-training/ernie/src/datasets/dist_data_loader.py b/examples/pre-training/ernie/src/datasets/dist_data_loader.py index 846ad593..1dbd0bf4 100644 --- a/examples/pre-training/ernie/src/datasets/dist_data_loader.py +++ b/examples/pre-training/ernie/src/datasets/dist_data_loader.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python3 - # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,6 +18,8 @@ It can replace paddle.io.DataLoader in most cases. 
""" import logging +import hashlib +from collections import deque from collections import OrderedDict from itertools import groupby from functools import reduce @@ -42,6 +41,8 @@ logger = logging.getLogger(__name__) +input_ids_for_mtp = deque() + log = logging.getLogger(__name__) _MAX_DATA_DIM = 64 @@ -50,6 +51,12 @@ G_DEBUG_DATA_MD5 = os.getenv("G_DEBUG_DATA_MD5") +def md5(tensor): + numpy_array = tensor.numpy() + array_bytes = numpy_array.tobytes() + return hashlib.md5(array_bytes).hexdigest() + + class DummyDataset(paddle.io.Dataset): def __len__(self): return 0 @@ -93,14 +100,12 @@ def __init__( num_workers=num_workers, ) self.need_magic_trans = need_magic_trans - # log.info(f'DistDataloader using image-dtype: {self.image_dtype}') self._hcg = fleet.get_hybrid_communicate_group() # init pp data comm group if self._hcg.get_pipe_parallel_world_size() > 1 and pp_broadcast: self._pp_data_group = self._init_dataloader_comm_group() else: - log.info("skip pp broadcast") self._pp_data_group = None # tensor parallel message @@ -132,14 +137,11 @@ def __init__( persistent_workers, ) - # self._dataloder_iter = iter(self._dataloder) self._lazy_dataloader_iter = None else: log.info( - "mp{}_pp{}_sharding{}_dp{} no data needed, " - "skip init dataloader.".format( - self.mp_rank, self.pp_rank, sharding_rank, self.dp_rank - ) + f"mp{self.mp_rank}_pp{self.pp_rank}_sharding{sharding_rank}_dp{self.dp_rank} no data needed, " + "skip init dataloader." 
) @property @@ -162,7 +164,6 @@ def _init_dataloader_comm_group(self): parallel_groups = topo.get_comm_list("pipe") for group in parallel_groups: - # only first rank and last rank if self.need_magic_trans: assert ( len(group) > 2 @@ -181,7 +182,6 @@ def __iter__(self): def __next__(self): get_timers() and get_timers()("read-raw-data").start() if self._need_data: - # {'input_ids': int64, 'labels': int64, 'data_id': int64} data = next(self._dataloder_iter) if "data_not_valid" in data: global_training_logs.update( @@ -190,8 +190,6 @@ def __next__(self): ( input_ids, labels, - data_id, - src_id, data_type, images, token_type_ids, @@ -205,8 +203,6 @@ def __next__(self): ) = ( data["input_ids"], data["labels"], - data["data_id"], - data["src_id"], data.get("data_type", None), data.get("images", None), data.get("token_type_ids", None), @@ -218,18 +214,14 @@ def __next__(self): data.get("position_ids", None), data.get("log_prob", None), ) - assert {input_ids.dtype, labels.dtype, data_id.dtype, src_id.dtype} == { - paddle.int64 - }, ( + assert {input_ids.dtype, labels.dtype} == {paddle.int64}, ( f"Distloader requires dtype == `int64`, " - f"got:{[input_ids.dtype, labels.dtype, data_id.dtype, src_id.dtype]}" + f"got:{[input_ids.dtype, labels.dtype]}" ) else: ( input_ids, labels, - data_id, - src_id, data_type, images, token_type_ids, @@ -253,8 +245,6 @@ def __next__(self): None, None, None, - None, - None, ) get_timers() and get_timers()("read-raw-data").stop() @@ -264,8 +254,6 @@ def __next__(self): ( input_ids, labels, - data_id, - src_id, data_type, images, token_type_ids, @@ -280,8 +268,6 @@ def __next__(self): [ input_ids, labels, - data_id, - src_id, data_type, images, token_type_ids, @@ -298,13 +284,9 @@ def __next__(self): ) if self._pp_data_group is not None and self._pp_data_group.nranks > 1: - # NOTE(shenliang03): in last stage in pp, we don't need input_ids and data_id. - # But it's only for paddle-new_model_7 compatible upgrade. It will remove in future. 
( input_ids, labels, - data_id, - src_id, data_type, images, token_type_ids, @@ -319,8 +301,6 @@ def __next__(self): [ input_ids, labels, - data_id, - src_id, data_type, images, token_type_ids, @@ -336,6 +316,11 @@ def __next__(self): self._pp_data_group, ) + if self.need_magic_trans: + if input_ids is not None: + global input_ids_for_mtp + input_ids_for_mtp.append(input_ids) + if VOCAB_SIZE is not None: if input_ids is not None: input_ids %= int(VOCAB_SIZE) @@ -346,8 +331,6 @@ def __next__(self): [ ("input_ids", input_ids), ("labels", labels), - ("data_id", data_id), - ("src_id", src_id), ("data_type", data_type), ("images", images), ("token_type_ids", token_type_ids), @@ -376,6 +359,9 @@ def __next__(self): ] for k in none_keys: to_return.pop(k) + if G_DEBUG_DATA_MD5 and int(G_DEBUG_DATA_MD5): + printable = map_structure(lambda i: md5(i), to_return) + logger.info(f"data-md5: {printable}") return to_return @@ -383,7 +369,6 @@ def broadcast_data_list(data_list, datatype, comm_rank=0, comm_group=None, src_r """ Broadcast data from src_rank to all ranks in comm_group. """ - # Move to GPU and broadcast. 
size_cpu = [] if comm_rank == 0: for data in data_list: @@ -407,9 +392,7 @@ def broadcast_data_list(data_list, datatype, comm_rank=0, comm_group=None, src_r if comm_rank == 0: assert ( data.dtype == datatype - ), "input has data type {} which " "is different than {}".format( - data.dtype, datatype - ) + ), f"input has data type {data.dtype} which " f"is different than {datatype}" data_b = paddle.concat( [d.to(get_env_device()).reshape([-1]) for d in data_list], 0 ) @@ -437,7 +420,8 @@ def broadcast_data_list(data_list, datatype, comm_rank=0, comm_group=None, src_r class _DtypeSndShape: """_summary_ - Returns: + Returns + ------- _type_: _description_ """ @@ -447,7 +431,8 @@ class _DtypeSndShape: def size(self): """_summary_ - Returns: + Returns + ------- _type_: _description_ """ return reduce(lambda x, y: x * y, self.shape) @@ -460,7 +445,8 @@ def split_group(grouped, split_size): grouped (_type_): _description_ split_size (_type_): _description_ - Yields: + Yields + ------ _type_: _description_ """ ret = [] @@ -473,9 +459,7 @@ def split_group(grouped, split_size): yield ret -# Tea.chen congmin(葱明) brodcast def broadcast_data_obj(data, src_rank, group): - this_rank = dist.get_rank() if this_rank == src_rank: template = [ @@ -492,7 +476,6 @@ def broadcast_data_obj(data, src_rank, group): template = [None] dist.broadcast_object_list(template, src_rank, group) template = template[0] - # log.info(f'[rank={dist.get_rank()}]: {template}') temp_flat = flatten(template) data_flat = flatten(data) @@ -520,10 +503,8 @@ def keyfn(i): [sum(data_buf_shapes)], dtype=grouped_chunk[0][1].dtype ) dist.broadcast(data_buf, src_rank, group) - # log.info(f'[rank={dist.get_rank()}]: done broadcast data:{data_buf.shape}') if this_rank != src_rank: - # log.info(f'[rank={dist.get_rank()}] split:{data_buf_shapes}') if len(data_buf_shapes) == 1: data_buf = [data_buf] else: @@ -547,11 +528,10 @@ def __next__(self): input_list = [] if "token_type_ids" in data_dict.keys(): + ( input_ids, 
labels, - data_id, - src_id, data_type, images, token_type_ids, @@ -560,8 +540,6 @@ def __next__(self): ) = ( data_dict["input_ids"], data_dict["labels"], - data_dict["data_id"], - data_dict["src_id"], data_dict["data_type"], data_dict.get("images", None), data_dict["token_type_ids"], @@ -583,8 +561,6 @@ def __next__(self): input_list = [ input_ids, labels, - data_id, - src_id, data_type, images, token_type_ids, diff --git a/examples/pre-training/ernie/src/datasets/pretrain_task.py b/examples/pre-training/ernie/src/datasets/pretrain_task.py deleted file mode 100644 index 31572361..00000000 --- a/examples/pre-training/ernie/src/datasets/pretrain_task.py +++ /dev/null @@ -1,788 +0,0 @@ -# !/usr/bin/env python3 -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -import atexit -import os -import math -import re -import random -import logging -from functools import partial -import numpy as np -from collections import OrderedDict, namedtuple -from typing import List - -import paddle -import h5py -from time import time -from src.utils.ipc_server import IPCServer - - -log = logging.getLogger(__name__) - - -class IPCH5Resource: - - def __init__(self, path, name, server): - - self.path = path - self.name = name - self.server = server - self._length = None - self._to_bool = None - - def __getitem__(self, key): - - return self.server.call(self.path, "get", (self.path, self.name, key)) - - def __len__(self): - - if self._length is None: - self._length = self.server.call(self.path, "len", (self.path, self.name)) - return self._length - - def __bool__(self): - - if self._to_bool is None: - self._to_bool = self.server.call( - self.path, "to_bool", (self.path, self.name) - ) - return self._to_bool - - -class IPCH5MetaResource: - - def __init__(self, path, server): - """ - __init__ - """ - self.path = path - self.server = server - self._meta = None - - def _get_meta(self): - """ - get_meta once - """ - if self._meta is None: - self._meta = self.server.call(self.path, "get_meta", (self.path,)) - - def __getitem__(self, key): - """ - __getitem__ - """ - self._get_meta() - return self._meta[key] - - def __len__(self): - """ - __len__ - """ - self._get_meta() - return len(self._meta) - - -class DatasetHolder: - - def __init__(self, paths, server_idx, server_num): - - self.fps = {} - path_num = len(paths) - start_t = time() - for idx, path in enumerate(paths): - assert path not in self.fps, path - - ds = h5py.File(path, mode="r") - fp = ds["ds16"] - assert ( - "ds16_tokenwise_type_id" not in ds - ), f"this file maybe a multimodal H5, path={path}" - if "ds16_lossmask" in ds: - fp_lossmask 
= ds["ds16_lossmask"] - assert len(ds["ds16_lossmask"]) == len(ds["ds16"]), ( - len(ds["ds16_lossmask"]), - len(ds["ds16"]), - ) - else: - fp_lossmask = None - - if "ds16_off" in ds: - off = ds["ds16_off"] - else: - off = None - - if "log_prob" in ds: - log_prob = ds["log_prob"] - else: - log_prob = None - - shape = fp.shape - meta = {"shape": shape} - if shape[0] <= 0 or shape[0] >= 1000000000000: - raise OSError - self.fps[path] = { - "fp": fp, - "lossmask": fp_lossmask, - "meta": meta, - "off": off, - "log_prob": log_prob, - } - end_t = time() - log.info( - f"Done loading {path}, shape: {shape}, in server-{server_idx}/{server_num}, " - f"accumulated time = {end_t - start_t}, progress: {idx}/{path_num}" - ) - end_t = time() - log.info( - f"Server-{server_idx}/{server_num} load ends with path number {path_num}, " - f"accumulated time = {end_t - start_t}" - ) - - def get(self, path, name, key): - """ - get - """ - return self.fps[path][name][key] - - def len(self, path, name): - """ - len - """ - return len(self.fps[path][name]) - - def to_bool(self, path, name): - """ - to_bool - """ - return True if self.fps[path][name] else False - - def get_meta(self, path): - """ - get_meta - """ - return self.fps[path]["meta"] - - -class DatasetHolderIniter: - - def __init__(self, paths): - """ - __init__ - """ - self.paths = paths - - def __call__(self, server_idx, server_num): - - return DatasetHolder(self.paths, server_idx, server_num) - - -def create_ipc_h5_resources(paths, num_server): - - n = len(paths) - if n <= 0: - return [] - - num_server = min(n, num_server) - - router_keys = [[] for _ in range(num_server)] - for i, p in enumerate(paths): - router_keys[i % num_server].append(p) - - init_funcs = [DatasetHolderIniter(rk) for rk in router_keys] - server = IPCServer(router_keys, init_funcs) - atexit.register(lambda: server.close()) - fps = [] - for p in paths: - tmp = { - "fp": IPCH5Resource(p, "fp", server), - "lossmask": IPCH5Resource(p, "lossmask", server), - 
"meta": IPCH5MetaResource(p, server), - "off": IPCH5Resource(p, "off", server), - "log_prob": IPCH5Resource(p, "log_prob", server), - } - fps.append(tmp) - return fps - - -def parse_filelist(filelist): - """parse filelist - - Args: - filelist (_type_): _description_ - - Raises: - ValueError: _description_ - - Returns: - _type_: _description_ - """ - if isinstance(filelist, str): - filelist = [filelist] - part_id_offset = 0 - h5, partids = [], [] - for f in filelist: - lines = [i.strip().split("\t") for i in open(f).readlines()] - if len(lines[0]) == 1: - h5.extend([i[0] for i in lines]) - partids.extend([i + part_id_offset for i in range(len(lines))]) - elif len(lines[0]) == 2: - _ids, _flst = zip(*lines) - h5.extend(_flst) - partids.extend([int(i) + part_id_offset for i in _ids]) - else: - raise ValueError("part format error") - part_id_offset = max(partids) + 1 - assert len(h5) == len(set(h5)), "duplicated filelist" - return partids, h5 - - -def parse_weights(weights): - """parse weights - - Args: - weights (_type_): _description_ - - Returns: - _type_: _description_ - """ - patterns = [] - if isinstance(weights, str): - weights = [weights] - for w in weights: - for i in open(w): - cols = i.strip().split() - assert ( - len(cols) >= 3 - ), f"配比文件至少要4列,格式为:pattern weight num_parts - {cols}" - pattern, w, num_parts = cols[:3] - if len(cols) >= 4 and cols[3] in ["lm", "mm", "audio"]: - data_type = cols[3] - else: - data_type = "mm" if "multimodal" in i else "lm" - - num_parts = int(num_parts) - pattern = re.compile(pattern) - patterns.append((pattern, float(w) / num_parts, data_type)) - return patterns - - -def parse_data_weight(weights, filelist): - - partids, filelist = parse_filelist(filelist) - patterns = parse_weights(weights) - partid2files, weight_filelist = {}, {} - for part_id, f in zip(partids, filelist): - if part_id not in partid2files: - partid2files[part_id] = [f] - else: - partid2files[part_id].append(f) - - for ipattern, (pattern, w, data_type) in 
enumerate(patterns): - if pattern.search(f): - # weight_filelist[f] = (float(w), ipattern, part_id) - weight_filelist[part_id] = (float(w), ipattern, data_type) - break - else: - log.warning(f"{f} does not match any pattern") - - train_filelist, weights = [], [] - for part_id, (v, source_id, data_type) in weight_filelist.items(): - train_filelist.append((partid2files[part_id], data_type)) - weights.append((v, source_id, part_id)) - return train_filelist, weights - - -def equal_shard(datasets, rank, world_size): - - assert ( - len(datasets) >= world_size - ), f"#filelist={len(datasets)} < world_size{world_size}" - if world_size == 1: - return datasets - if datasets[0].weights is None: - ran = np.array_split(np.arange(len(datasets)), world_size)[rank] - s, e = ran[0], ran[-1] - shard = datasets[s : e + 1] - return shard - buckets = [[] for _ in range(world_size)] - - bucketsize = np.zeros(len(buckets), dtype="float64") - total_w = sum([d.weights for d in datasets]) - for d in datasets: - d.weights = d.weights / total_w - datasets = sorted(datasets, key=lambda d: d.weights, reverse=True) - for d in datasets: - this_bucket = np.argmin(bucketsize) - buckets[this_bucket].append(d) - bucketsize[this_bucket] += d.weights - - log.info( - f"sharding dataset according to prob, group vs probs={[sum([rr.weights for rr in r])for r in buckets]}" - ) - bucketsize = bucketsize[rank] - diff = bucketsize - (1 / world_size) - log.info( - f"unable to perfect shard. 
prob sum of this bucket:{bucketsize}, diff to perfect portion:{diff}" - ) - assert ( - len(buckets) == world_size - ), f"#ret={len(buckets)} prob not normalized:{[d.weights for d in datasets]}" - return buckets[rank] - - -Example = namedtuple("Example", ["ids", "sids", "task", "lossmask", "src", "log_prob"]) - - -class ExampleSetSingleDataSource: - """Use to pick data from h5""" - - def __init__( - self, - path: List[str], - seqlen, - stride=None, - weights=None, - shuffle: bool = False, - num_consecutive: int = 1, - seed: int = 42, - combine_batch: int = 1, - ): - - self.seqlen = seqlen - if weights is not None: - assert isinstance(weights, tuple) and len(weights) == 3, weights - self.weights, self.src, self.part = weights - else: - self.weights, self.src, self.part = None, 0, 0 - if not stride: - self.stride = seqlen - else: - self.stride = stride - self.path = [os.path.expanduser(p) for p in path] - self._load = False - self.fps = [] - self._data_status = 0 - self.num_consecutive = num_consecutive - self.seed = seed - self.shuffle = shuffle - self.combine_batch = combine_batch - self.epoch = 0 - - @property - def data_status(self): - return self._data_status - - @data_status.setter - def data_status(self, value): - log.info(f"part-{self.part}-load_data_status: {value}") - self._data_status = value - - def set_loaded(self, fps): - """ - Set loaded fps - """ - self._load = True - self.int16_ds = True - self.fps = fps - - def load(self): - self._load = True - self.int16_ds = True - log.info("using int16 ds") - - for path in self.path: - log.info(f"loading {path}, weights={self.weights}") - ds = h5py.File(path, mode="r") - assert ( - "ds16_tokenwise_type_id" not in ds - ), f"this file maybe a multimodal H5, src={self.src}" - - fp = ds["ds16"] - if "ds16_lossmask" in ds: - fp_lossmask = ds["ds16_lossmask"] - assert len(ds["ds16_lossmask"]) == len(ds["ds16"]), ( - len(ds["ds16_lossmask"]), - len(ds["ds16"]), - ) - else: - fp_lossmask = None - # self.fp = self.fps[0] - 
- if "ds16_off" in ds: - log.info("using ds w/ offset") - off = ds["ds16_off"] - else: - off = None - - if "log_prob" in ds: - log.info("using ds with log_prob") - log_prob = ds["log_prob"] - else: - log_prob = None - shape = fp.shape - meta = {"shape": shape} - if ( - shape[0] <= 0 or shape[0] >= 1000000000000 - ): # 1000000000000 for max tokens of h5 - raise OSError - self.fps.append( - { - "fp": fp, - "lossmask": fp_lossmask, - "meta": meta, - "off": off, - "log_prob": log_prob, - } - ) - log.info( - f"done loading {path}, shape:{shape}: int16:{self.int16_ds} " - f"seqlen:{self.seqlen} stride:{self.stride}" - ) - log.info(f"done loading part-{self.part}, file count: {len(self.fps)}") - - def __getitem__(self, idx): - assert ( - len(idx) == 2 - ), f"idx format must be (`epoch, data_idx`), but got {idx} instead" - epoch, idx = idx - if idx == -1: - return Example( - ids=[], - sids=None, - task="lm", - src=self.part, - lossmask=None, - log_prob=None, - ) - assert self._load - fp = self.fps[epoch % len(self.fps)] - off = fp["off"] - if off: - s = off[idx] - e = off[idx + 1] - else: - s = max(idx * self.stride, 0) - e = idx * self.stride + self.seqlen - - ids = fp["fp"][s:e].astype(np.int32) - if fp["lossmask"]: - lossmask = fp["lossmask"][s:e].astype(np.int32) - else: - lossmask = None - if fp["log_prob"]: - log_prob = fp["log_prob"][s:e].astype(np.float32) - else: - log_prob = None - ret = Example( - ids=ids, - sids=None, - task="lm", - src=self.part, - lossmask=lossmask, - log_prob=log_prob, - ) - return ret - - def __len__(self): - assert self._load - fp = self.fps[self.epoch % len(self.fps)] - if fp["off"]: - return len(fp["off"]) - return int(np.ceil((fp["meta"]["shape"][0]) / self.stride)) - - def __iter__(self): - for i in range(len(self)): - yield self[(0, i)] - - @property - def example_id(self): - example_id = range(0, len(self), self.num_consecutive) - example_id = [ - (ii, min(ii + self.num_consecutive, len(self))) for ii in example_id - ] - if 
self.shuffle: - rng = random.Random(self.epoch + self.seed + self.part) - rng.shuffle(example_id) - return np.array(example_id) - - @property - def num_examples(self): - assert self.epoch == 0 - # return len(list(range(0, len(self), self.num_consecutive))) - return (len(self) + self.num_consecutive - 1) // self.num_consecutive - - def sampler(self): - assert paddle.io.get_worker_info() is None - - self.epoch = 0 - while 1: - if self._data_status >= len(self): - self._data_status -= len(self) - else: - log.debug( - f"...gen_index_from-[{self.part}]-[{self.epoch}]-offset-[{self.data_status}/{len(self)}]" - ) - for s, e in self.example_id: - _length = ( - math.ceil((e - s) / self.combine_batch) * self.combine_batch - ) - if self._data_status > 0: - if self._data_status >= _length: - self._data_status -= _length - continue - else: - s += self._data_status - self._data_status = 0 - yield self.epoch, list(range(s, e)) - self.epoch += 1 - - -class ExampleSet: - """use to manage all h5 data""" - - def __init__(self, exs, fn, load_process_num=0): - """ - __init__ - """ - self.exs = exs - self.fn = fn - self._load = False - self.global_max_part_id = max([ex.part for ex in exs]) - self.partid2ex = {ex.part: ex for ex in exs} - self.load_process_num = load_process_num - - def append(self, new_exs): - log.info(f"updating exs, #new example: {len(new_exs)}") - self.exs.append(new_exs) - lens = [len(e) for e in self.exs] - len_sum = sum(lens) - log.info("multi task data portion") - log.info( - "\n".join([f"{e.path}={left/len_sum}" for left, e in zip(lens, self.exs)]) - ) - - def load(self, use_shard, dp_rank, dp_size): - self._load = True - log.info(f"loading h5... 
use_shard={use_shard}, {self._load} {id(self)}") - - log.info(f"loading h5 in dp_env:{dp_rank}/{dp_size}") - if use_shard: - log.info("#shard train file, before load") - - def keyfn(e): - left = e.path.strip("/").split("/") - return left[0] - - path_per_dp = equal_shard(self.exs, dp_rank, dp_size) - log.debug( - f"using source shard, # files before shard={len(self.exs)}, after shard={len(path_per_dp)}" - ) - self.exs = path_per_dp - - if self.load_process_num > 0: - paths = [] - ranges = [] - start_idx = 0 - for i, ex in enumerate(self.exs): - assert isinstance(ex, ExampleSetSingleDataSource), type(ex) - cur_len = len(ex.path) - paths.extend(ex.path) - ranges.append((ex, start_idx, start_idx + cur_len)) - start_idx += cur_len - - fps = create_ipc_h5_resources(paths, self.load_process_num) - for ex, start, end in ranges: - ex.set_loaded(fps[start:end]) - else: - loaded_exs, err_cnt = [], 0 - for ex in self.exs: - try: - if isinstance(ex, ExampleSetSingleDataSource): - ex.load() - except OSError as e: - log.warning(f"loading {ex.path} error:{e}, skip...") - err_cnt += 1 - continue - loaded_exs.append(ex) - assert ( - loaded_exs - ), f"data_dir {[e.path for e in self.exs]} empty, #err:{err_cnt}" - self.exs = loaded_exs - if err_cnt > 0: - raise ValueError( - f"some data load failed, #parts={len(self.exs)}, #err={err_cnt}" - ) - log.info(f"done loading h5 #parts={len(self.exs)}, #err={err_cnt}") - - def __getitem__(self, idx): - # index 为三维坐标 (partid, part_epoch, part_data_idx) - if isinstance(idx, int): - # dev data - s = 0 - for ex in self.exs: - if s + len(ex) < idx: - s += len(ex) - else: - ret = ex[(0, idx - s)] - break - else: - assert ( - len(idx) == 3 - ), f"idx format must be (`part_id`, `part_epoch`, `part_data_idx`), but got {idx} instead" - part_id, epoch, idx = idx - ret = self.partid2ex[part_id][(epoch, idx)] - ret = self.fn(ret, idx) - ret.update(data_id=idx) - # log.info(f"index:{idx}, input_ids: {ret['input_ids'][0:10]}") - return ret - - def 
__len__(self): - assert self._load - return sum(map(len, self.exs)) - - def __iter__(self): - # print(f"real len: {len(self)}") - for i in range(len(self)): - yield self[i] - - -class PretrainTask: - def __init__(self, data_dir, tokenizer, **kwargs): - self.tokenizer = tokenizer - self.data_dir = data_dir - self.mask_gen = None - - def train_data( - self, - max_seq_len=512, - stride=None, - overlap_len=0, - rng=None, - weights=None, - evaluate=False, - seed=0, - num_consecutive=1, - shuffle=True, - combine_batch=1, - load_process_num=0, - ): - if isinstance(self.data_dir[0][0], list): - path = [i[0] for i in self.data_dir if not i[0][0].endswith("meta")] - else: - path = [i for i in self.data_dir if not i[0].endswith("meta")] - if not weights: - weights = [(None, None, i) for i in range(len(path))] - # assert max_seq_len > 0, f'max_mask_num too big! seqlen={max_seq_len}, max_mask_num={mask_generator.special_token_num}' - examples = ExampleSet( - [ - ExampleSetSingleDataSource( - p, - max_seq_len, - stride=stride, - weights=w, - seed=seed, - num_consecutive=num_consecutive, - shuffle=shuffle, - combine_batch=combine_batch, - ) - for p, w in zip(path, weights) - ], - partial( - self.example_to_feature, - rng=rng, - overlap_len=overlap_len, - evaluate=evaluate, - ), - load_process_num=load_process_num, - ) - return examples - - def example_to_feature( - self, - example, - idx, - rng, - overlap_len, - evaluate, - ): - if not rng: - rng = random - if evaluate: - # print(f"eval index: {idx}") - rng = random.Random(idx) - - if example.lossmask is not None: - labels = [ - self.tokenizer.ignored_index if j == 0 else i - for i, j in zip(example.ids, example.lossmask) - ] - tokens = example.ids[:-1] - lm_labels = labels[1:] - else: - _tokens = example.ids - tokens, lm_labels = _tokens[:-1], _tokens[1:] - if example.log_prob is not None: - log_prob = example.log_prob[1:] - else: - log_prob = None - - if overlap_len and idx != 0: # do overlap - # log.info(f"apply overlaping: 
overlap_len: {overlap_len}") - if isinstance(lm_labels, np.ndarray): - lm_labels = lm_labels.tolist() - lm_labels = [self.tokenizer.ignored_index] * len( - lm_labels[:overlap_len] - ) + lm_labels[overlap_len:] - assert len(lm_labels) == len( - tokens - ), f"lm_labels:{len(lm_labels)} vs tokens:{len(tokens)}" - - assert len(tokens) == len( - lm_labels - ), f"tokens:{len(tokens)} != labels:{len(lm_labels)}" - token_ids = np.array(tokens, dtype="int64") - lm_labels = np.array(lm_labels, dtype="int64") - - features = OrderedDict( - input_ids=token_ids, labels=lm_labels, src_id=example.src, log_prob=log_prob - ) - return features - - -class PretrainDummyDataset: - """pretrain dummy dataset""" - - def __init__(self, max_seq_len): - self.max_seq_len = max_seq_len - - def __getitem__(self, idx): - return { - "input_ids": np.array([1] * self.max_seq_len), - "labels": np.array([1] * self.max_seq_len), - "src_id": 0, - "data_id": 0, - } - - def __len__(self): - return 10000 - - def __iter__(self): - for i in range(len(self)): - yield self[i] diff --git a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py index 18767fc6..4c672d50 100644 --- a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py +++ b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py @@ -1,5 +1,3 @@ -# !/usr/bin/env python3 - # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,18 +23,15 @@ import re import os import json -import pickle import contextlib -from typing import Optional, List -from collections import OrderedDict, defaultdict +from typing import Optional +from collections import OrderedDict from dataclasses import dataclass, field -import random import time import math import logging from functools import partial -import numpy as np import paddle import paddle.nn as nn @@ -74,7 +69,6 @@ from paddleformers.trainer.trainer_callback import PrinterCallback from paddle.distributed import fleet import paddle.distributed as dist -from paddleformers.datasets import MapDataset from paddleformers.transformers.model_utils import _add_variant @@ -90,12 +84,9 @@ ) from src.datasets import ( DistDataLoaderAuto, - ExampleSet, - ExampleSetSingleDataSource, ) from paddle.distributed import in_auto_parallel_align_mode from src.clip import ClipGradByAdaptiveNorm, ClipGradForMOEByGlobalNorm -from src.trainers.pretraining_trainer import DummySampler try: from paddleformers.trainer.trainer import ( @@ -121,35 +112,6 @@ def is_dp_group_support_in_group_sharded_parallel(): logger.warning("Use TrainingArguments as an alternative but will lose some args!") -def distributed_optimizer_maybe_hack( - optimizer, - use_moe, -): - if use_moe: - from src.trainers.dygraph_optimizer.hybrid_parallel_optimizer import ( - HybridParallelOptimizer as MoEHybridParallelOptimizer, - ) - - fleet_env = fleet.fleet - fleet_env.user_defined_optimizer = optimizer - hp_optim = MoEHybridParallelOptimizer( - optimizer, fleet_env._hcg, fleet_env._user_defined_strategy - ) - - if fleet_env._user_defined_strategy.hybrid_configs[ - "pp_configs" - ].dp_comm_overlap: - hp_optim._dp_enable = False - - if fleet_env._user_defined_strategy.hybrid_configs[ - "pp_configs" - ].sharding_comm_overlap: - hp_optim._sharding_enable = False - return hp_optim - else: - return fleet.distributed_optimizer(optimizer) - - 
DATATYPE_2_ID = {"mm": 0, "lm": 1, "audio": 2} @@ -235,6 +197,11 @@ class AutoPreTrainingArguments(AutoTrainingArguments): }, ) + input_dir: str = field(default=None, metadata={"help": "data path"}) + split: str = field( + default="949,50,1", metadata={"help": "Train/valid/test data split ratio"} + ) + data_dir: str = field(default=None, metadata={"help": "数据路径(指向一个目录)"}) data_filelist: str = field( @@ -414,6 +381,11 @@ class AutoPreTrainingArguments(AutoTrainingArguments): }, ) + multi_token_pred_depth: Optional[int] = field( + default=0, + metadata={}, + ) + lr_scheduler: str = field( default="cosine", metadata={ @@ -481,29 +453,9 @@ class AutoPreTrainingArguments(AutoTrainingArguments): metadata={"help": "Control the num of microbatches in one pp step."}, ) - @property - def use_moe(self): - """_summary_ - - Returns: - _type_: _description_ - """ - return getattr(self, "use_expert_parallel", self._use_moe) - - @use_moe.setter - def use_moe(self, value): - """_summary_ - - Args: - value (_type_): _description_ - """ - self.use_expert_parallel = value - self._use_moe = value - @property def need_data(self): - # mp0、pp0状态 卡才需要load数据 if self.pp_need_data_degree: assert self.pipeline_parallel_degree > 1 assert ( @@ -513,7 +465,6 @@ def need_data(self): self.pp_need_data_degree, self.pipeline_parallel_degree, ) - # shift by 1 to avoid last pp no nee data no_need_data_range = list( range(self.pp_need_data_degree - 1, self.pipeline_parallel_degree - 1) ) @@ -524,12 +475,10 @@ def need_data(self): @property def combine_batch(self): - return self.max_seq_length // self.base_seq_length @property def reeao_dataset_rank(self): - if not self.pp_need_data_degree: return super().dataset_rank no_need_data_range = list( @@ -552,9 +501,6 @@ def reeao_dataset_rank(self): @property def reeao_dataset_world_size(self): - """ - 考虑 pp /sharding/ dp 总和的数据流 worldsize - """ if not self.pp_need_data: return super().dataset_world_size return ( @@ -565,11 +511,6 @@ def 
reeao_dataset_world_size(self): def __post_init__(self): super().__post_init__() - # if self.sharding_parallel_degree > 1 and self.data_parallel_degree > 1: - # # MP/PP下, 当前框架不支持同时开启 sharding 和 DP - # assert ( - # self.pipeline_parallel_degree <= 1 and self.tensor_parallel_degree <= 1 - # ), f"when using mp/pp, `data_parallel_degree` should be 1 but receive {self.data_parallel_degree}" if in_auto_parallel_align_mode(): self.adaptive_norm_clip = False self.adaptive_norm_clip_ratio = 0.0 @@ -617,9 +558,7 @@ def __post_init__(self): if self.batch_size_warmup_steps > 0: assert self.global_batch_size > 0, self.global_batch_size assert self.init_global_batch_size > 0, self.init_global_batch_size - self.max_gradient_accumulation_steps = ( - self.gradient_accumulation_steps - ) # hack add new + self.max_gradient_accumulation_steps = self.gradient_accumulation_steps ( self.per_device_train_batch_size, self.gradient_accumulation_steps, @@ -657,7 +596,6 @@ def __post_init__(self): f"gradient_accumulation_steps[{self.gradient_accumulation_steps}] should be divisible by " f"pp_need_data_degree[{self.pp_need_data_degree}]" ) - # pp_need_data_degree下,args的acc 需要//pp数量,欺骗 在prepare_inputs self.gradient_accumulation_steps = ( self.gradient_accumulation_steps // self.pp_need_data_degree ) @@ -685,14 +623,12 @@ def __post_init__(self): else False ) if sharding_comm_overlap_non_pp: - # update grad acc steps assert hasattr(fleet.fleet, "_user_defined_strategy") user_defined_strategy = fleet.fleet._user_defined_strategy user_defined_strategy.hybrid_configs[ "sharding_configs" ].accumulate_steps = self.gradient_accumulation_steps - # NOTE(shenliang03): Check sanity of `accumulate_steps` when using sharding comm overlap. 
if hasattr(fleet.fleet, "_user_defined_strategy"): user_defined_strategy = fleet.fleet._user_defined_strategy if ( @@ -722,471 +658,6 @@ def __post_init__(self): self.multimodal = True -class WeightedDistributedSamplerAuto(PaddleNLPDistributedBatchSampler): - - def __init__( - self, - dataset, - batch_size, - output_dir, - dp_rank, - dp_size, - num_consecutive=1, - seed=0, - batch_size_warmup_steps=-1, - gradient_accumulation_steps=None, - max_gradient_accumulation_steps=None, - per_device_train_batch_size=None, - batch_size_warmup_increment=None, - combine_batch: int = 1, - shuffle_consecutive: bool = False, - global_shuffle_num_examples: int = 0, - same_data: bool = False, - modality_ratio: tuple = None, - modality_interleave: int = 1, - **kwargs, - ): - self.num_consecutive = num_consecutive - self.seed = seed - super().__init__(dataset, batch_size, **kwargs) - self.weights = None - self.batch_size = batch_size # per-device-micro-batchsize - self.output_dir = output_dir - self.rng = random.Random(self.seed + self.epoch) - self.dp_rank = dp_rank - self.dp_size = dp_size - self.batch_size_warmup_steps = batch_size_warmup_steps - self.gradient_accumulation_steps = gradient_accumulation_steps - self.max_gradient_accumulation_steps = max_gradient_accumulation_steps - self.per_device_train_batch_size = per_device_train_batch_size - self.batch_size_warmup_increment = batch_size_warmup_increment - self.combine_batch = combine_batch - self.shuffle_consecutive = shuffle_consecutive - self.global_shuffle_seed = 0 - self.global_shuffle_num_examples = global_shuffle_num_examples - self.same_data = same_data - self.load_data_seq = False - self.modality_ratio = modality_ratio - self.modality_interleave = modality_interleave - if self.modality_ratio is not None: - print("[my debug] modality_ratio:", modality_ratio) - logger.info(f"modality ratio set to {self.modality_ratio}") - assert sum(modality_ratio) == 1.0, "modality ratio should sum to 1" - assert ( - 
self.modality_interleave * self.modality_ratio[0] % 1 == 0 - if len(self.modality_ratio) >= 1 - else True - ), "modality_interleave * modality_ratio[0] should be integer" - assert ( - self.modality_interleave * self.modality_ratio[1] % 1 == 0 - if len(self.modality_ratio) >= 2 - else True - ), "modality_interleave * modality_ratio[1] should be integer" - assert ( - self.modality_interleave * self.modality_ratio[2] % 1 == 0 - if len(self.modality_ratio) >= 3 - else True - ), "modality_interleave * modality_ratio[1] should be integer" - if isinstance(self.dataset, MapDataset): - self.inner_dataset = self.dataset.data - else: - self.inner_dataset = self.dataset - assert self.inner_dataset._load - - self.max_part_id = self.inner_dataset.global_max_part_id - - self.set_epoch(0) - - def load_data_status(self, data_status: List[int], global_shuffle_seed: int = 0): - self.global_shuffle_seed = global_shuffle_seed - if not hasattr(self.inner_dataset.exs[0], "data_status"): - logger.warn( - "Inner Datasource has no attribute data_status, ignore load_data_status" - ) - return - data_status = [ - math.ceil(i / self.combine_batch) * self.combine_batch for i in data_status - ] - for ex in self.inner_dataset.exs: - if ex.part < len(data_status): - ex.data_status = data_status[ex.part] - logger.debug( - f"dp-[{self.dp_rank}/{self.dp_size}]-loaded_data_status--[{data_status[:10]}]" - ) - - def gen_data_seq(self): - """ - 生成随机采样序列。在给定seed + epoch 的情况下,序列结果稳定可复现 - """ - total = [] - for ex in self.inner_dataset.exs: - total.extend([(ex.part, 0, i) for i in range(ex.data_status, len(ex))]) - assert ( - len(total) > self.num_consecutive - ), f"total={total} < num_consecutive={self.num_consecutive}" - indices = np.array_split(np.array(total), len(total) // self.num_consecutive) - if self.shuffle: - self.rng.shuffle(indices) - indices = np.concatenate(indices) - indices = self.roundup_and_shard(indices) - logger.debug(indices[:10]) - return indices - - def load_data_seq_from_cache(self): 
- """_summary_ - - Returns: - _type_: _description_ - """ - indices_file = os.path.join( - self.output_dir, - f"data_seq.epoch{self.epoch}.dp_{self.dp_rank}_of_{self.dp_size}" - f"_shard_{self.local_rank}_of_{self.nranks}.pth", - ) - if self.same_data and os.path.exists(indices_file): - logger.info(f"load data seq from file - {indices_file}") - self.load_data_seq = True - with open(indices_file, "rb") as of: - return pickle.load(of) - return None - - def gen_data_seq_weighted_multimodal( - self, lm_num_examples, mm_num_examples, audio_num_examples - ): - """multimodal data seq""" - assert self.modality_ratio is not None - logger.info(f"LM-num_examples -- {lm_num_examples}") - lm_indices = ( - self.gen_data_seq_weighted(lm_num_examples, DATATYPE_2_ID["lm"]) - if lm_num_examples > 0 - else None - ) - mm_indices = ( - self.gen_data_seq_weighted(mm_num_examples, DATATYPE_2_ID["mm"]) - if mm_num_examples > 0 - else None - ) - audio_indices = ( - self.gen_data_seq_weighted(audio_num_examples, DATATYPE_2_ID["audio"]) - if audio_num_examples > 0 - else None - ) - - lm_base = ( - int( - int(self.modality_ratio[0] * self.modality_interleave) - * self.combine_batch - * self.per_device_train_batch_size - ) - if len(self.modality_ratio) >= 1 - else 0 - ) - mm_base = ( - int( - int(self.modality_ratio[1] * self.modality_interleave) - * self.combine_batch - * self.per_device_train_batch_size - ) - if len(self.modality_ratio) >= 2 - else 0 - ) - audio_base = ( - int( - int(self.modality_ratio[2] * self.modality_interleave) - * self.combine_batch - * self.per_device_train_batch_size - ) - if len(self.modality_ratio) >= 3 - else 0 - ) - - num_batches = math.inf - if lm_indices is not None and lm_base > 0: - num_batches = min(lm_indices.shape[0] // lm_base, num_batches) - if mm_indices is not None and mm_base > 0: - num_batches = min(mm_indices.shape[0] // mm_base, num_batches) - if audio_indices is not None and audio_base > 0: - num_batches = min(audio_indices.shape[0] // 
audio_base, num_batches) - - all_indices = [] - if lm_indices is not None and lm_base > 0: - lm_indices = lm_indices[: num_batches * lm_base, :].reshape( - num_batches, lm_base, -1 - ) - all_indices.append(lm_indices) - if mm_indices is not None and mm_base > 0: - mm_indices = mm_indices[: num_batches * mm_base, :].reshape( - num_batches, mm_base, -1 - ) - all_indices.append(mm_indices) - if audio_indices is not None and audio_base > 0: - audio_indices = audio_indices[: num_batches * audio_base, :].reshape( - num_batches, audio_base, -1 - ) - all_indices.append(audio_indices) - - assert len(all_indices) > 0 - indices = np.concatenate(all_indices, axis=1).reshape( - -1, all_indices[0].shape[-1] - ) - logger.debug( - f"multimodal_data_seq={len(indices)}, example={indices[:10]}, " - f"modality_interleave={self.modality_interleave}, lm-{lm_base}, mm-{mm_base}, audio-{audio_base}" - ) - return indices - - def gen_data_seq_weighted(self, num_examples, data_type=None): - - assert ( - self.load_data_seq is False - ), "需要保证所有epoch的data_seq都从文件加载,否则下次删data_seq无法控住随机性" - logger.debug( - f"generating data sequence... 
#non_consecutive_data_chunks={num_examples}," - f" num_consecutive={self.num_consecutive}" - ) - - if num_examples > 1e5: - logger.debug( - "generating data sequence for very large data, consider use large `num_consecutive`" - ) - - if data_type is not None: - weights = [ - ex.weights for ex in self.inner_dataset.exs if ex.data_type == data_type - ] - exs = [ex for ex in self.inner_dataset.exs if ex.data_type == data_type] - else: - weights = [ex.weights for ex in self.inner_dataset.exs] - exs = self.inner_dataset.exs - assert len(exs) > 0, f"data_type={data_type}, no data found" - total_w = sum(weights) - weights = [w / total_w for w in weights] - - logger.info( - f"using weighted sampler, num_consecutive={self.num_consecutive}:\n" - + "\n".join(["%-100s...%.3e" % (e.path, w) for w, e in zip(weights, exs)]) - ) - - part_indices_gen = {} - indices = [] - for i, ex in enumerate(exs): - sample_size = int(weights[i] * num_examples) - logger.debug( - f"part_data_pre_sampling--[part-{ex.part}]-[sampler-size-{sample_size}]" - ) - assert ex.combine_batch == self.combine_batch - part_indices_gen[ex.part] = ex.sampler() - indices.extend([ex.part] * sample_size) - - logger.debug( - f"shuffle part placeholder index, size={len(indices)}, exmaple={indices[0]}" - ) - if self.shuffle: - self.rng.shuffle(indices) - logger.debug("shuffle done") - indices_ret = [] - logger.debug("build_index from shuffled placeholder") - - for part_id in indices: - epoch, _index = next(part_indices_gen[part_id]) - # combine_batch = max_seqlen (8k) / base_seqlen (1k) - if len(_index) % self.combine_batch != 0: - _index += [-1] * (self.combine_batch - len(_index) % self.combine_batch) - indices_ret += [(part_id, epoch, i) for i in _index] - - if self.shuffle_consecutive and self.combine_batch >= 1: - part_data_gen = defaultdict(lambda: []) - logger.debug("consecutive placeholder 2 shuffle") - for item in indices_ret: - part_data_gen[item[0]].append(item) - logger.debug("consecutive placeholder 2 
shuffle...") - part_data_gen_iter = {} - for key in part_data_gen.keys(): - part_data_gen_iter[key] = iter(part_data_gen[key]) - logger.debug("consecutive placeholder 2 shuffle......") - placeholder_indices = [i[0] for i in indices_ret] - placeholder_indices = [ - placeholder_indices[i : i + self.combine_batch] - for i in range(0, len(placeholder_indices), self.combine_batch) - ] - logger.debug("consecutive placeholder 2 shuffle..........") - self.rng.shuffle(placeholder_indices) - logger.debug("consecutive placeholder 2 shuffle.............") - placeholder_indices = [ - item for sublist in placeholder_indices for item in sublist - ] - logger.debug("consecutive placeholder 2 shuffle................") - indices_ret = [next(part_data_gen_iter[i]) for i in placeholder_indices] - logger.debug("consecutive placeholder 2 shuffle done") - - logger.debug("build index done") - indices = np.array(indices_ret) - del indices_ret - logger.debug(f"num_data_seq={len(indices)}, example={indices[:10]}") - indices = self.roundup_and_shard(indices) - return indices - - def roundup_and_shard(self, indices): - if self.nranks == 1: - logger.info("use use_train_part_sharding, skip padding") - return indices - - padding_size = self.total_size - len(indices) - logger.info( - f"padding-size={padding_size}, total_size={self.total_size} shard={self.local_rank}/{self.nranks}" - ) - if padding_size < 0: - indices = indices[:padding_size] - else: - indices = np.concatenate( - [ - indices, - np.tile(indices, math.ceil(padding_size / len(indices)))[ - :padding_size - ], - ] - ) - - assert len(indices) == self.total_size, (len(indices), self.total_size) - - # subsample - indices = indices[self.local_rank : self.total_size : self.nranks] - assert len(indices) == self.num_samples - return indices - - def __len__(self): - # PaddleNLP expect TypeError for infinite datasets: - # https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/trainer/trainer_utils.py#L515 - raise TypeError - - def 
__iter__(self): - # deterministically shuffle based on epoch and seed - self.rng = random.Random(self.seed + self.epoch + self.global_shuffle_seed) - logger.info(f"seed={self.seed + self.epoch + self.global_shuffle_seed}") - weights = [e.weights for e in self.inner_dataset.exs] - if any([w is None for w in weights]) or sum(weights) == 0.0: - logger.info(f"using normal sampler, num_consecutive={self.num_consecutive}") - indices = self.gen_data_seq() - self.weights = None - else: - self.weights = weights - num_examples = sum([ex.num_examples for ex in self.inner_dataset.exs]) - if self.modality_ratio is not None: - lm_num_examples = sum( - [ - ex.num_examples - for ex in self.inner_dataset.exs - if ex.data_type == DATATYPE_2_ID["lm"] - ] - ) - mm_num_examples = sum( - [ - ex.num_examples - for ex in self.inner_dataset.exs - if ex.data_type == DATATYPE_2_ID["mm"] - ] - ) - audio_num_examples = sum( - [ - ex.num_examples - for ex in self.inner_dataset.exs - if ex.data_type == DATATYPE_2_ID["audio"] - ] - ) - if self.global_shuffle_num_examples > 0: - num_examples = min([self.global_shuffle_num_examples, num_examples]) - if self.modality_ratio is not None: - lm_num_examples = min( - [self.global_shuffle_num_examples, lm_num_examples] - ) - mm_num_examples = min( - [self.global_shuffle_num_examples, mm_num_examples] - ) - audio_num_examples = min( - [self.global_shuffle_num_examples, audio_num_examples] - ) - logger.debug( - f"using global shuffle num examples: {self.global_shuffle_num_examples}" - ) - indices = self.load_data_seq_from_cache() - if indices is None: - indices = ( - self.gen_data_seq_weighted_multimodal( - lm_num_examples, mm_num_examples, audio_num_examples - ) - if self.modality_ratio is not None - else self.gen_data_seq_weighted(num_examples) - ) - - if self.output_dir: - with open( - os.path.join( - self.output_dir, - f"data_seq.epoch{self.epoch}.dp_{self.dp_rank}_of_{self.dp_size}" - f"_shard_{self.local_rank}_of_{self.nranks}.pth", - ), - "wb", - ) 
as of: - pickle.dump(indices, of, protocol=4) - - def ret(): # 无穷长reader。 - # info = paddle.io.get_worker_info() - nonlocal indices - buf = [] - logger.info(f"start training sequence, data-sequence: {indices[:10]}") - while 1: - if self.consumed_samples >= len(indices): - self.consumed_samples -= len(indices) - else: - for i in range(self.consumed_samples, len(indices)): - if len(buf) == self.batch_size: - yield buf - buf = [] - buf.append(indices[i].tolist()) - self.consumed_samples = 0 - self.epoch += 1 - logger.info( - f"epoch done, #data={self.total_size}, reshuffle-sequence: epoch={self.epoch}" - ) - - self.rng = random.Random(self.seed + self.epoch) - if self.weights: - indices = self.load_data_seq_from_cache() - if indices is None: - indices = ( - self.gen_data_seq_weighted_multimodal( - lm_num_examples, mm_num_examples, audio_num_examples - ) - if self.modality_ratio is not None - else self.gen_data_seq_weighted(num_examples) - ) - else: - indices = self.gen_data_seq() - if self.output_dir: - with open( - os.path.join( - self.output_dir, - f"data_seq.epoch{self.epoch}.dp_{self.dp_rank}_of_{self.dp_size}" - f"_shard_{self.local_rank}_of_{self.nranks}.pth", - ), - "wb", - ) as of: - pickle.dump(indices, of, protocol=4) - - return ret() - - def set_epoch(self, epoch=0, consumed_samples=0): - - consumed_samples = consumed_samples // self.dp_size - logger.debug(f"set consumed samples={consumed_samples}, epoch={epoch}") - super().set_epoch(epoch, consumed_samples) - - if isinstance(self.inner_dataset, ExampleSet): - for ex in self.inner_dataset.exs: - if isinstance(ex, ExampleSetSingleDataSource): - ex.epoch = epoch - - class AutoPretrainingTrainer(AutoTrainer): def __init__(self, _shit=None, args=None, model=None, callbacks=[], **kwargs): @@ -1283,8 +754,12 @@ def autocast_smart_context_manager(self): return ctx_manager def _load_optimizer_state(self, checkpoint): + # def _load_moe_optimizer_state(checkpoint): + # opt_moe_suffix = re.sub(r"moe\d\d", "moe00", 
self.args.optimizer_name_suffix) + # return self._load_optimizer_state_of_one_shard(checkpoint, opt_moe_suffix) def _broadcast_moe_optimizer_state(state_dict): + # boardcast_keys base_state_dict = {"master_weights": {}} buf = [ { @@ -1308,7 +783,6 @@ def _broadcast_moe_optimizer_state(state_dict): for k, s in buf[0].items(): v = state_dict.get(k, paddle.zeros(s, "float32")).cuda() v.name = k - # k = k.replace("_fp32_master_0", "") # TODO 这一手replace待品 dist.broadcast(v, src=src_rank, group=group) logger.info(f"broadcast moe optimizer {k} from {src_rank}") base_state_dict[k] = v.cpu() @@ -1402,23 +876,19 @@ def _save_moe_weights(self, output_dir): def evaluate( self, eval_dataset=None, ignore_keys=None, metric_key_prefix: str = "eval" ): - """doc""" + self.model_wrapped.accumulate_steps = self.args.gradient_accumulation_steps eval_dataloader = self.get_eval_dataloader(eval_dataset) start_time = time.time() - # Temporarily disable metric computation, we will do it in the loop here. compute_metrics = self.compute_metrics eval_loop = self.evaluation_loop output = eval_loop( eval_dataloader, description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only prediction_loss_only=True if compute_metrics is None else None, ignore_keys=ignore_keys, - # Only evaluate max_eval_iters max_eval_iters=self.args.eval_iters, ) @@ -1442,7 +912,7 @@ def evaluate( def prediction_pipeline_step( self, model, inputs, prediction_loss_only, ignore_keys ): - """doc""" + loss, _, labels = super().prediction_pipeline_step( model, inputs, prediction_loss_only, ignore_keys ) @@ -1451,48 +921,14 @@ def prediction_pipeline_step( return loss_avg, loss, labels def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: - if self.args.use_dummy_dataset: - return DummySampler( - self.train_dataset, - self.args.per_device_train_batch_size * self.args.combine_batch, - ) - if self.args.use_train_part_sharding: - num_replicas 
= 1 - rank = 0 - else: - num_replicas = self.args.reeao_dataset_world_size - rank = self.args.reeao_dataset_rank - batch_size = self.args.per_device_train_batch_size * self.args.combine_batch - batch_size *= self.args.gradient_accumulation_steps - batch_sampler = WeightedDistributedSamplerAuto( + return PaddleNLPDistributedBatchSampler( self.train_dataset, - batch_size, - self.args.output_dir, - dp_rank=self.args.reeao_dataset_rank, - dp_size=self.args.reeao_dataset_world_size, - num_replicas=num_replicas, - rank=rank, - seed=self.args.seed, - batch_size_warmup_steps=self.args.batch_size_warmup_steps, # used to reesume from ckpt - gradient_accumulation_steps=self.args.gradient_accumulation_steps, - max_gradient_accumulation_steps=self.args.max_gradient_accumulation_steps, - per_device_train_batch_size=self.args.per_device_train_batch_size, - batch_size_warmup_increment=self.args.batch_size_warmup_increment, - shuffle=not self.args.no_shuffle, - drop_last=False, - num_consecutive=self.args.num_consecutive, - combine_batch=self.args.combine_batch, - shuffle_consecutive=self.args.shuffle_consecutive, - global_shuffle_num_examples=self.args.global_shuffle_num_examples, - same_data=self.args.same_data, - modality_ratio=self.args.modality_ratio, - modality_interleave=( - self.args.modality_interleave * self.args.combine_batch - if self.args.modality_interleave - else None - ), + batch_size=self.args.per_device_train_batch_size, + shuffle=False, + num_replicas=self.args.dataset_world_size, + rank=self.args.dataset_rank, + drop_last=self.args.dataloader_drop_last, ) - return batch_sampler def get_train_dataloader(self): @@ -1508,7 +944,7 @@ def get_train_dataloader(self): if self._is_iterable_dataset(train_dataset): return DataLoader( train_dataset, - batch_size=None, + batch_size=None, # we do data collation in Stream collate_fn=self.data_collator, num_workers=self.args.dataloader_num_workers, use_shared_memory=True, @@ -1561,7 +997,6 @@ def create_scheduler(self, 
num_training_steps): self.args.max_steps, min_lr=self.args.min_lr if self.args.min_lr else 0.0, ) - print(f"lr_scheduler : {self.lr_scheduler}") return self.lr_scheduler @@ -1653,7 +1088,7 @@ def expert_fn(p): grad_clip = ClipGradForMOEByGlobalNorm( self.args.max_grad_norm, is_expert_param_func=expert_fn, - moe_group=_get_global_group(), # None 为全局通信组, + moe_group=_get_global_group(), local_clip=False, ) else: @@ -1675,7 +1110,6 @@ def expert_fn(p): def lr_ratio_fn(param): if param.name in self.static_name_to_dyg_name.keys(): name = self.static_name_to_dyg_name[param.name] - # logger.info(f'search {param.name} -> {name}') if self.args.moe_gate_lr_ratio is not None and gate_pattern.match( name ): @@ -1738,24 +1172,13 @@ def _get_mesh(pp_idx=0): meshes = [] if self.args.pipeline_parallel_degree > 1: - if self.args.multimodal: - # `input_ids`, `labels`, `data_id`, `src_id`, `data_type`, `images`, `token_type_ids`, - # `image_type_ids`, `has_images` - meshes.append( - [ - _get_mesh(0), - _get_mesh(-1), - _get_mesh(0), - _get_mesh(0), - _get_mesh(0), - _get_mesh(0), - _get_mesh(0), - _get_mesh(0), - _get_mesh(0), - ] - ) - else: - meshes.append([_get_mesh(0), _get_mesh(-1), _get_mesh(0), _get_mesh(0)]) + # input_ids + meshes.append( + [ + _get_mesh(0), + _get_mesh(-1), + ] + ) # labels meshes.append(_get_mesh(self.args.pipeline_parallel_degree - 1)) else: diff --git a/examples/pre-training/ernie/src/utils/__init__.py b/examples/pre-training/ernie/src/utils/__init__.py index 121653ca..30361d4c 100644 --- a/examples/pre-training/ernie/src/utils/__init__.py +++ b/examples/pre-training/ernie/src/utils/__init__.py @@ -16,7 +16,6 @@ # __all__ = ['logger', 'setup_logger_output_file'] -from .data_utils import * # noqa from .logging import * # noqa from .seed_utils import * # noqa from .training_utils import * # noqa diff --git a/examples/pre-training/ernie/src/utils/data_utils.py b/examples/pre-training/ernie/src/utils/data_utils.py deleted file mode 100644 index 
6f49e4e7..00000000 --- a/examples/pre-training/ernie/src/utils/data_utils.py +++ /dev/null @@ -1,218 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -data utils -""" -import logging -import numpy as np -import os -import datetime -import paddle - -logger = logging.getLogger(__name__) - -DEBUG_PRINT_CNT = 0 - -log_dir = os.getenv("PADDLE_LOG_DIR", "./log") -local_rank = os.getenv("PADDLE_LOCAL_RANK", "0") -date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") -print_data_path = os.path.join( - log_dir, "data_rank_{}_{}.txt".format(local_rank, date_str) -) - - -def print_data_online(msg): - """ - print data online - """ - with open(print_data_path, "a+") as f: - f.write(datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S") + "\n") - f.write(msg + "\n") - - -def pad_sequence(sequences, padding_value=0, fix_len=None): - """Fill sequences(np.ndarray) into a fixed-length matrix.""" - # don't use any paddle.Tensor in collate-fn - # which prevent leakage in multi-process - max_size = sequences[0].shape - trailing_dims = tuple(max_size[1:]) - # print("trailing_dims: ", trailing_dims) - - max_len = max([s.shape[0] for s in sequences]) - if fix_len is not None: - if fix_len < max_len: - logger.warning(f"truncating example from {max_len} to {fix_len}") - max_len = fix_len - out_dims = (len(sequences), max_len) + trailing_dims - out_tensor = np.full(out_dims, 
padding_value, dtype=sequences[0].dtype) - for i, tensor in enumerate(sequences): - tensor = tensor[:max_len] - length = tensor.shape[0] - out_tensor[i, :length, ...] = tensor - return out_tensor - - -DEBUG_PRINT_CNT = 0 - - -def smart_concat(tensor, axis=0): - """_summary_ - - Args: - tensor (_type_): _description_ - axis (int, optional): _description_. Defaults to 0. - - Returns: - _type_: _description_ - """ - if isinstance(tensor[0], paddle.Tensor): - return paddle.concat(tensor, axis=axis) - else: - return np.concatenate(tensor, axis=axis) - - -def merge_fn_group_batch( - tokenizer, - batch, - pad_to_max_seqlen=None, - debug_print=1, - shift_label=False, - combine_batch: int = 1, - image_dtype="bfloat16", - doc_pack_attn=False, -): - """ - batch 内 n合一 - """ - bsz = len(batch) - global DEBUG_PRINT_CNT - if pad_to_max_seqlen and shift_label: - pad_to_max_seqlen += 1 - - keys = list(batch[0].keys()) - - if combine_batch > 1: - _batch = [] - for group in [ - batch[i : i + combine_batch] for i in range(0, len(batch), combine_batch) - ]: - - if "src_id" in group[0]: - src_lst = list(set([b["src_id"] for b in group])) - assert len(src_lst) == 1, f"src_lst: {src_lst}" - - item = {} - for k in keys: - if group[0][k] is None: - item[k] = None - continue - if isinstance(group[0][k], (int, float)): - item[k] = np.stack([i[k] for i in group], 0) - else: - item[k] = np.concatenate([i[k] for i in group]) - _batch.append(item) - batch = _batch - ret = {} - for k in keys: - if isinstance(batch[0][k], (int, float)): - ret[k] = np.stack([b[k] for b in batch], 0) - elif k in ["src_id", "data_id", "data_type"]: - ret[k] = np.concatenate([b[k] for b in batch]) - elif k == "images": - to_concat = [b[k] for b in batch if b[k] is not None] - if len(to_concat) != 0: - assert ( - image_dtype != "bfloat16" - ), f"Currently, not support {image_dtype} for numpy" - ret[k] = np.concatenate(to_concat, axis=0).astype(image_dtype) - else: - ret[k] = None - elif k == "grid_thw" and batch[0][k] 
is not None: - ret[k] = np.concatenate([b[k] for b in batch], axis=0).astype("int64") - if pad_to_max_seqlen: - tmp = max(0, pad_to_max_seqlen * bsz - ret[k].shape[0]) - if tmp > 0: - ret[k] = np.concatenate( - [ret[k], np.zeros([tmp, 3])], axis=0 - ).astype("int64") - elif k in ["audio_input_ids", "audio_labels"]: - to_concat = [b[k] for b in batch if b[k] is not None] - if len(to_concat) != 0: - concat_audio_ids = smart_concat(to_concat) - assert ( - len(concat_audio_ids.shape) == 2 - ), "拼接完的audio_ids必须是2维tensor,且shape=[sum(frames), depth]" - ret[k] = pad_sequence( - [concat_audio_ids], - padding_value=tokenizer.ignored_index, - fix_len=pad_to_max_seqlen * bsz, - )[0] - assert ( - len(ret[k].shape) == 2 - ), "padding完的audio_ids 必须是2维tensor,且shape=[bsz*pad_to_max_seqlen, depth]" - else: - ret[k] = None - else: - if k == "input_ids": - pad_value = tokenizer.pad_token_id - elif k == "labels" or k == "image_type_ids": - pad_value = tokenizer.ignored_index - elif k == "token_type_ids": - pad_value = 0 # pad is also considered as text - else: - pad_value = 0 - - if batch[0][k] is not None: - ret[k] = pad_sequence( - [b[k] for b in batch], - padding_value=pad_value, - fix_len=( - pad_to_max_seqlen - if k != "token_type_ids" - else pad_to_max_seqlen + 1 - ), - ) - - batch = ret - - if DEBUG_PRINT_CNT < debug_print: - DEBUG_PRINT_CNT += 1 - for k, v in batch.items(): - if v is not None and v.dtype == np.float32: # do not show image - v = v.shape - print_data_online( - f"Example={DEBUG_PRINT_CNT} key={k}, " - f"len={len(v[0])if isinstance(v, np.ndarray) and v.ndim > 1 else 0}, " - f"value={v if isinstance(v, np.ndarray) else v}" - ) - - if shift_label: - batch["labels"] = batch["labels"][:, 1:] - batch["input_ids"] = batch["input_ids"][:, :-1] - - if doc_pack_attn: - doc_marks = (batch["input_ids"] == 2).astype(np.int64) - doc_marks[:, -1] = 1 - _offset = np.where(doc_marks.reshape([-1]))[0] - _offset = (_offset + 1).tolist() - offset = np.expand_dims(np.array([0] + 
_offset, dtype=np.int64), axis=0) - offset = pad_sequence( - offset, padding_value=-1, fix_len=batch["input_ids"].shape[1] - ) - batch["inbatch_pack_offset"] = offset - - return batch diff --git a/examples/pre-training/ernie/src/utils/ipc_server.py b/examples/pre-training/ernie/src/utils/ipc_server.py deleted file mode 100644 index a539acf4..00000000 --- a/examples/pre-training/ernie/src/utils/ipc_server.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -IPCServer -""" -import enum -import logging -from dataclasses import dataclass -from multiprocessing import Process, Queue, Lock - - -logger = logging.getLogger(__name__) -logging.getLogger("PIL").setLevel(logging.WARNING) - - -class ServerStatus(enum.Enum): - """ - ServerStatus - """ - - WAIT_RUNNING = 0 - RUNNING = 1 - EXIT_WITH_FAILURE = 2 - EXIT_WITH_CLOSE = 3 - - -class ResponseTag(enum.Enum): - """ - ResponseTag - """ - - SUCCESS = 0 - FAILURE = 1 - - -class ExitFlag: - """ - ExitFlag - """ - - pass - - -@dataclass -class MethodRequest: - """ - MethodRequest - """ - - router_key: object - name: str - args: list - kwargs: dict - - -@dataclass -class AttrRequest: - """ - AttrRequest - """ - - router_key: object - name: str - - -@dataclass -class Response: - """ - Response - """ - - tag: ResponseTag - value: object - exception: Exception - - -def server_loop(init_func, server_idx, server_num, init_queue, send_queue, recv_queue): - """ - server_loop - """ - try: - init_obj = init_func(server_idx, server_num) - init_queue.put( - Response( - tag=ResponseTag.SUCCESS, exception=None, value=ServerStatus.RUNNING - ) - ) - except Exception as e: - logger.exception(e) - init_queue.put( - Response( - tag=ResponseTag.FAILURE, - exception=e, - value=ServerStatus.EXIT_WITH_FAILURE, - ) - ) - return - - while True: - request = send_queue.get() - if isinstance(request, ExitFlag): - break - - try: - value = getattr(init_obj, request.name) - if isinstance(request, MethodRequest): - args = request.args or tuple() - kwargs = request.kwargs or dict() - value = value(*args, **kwargs) - response = Response(tag=ResponseTag.SUCCESS, exception=None, value=value) - except Exception as e: - response = Response(tag=ResponseTag.FAILURE, exception=e, value=None) - print("Exception inside process", e) - - recv_queue.put(response) - - -class SubIPCServer: - """ - SubIPCServer - """ - - def __init__(self, server_idx, server_num, init_func): - """ - __init__ - """ - self.send_queue = 
Queue() - self.recv_queue = Queue() - self.init_queue = Queue() - self.server_status = ServerStatus.WAIT_RUNNING - self.server_idx = server_idx - self.server_num = server_num - self.process = Process( - target=server_loop, - args=( - init_func, - server_idx, - server_num, - self.init_queue, - self.send_queue, - self.recv_queue, - ), - ) - self.process.daemon = True - self.process.start() - self.lock = Lock() - - def wait_started(self): - """ - wait_started - """ - if self.server_status == ServerStatus.RUNNING: - return - elif self.server_status == ServerStatus.WAIT_RUNNING: - init_response = self.init_queue.get() - assert init_response.value in [ - ServerStatus.RUNNING, - ServerStatus.EXIT_WITH_FAILURE, - ], init_response.value - self.server_status = init_response.value - if init_response.value == ServerStatus.EXIT_WITH_FAILURE: - self.server_status = ServerStatus.EXIT_WITH_FAILURE - raise init_response.exception - elif self.server_status == ServerStatus.EXIT_WITH_FAILURE: - raise RuntimeError("IPCServer does not start successfully") - elif self.server_status == ServerStatus.EXIT_WITH_CLOSE: - raise RuntimeError("IPCServer has been closed") - else: - raise RuntimeError(f"Unknown server status {self.server_status}") - - def response(self, request): - """ - response - """ - with self.lock: - self.wait_started() - self.send_queue.put(request) - ret = self.recv_queue.get() - return ret - - def close(self): - """ - close - """ - with self.lock: - if self.process is not None: - self.wait_started() - self.send_queue.put(ExitFlag()) - self.process.join() - self.process = None - self.server_status = ServerStatus.EXIT_WITH_CLOSE - - -class IPCServer: - """ - IPCServer - """ - - def __init__(self, router_groups, init_funcs): - """ - __init__ - """ - server_num = len(init_funcs) - group_num = len(router_groups) - assert server_num == group_num, f"{server_num} vs {group_num}" - assert ( - server_num > 0 - ), f"server_num should be larger than 0, but got {server_num}" - 
self.router_map = {} - self.sub_servers = [None] * server_num - for i, (group, init_func) in enumerate(zip(router_groups, init_funcs)): - sub_server = SubIPCServer(i, server_num, init_func) - for router_key in group: - if router_key in self.router_map: - prev_idx = self.router_map[router_key].server_idx - assert prev_idx == i, f"{router_key}: {prev_idx} vs {i}" - else: - self.router_map[router_key] = sub_server - - def _response(self, request): - """ - _response - """ - server = self.router_map[request.router_key] - response = server.response(request) - if response.exception is not None: - raise response.exception - else: - return response.value - - def call(self, router_key, name, args=tuple(), kwargs=dict()): - """ - IPC call method - """ - request = MethodRequest( - router_key=router_key, name=name, args=args, kwargs=kwargs - ) - return self._response(request) - - def attr(self, router_key, name): - """ - IPC get attribute - """ - request = AttrRequest(router_key=router_key, name=name) - return self._response(request) - - def close(self): - """ - IPC close server - """ - for server in self.sub_servers: - if server is not None: - server.close() diff --git a/examples/pre-training/yamls/pretrain_96_auto.yaml b/examples/pre-training/yamls/pretrain_96_auto.yaml index f2380abb..730f0abf 100644 --- a/examples/pre-training/yamls/pretrain_96_auto.yaml +++ b/examples/pre-training/yamls/pretrain_96_auto.yaml @@ -7,9 +7,6 @@ model_args: model_name_or_path: model_configs_auto/ tokenizer_name: ./ernie/src/tokenizers/tokenizer_model output_dir: ./output/ - data_filelist: conf/filelist_ernie45turbo_tk_m100k_250321.txt.1000 - data_weights: conf/ratio_eb45t_0321 - dev_data: ~/afs_ro/baihua.afs.baidu.com/user/sasd-score/rank-score-total/linjianhe/liuweixin/app/model/data/char-en-65536-v1/v4_corpus_wordseg/newgcc.dev.h5 data_load_process_num: 40 max_seq_length: 4096 base_seq_length: 4096 @@ -22,12 +19,14 @@ model_args: model_config: moe_logging: True moe_use_aux_free: true - 
multi_token_pred_depth: 1 + multi_token_pred_depth: 0 # ---------------------------trainer args-------------------------------------------------# trainer_args: + input_dir: "0.4 ./demo_data/data-1-part0 0.6 ./demo_data/data-1-part0" + split: "998,1,1" loss_spike_settings: enable_loss_spike_watcher: 1 longjob_id: long-78f0ae68688b4659 From d2713231cbdf310e3a6320f000f10a3adeee83be Mon Sep 17 00:00:00 2001 From: xuexixi Date: Fri, 15 Aug 2025 11:57:33 +0800 Subject: [PATCH 04/15] fix rebase bug --- .../ernie/src/callbacks/__init__.py | 11 +- .../src/callbacks/data_trace_callback.py | 251 ------------------ .../ernie/src/datasets/__init__.py | 18 -- .../src/trainers/pretraining_trainer_auto.py | 2 +- .../pre-training/ernie/src/utils/__init__.py | 8 +- 5 files changed, 5 insertions(+), 285 deletions(-) delete mode 100644 examples/pre-training/ernie/src/callbacks/data_trace_callback.py delete mode 100644 examples/pre-training/ernie/src/datasets/__init__.py diff --git a/examples/pre-training/ernie/src/callbacks/__init__.py b/examples/pre-training/ernie/src/callbacks/__init__.py index 51a31a22..b63bf18a 100644 --- a/examples/pre-training/ernie/src/callbacks/__init__.py +++ b/examples/pre-training/ernie/src/callbacks/__init__.py @@ -12,22 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import logging - -from .tensorboard_callback import TensorBoardCallback - from .gc_callback import GCCallback -from .progressive_batching_callback import ProgreesiveBatchingCallback from .logging_callback import LoggingCallback +from .progressive_batching_callback import ProgreesiveBatchingCallback from .stopper_callback import StopperCallback from .adaptivegradclip_callback import ClipGradByAdaptiveNormCallback - from .moe_correction_bias_adjust_callback import MoECorrectionBiasAdjustCallback from .moe_logging_callback import GlobalRNGCallback, MoeLoggingCallback from .sp_grad_sync_callback import SPGradSyncCallback +from .tensorboard_callback import TensorBoardCallback from .fp8_quant_weight_callback import FP8QuantWeightCallback from .ortho_loss_callback import OrthogonalCallback -from .data_trace_callback import DataTraceCallback, DataTraceCallbackAuto __all__ = [ "TensorBoardCallback", @@ -42,6 +37,4 @@ "ClipGradByAdaptiveNormCallback", "StopperCallback", "ProgreesiveBatchingCallback", - "DataTraceCallbackAuto", - "DataTraceCallback", ] diff --git a/examples/pre-training/ernie/src/callbacks/data_trace_callback.py b/examples/pre-training/ernie/src/callbacks/data_trace_callback.py deleted file mode 100644 index b0c99391..00000000 --- a/examples/pre-training/ernie/src/callbacks/data_trace_callback.py +++ /dev/null @@ -1,251 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python3 - -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import logging - -import numpy as np -import paddle -import paddle.distributed as dist -from paddle.distributed import fleet -from paddleformers.trainer.trainer_callback import ( - TrainerCallback, - TrainerControl, - TrainerState, -) -from paddleformers.trainer.training_args import TrainingArguments - -logger = logging.getLogger(__name__) - - -class DataTraceCallback(TrainerCallback): - """Callback 用于DataStatus记录 - - Args: - TrainerCallback (_type_): _description_ - """ - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - - if args.custom_data_status: - custom_trainer_state = TrainerState.load_from_json(args.custom_data_status) - logger.info(f"load custom data status from {args.custom_data_status}") - state.trial_params = custom_trainer_state.trial_params - - if not args.need_data: - self.data_status_shape = paddle.zeros([1], dtype="int32") - if dist.is_initialized(): - logger.info("broadcast data trace callback hook") - dist.broadcast(self.data_status_shape, 0) # 呼应 Line:117 - return - batch_sampler = kwargs["train_dataloader"].batch_sampler - - if state.trial_params is None: - state.trial_params = {} - - if "saved_data_status" not in state.trial_params: - state.trial_params["saved_data_status"] = [ - 0 for _ in range(batch_sampler.max_part_id + 1) - ] - - if "last_start_data_status" not in state.trial_params: - state.trial_params["last_start_data_status"] = [ - 0 for _ in state.trial_params["saved_data_status"] - ] - - if "consumed_samples" not in state.trial_params: - state.trial_params["consumed_samples"] = sum( - state.trial_params["saved_data_status"] - ) - if "global_shuffle_seed" not in state.trial_params: - state.trial_params["global_shuffle_seed"] = 0 - - if not args.same_data: - state.trial_params["last_start_data_status"] = state.trial_params[ - "saved_data_status" - ] - 
state.trial_params["consumed_samples"] = 0 - state.trial_params["global_shuffle_seed"] = ( - state.trial_params["global_shuffle_seed"] + 1 - ) - - logger.debug( - f"Update global_shuffle_seed to {state.trial_params['global_shuffle_seed']}" - ) - logger.debug( - "Due to changes in the underlying data (ratio, number of files, number of dp), \ - the index needs to be rebuilt by resetting the consumed_samplers to 0." - ) - - if not args.ignore_data_skip: - # 进行数据skip - sampler load data_status状态与consumed_samples状态 - batch_sampler.load_data_status( - state.trial_params["last_start_data_status"], - state.trial_params["global_shuffle_seed"], - ) - batch_sampler.set_epoch(0, state.trial_params["consumed_samples"]) - else: - state.trial_params["consumed_samples"] = 0 - state.trial_params["saved_data_status"] = [ - 0 for _ in range(batch_sampler.max_part_id + 1) - ] - state.trial_params["last_start_data_status"] = [ - 0 for _ in range(batch_sampler.max_part_id + 1) - ] - batch_sampler.load_data_status( - state.trial_params["last_start_data_status"], - state.trial_params["global_shuffle_seed"], - ) - batch_sampler.set_epoch(0, state.trial_params["consumed_samples"]) - logger.info("Ignore data skipping and data status") - - state.trial_params["data_status"] = [ - 0 - for _ in range( - max( - batch_sampler.max_part_id + 1, - len(state.trial_params["saved_data_status"]), - ) - ) - ] - self.data_status_shape = paddle.to_tensor( - len(state.trial_params["data_status"]), dtype="int32" - ) - if dist.is_initialized(): - logger.info("broadcast data trace callback hook") - dist.broadcast(self.data_status_shape, 0) - - def on_load_data_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - inputs, - **kwargs, - ): - - if not args.need_data: - return - for part_id in inputs["src_id"]: - state.trial_params["data_status"][part_id] += 1 - - def on_step_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - 
): - - if not args.need_data: - if ( - args.use_hybrid_parallel - and control.should_save - and dist.is_initialized() - and args.pp_need_data_degree - and args.pipeline_parallel_degree > 1 - ): - _hcg = fleet.get_hybrid_communicate_group() - data_status = paddle.zeros( - [self.data_status_shape.item()], dtype="int64" - ) - dist.all_reduce(data_status, group=_hcg.get_pipe_parallel_group()) - return # 呼应 Line:178 - return - - if control.should_save: - data_status = paddle.to_tensor( - state.trial_params["data_status"], dtype="int64" - ) - if dist.is_initialized(): - if args.use_hybrid_parallel: - _hcg = fleet.get_hybrid_communicate_group() - # dp间进行all_reduce - if args.data_parallel_degree > 1: - dist.all_reduce( - data_status, group=_hcg.get_data_parallel_group() - ) - if args.sharding_parallel_degree > 1: - dist.all_reduce( - data_status, group=_hcg.get_sharding_parallel_group() - ) - if args.pp_need_data_degree and args.pipeline_parallel_degree > 1: - dist.all_reduce( - data_status, group=_hcg.get_pipe_parallel_group() - ) - else: - dist.all_reduce(data_status) # + group - logger.debug("All reduced `data_status`") - - _saved_data_status = np.array(state.trial_params["saved_data_status"]) - if len(data_status) > len(_saved_data_status): - # 数据max_part_id变大。 - _saved_data_status = np.append( - _saved_data_status, - np.zeros( - [ - len(data_status) - len(_saved_data_status), - ], - dtype="int64", - ), - ) - - state.trial_params["saved_data_status"] = ( - data_status.numpy() + _saved_data_status - ).tolist() - state.trial_params["consumed_samples"] += sum(data_status.tolist()) - - def on_save( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - - if not args.need_data: - return - state.trial_params["data_status"] = [ - 0 for _ in range(len(state.trial_params["data_status"])) - ] - - -class DataTraceCallbackAuto(DataTraceCallback): - """Callback 用于DataStatus记录 - - Args: - TrainerCallback (_type_): _description_ - """ 
- - def on_load_data_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - inputs, - **kwargs, - ): - - if not args.need_data: - return - for part_id in inputs["input_ids"][3]: # src_id - state.trial_params["data_status"][part_id] += 1 diff --git a/examples/pre-training/ernie/src/datasets/__init__.py b/examples/pre-training/ernie/src/datasets/__init__.py deleted file mode 100644 index b9c4df26..00000000 --- a/examples/pre-training/ernie/src/datasets/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""pretraining task -""" - -from .dist_data_loader import DistDataLoader, DistDataLoaderAuto diff --git a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py index 4c672d50..b583fa14 100644 --- a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py +++ b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py @@ -82,7 +82,7 @@ StopperCallback, ClipGradByAdaptiveNormCallback, ) -from src.datasets import ( +from src.datasets.dist_data_loader import ( DistDataLoaderAuto, ) from paddle.distributed import in_auto_parallel_align_mode diff --git a/examples/pre-training/ernie/src/utils/__init__.py b/examples/pre-training/ernie/src/utils/__init__.py index 30361d4c..edcdc529 100644 --- a/examples/pre-training/ernie/src/utils/__init__.py +++ b/examples/pre-training/ernie/src/utils/__init__.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# from .logging import logger, setup_logger_output_file +from .logging import logger, setup_logger_output_file -# __all__ = ['logger', 'setup_logger_output_file'] - -from .logging import * # noqa -from .seed_utils import * # noqa -from .training_utils import * # noqa +__all__ = ['logger', 'setup_logger_output_file'] From 102705d8f842b2048afdaf23e08d9f5325faf442 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Fri, 15 Aug 2025 11:56:00 +0800 Subject: [PATCH 05/15] refactor pretrain and pretrain_auto --- examples/pre-training/ernie/pretrain_auto.py | 490 +++++------ .../models/ernie/modeling_auto.py | 777 +++--------------- .../models/ernie/modeling_auto_pp.py | 28 +- 3 files changed, 355 insertions(+), 940 deletions(-) diff --git a/examples/pre-training/ernie/pretrain_auto.py b/examples/pre-training/ernie/pretrain_auto.py index 75388307..ab4299bc 100644 --- a/examples/pre-training/ernie/pretrain_auto.py +++ b/examples/pre-training/ernie/pretrain_auto.py @@ -12,25 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import os +import random import time -import json +from typing import Dict, Any + import numpy as np -import random import paddle -import paddle.distributed.fleet as fleet -from src.utils import logger -from paddleformers.trainer import ( - PdArgumentParser, - get_last_checkpoint, -) -from src.tokenizers.tokenization_eb_v2 import ErnieBotTokenizer -from omegaconf.listconfig import ListConfig -from omegaconf.dictconfig import DictConfig -from src.callbacks import ( - ProgreesiveBatchingCallback, - GlobalRNGCallback, -) +from omegaconf import ListConfig, DictConfig +from paddle.distributed.fleet import fleet, collective_perf + +from paddleformers.trainer import PdArgumentParser, get_last_checkpoint + +from config import get_config from models.ernie import ( ErnieForCausalLMAuto, ErnieForCausalLMAutoPP, @@ -39,227 +34,129 @@ ErnieConfig, ErnieMoEConfig, ) +from pretrain import create_pretrained_dataset +from src.callbacks import GlobalRNGCallback +from src.tokenizers.tokenization_eb_v2 import ErnieBotTokenizer from src.trainers import AutoPretrainingTrainer, AutoPreTrainingArguments -from src.utils import ( - setup_logger_output_file, -) +from src.utils import logger, setup_logger_output_file from src.utils.misc import global_training_logs -from pretrain import create_pretrained_dataset - - -from config import get_config - -try: - from paddleformers.trainer.trainer_utils import log_trainer_start -except ImportError: - - def log_trainer_start(): - """Print main process messgae""" - if "MAIN_PROCESS_STARTED" not in os.environ: - start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - logger.info( - f"The Training Main Process Started Successfully. time: {start_time}, pid: {os.getpid()}" - ) - os.environ["MAIN_PROCESS_STARTED"] = "1" -log_trainer_start() - +def log_trainer_start(): + if "MAIN_PROCESS_STARTED" not in os.environ: + start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + logger.info( + f"Training Main Process Started. 
time: {start_time}, pid: {os.getpid()}" + ) + os.environ["MAIN_PROCESS_STARTED"] = "1" -try: - from paddle.distributed.fleet import monitor_perf as collective_perf -except ImportError: - from paddle.distributed.fleet import collective_perf +def format_config_value(v): + if isinstance(v, (ListConfig, DictConfig)): + return list(v) if isinstance(v, ListConfig) else dict(v) + return v -assert paddle.version.mkl() == "OFF", ( - "MKL is not supported" - " in this version. Please set -DWITH_MKL=OFF when compiling PaddlePaddle." -) - -def update_model_config_from_args(config: ErnieConfig, model_args: dict): +def update_model_config_from_args( + config: ErnieConfig, model_args: Dict[str, Any] +) -> ErnieConfig: for k, v in model_args.items(): if hasattr(config, k): - logger.info(f"update model config: {k} = {v}") + logger.info(f"Updating model config: {k} = {v}") setattr(config, k, v) else: - logger.warning(f"model config key: {k} does not exist") + logger.warning(f"Model config key '{k}' does not exist") return config -def init_parameter(model): - +def init_parameters(model): for param in model.parameters(): param.initialize() + model.apply(model.init_weights) -def main(): - """Main function""" - config = get_config(verbose=True) - os.makedirs(config.model_args.output_dir, exist_ok=True) - parser = PdArgumentParser(AutoPreTrainingArguments) - if not hasattr(config.trainer_args, "pipeline_parallel_config"): - config.trainer_args.pipeline_parallel_config = "" - - if "enable_dp_comm_overlap" in config.trainer_args.pipeline_parallel_config: - logger.warning( - "Pipeline dp_comm_overlap and FusedLinearWithGradAdd can not be used at " - "the same time." 
- ) - - if "enable_timer" in config.trainer_args.pipeline_parallel_config: - from paddle.distributed.fleet.meta_parallel.pipeline_parallel import ( - PipelineParallel, - ) - - PipelineParallel.timer_printer = lambda _: None - - def formatv(v): - if isinstance(v, ListConfig): - return list(v) - elif isinstance(v, DictConfig): - return dict(v) - return v - - model_args = {k: formatv(v) for k, v in dict(config.model_args).items()} - trainer_args = {k: formatv(v) for k, v in dict(config.trainer_args).items()} - (args,) = parser.parse_dict(dict(**model_args, **trainer_args)) - - if args.strategy.pipeline.enable and args.virtual_pp_degree > 1: - pipeline = args.strategy.pipeline - pipeline.vpp_degree = args.virtual_pp_degree - pipeline.vpp_seg_method = args.virtual_pipeline_seg_method - - if args.modality_ratio is not None: - args.modality_interleave = ( - sum(args.modality_ratio) - if args.modality_interleave == "acc" - else sum(args.modality_ratio) * args.gradient_accumulation_steps - ) - args.modality_ratio = [ - i / sum(args.modality_ratio) for i in args.modality_ratio - ] - - args.eval_iters = 10 - args.test_iters = args.eval_iters * 10 - - args.use_moe = dict(**dict(config.model_args), **dict(config.trainer_args)).get( - "use_moe", False - ) - model_config = dict(getattr(config.model_args, "model_config", {})) - model_config = {k: formatv(v) for k, v in model_config.items()} - logger.info(f"model_config_from_yaml: {json.dumps(model_config, indent=4)}") - setup_logger_output_file(config.model_args.output_dir, args.local_rank) +def setup_device_and_seed(args): paddle.set_device(args.device) - np.random.seed(args.seed) random.seed(args.seed) paddle.seed(args.seed) - # set_seed(args.seed) + +def check_memory_preallocation(args): prop = paddle.device.cuda.get_device_properties() - if prop.total_memory < args.pre_alloc_memory * 1024 * 1024 * 1024: - logger.warning( - "Invalid value for `pre_alloc_memory`, so pre-allocating just failed." 
- ) + if prop.total_memory < args.pre_alloc_memory * (1024**3): + logger.warning("Invalid value for `pre_alloc_memory`, pre-allocation failed.") elif args.pre_alloc_memory > 0: logger.warning( - f"pre-allocating a tensor whose memory capacity is {args.pre_alloc_memory} GB " - "and then release it." + f"Pre-allocating a tensor {args.pre_alloc_memory}GB memory and then release it" ) - memory_size = int(args.pre_alloc_memory * 1024 * 1024 * 1024) + memory_size = int(args.pre_alloc_memory * 1024**3) x = paddle.empty([memory_size], dtype=paddle.uint8) del x - # add fleet test + +def run_fleet_tests(): try: - collective_perf( - "allgather", - round=50, - size_and_time={67108864: 0.00625, 234881024: 0.02, 637534208: 0.057}, - ) - logger.info("======monitor allgather done!=======\n") - collective_perf( - "allreduce", - round=50, - size_and_time={67108864: 0.02, 134217728: 0.038, 268435456: 0.075}, - ) - logger.info("======monitor allreduce done!=======\n") + tests = [ + ("allgather", {67108864: 0.00625, 234881024: 0.02, 637534208: 0.057}), + ("allreduce", {67108864: 0.02, 134217728: 0.038, 268435456: 0.075}), + ] + for test_name, size_time_map in tests: + collective_perf(test_name, round=50, size_and_time=size_time_map) + logger.info(f"======monitor {test_name} done!=======\n") except Exception as e: - logger.warning(f"fleet test unexcepted error! skip exception[{e}]...") - - # Detecting last checkpoint. - last_checkpoint = None - if ( - os.path.isdir(args.output_dir) - and args.do_train - and not args.overwrite_output_dir - ): - last_checkpoint = get_last_checkpoint(args.output_dir) - if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0: - raise ValueError( - f"Output directory ({args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Define the metrics of tasks. - def compute_metrics(p): - preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions - - output = paddle.to_tensor(preds) - labels = paddle.to_tensor(p.label_ids) - output = [t.astype("float32").cuda() for t in output] - labels = [t[t != tokenizer.ignored_index] for t in labels] - labels = [t.cuda() for t in labels] - all_numel = ( - (paddle.concat(labels, 0) != tokenizer.ignored_index).astype("int64").sum() - ) - ignored = (paddle.concat(labels, 0) == -100).astype("int64").sum() - labels = all_numel - ignored - output = sum(output) - logger.info(f"output : {output.item()}, labels : {labels.item()}") - nll_loss = output / (labels + 1.0e-6) # nll_loss is global loss - ppl = paddle.exp(nll_loss) - - return { - "nll_loss": nll_loss.item(), - "ppl": ppl.item(), - "num_token": labels.item(), - } - - # model - dtype = "float32" - if args.fp16 and args.fp16_opt_level == "O2": - paddle.set_default_dtype("float16") - dtype = "float16" - elif args.bf16: - paddle.set_default_dtype("bfloat16") - dtype = "bfloat16" + logger.warning(f"Fleet test error: {e}, skipping...") + + +def compute_metrics(p, tokenizer): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + output = paddle.to_tensor(preds) + labels = paddle.to_tensor(p.label_ids) + + output = [t.astype("float32").cuda() for t in output] + labels = [t[t != tokenizer.ignored_index].cuda() for t in labels] + + all_numel = ( + (paddle.concat(labels, 0) != tokenizer.ignored_index).astype("int64").sum() + ) + ignored = (paddle.concat(labels, 0) == -100).astype("int64").sum() + valid_tokens = all_numel - ignored + + total_output = sum(output) + nll_loss = total_output / (valid_tokens + 1e-6) + ppl = paddle.exp(nll_loss) - if args.use_moe: - global ErnieConfig, ErnieForCausalLMAuto - ErnieConfig = ErnieMoEConfig + 
logger.info(f"Output: {output[0].item()}, Valid tokens: {valid_tokens.item()}") + return { + "nll_loss": nll_loss.item(), + "ppl": ppl.item(), + "num_token": valid_tokens.item(), + } + + +def setup_model_config(args, model_config): + config_cls = ErnieMoEConfig if args.use_moe else ErnieConfig if args.moe_group.lower() in {"mp", "tp", "model", "dummy"}: logger.info(f"disable moe flag when using moe-group={args.moe_group}") args.use_moe = False - args.multi_token_pred_depth = model_config.get("multi_token_pred_depth", 0) + cfg = config_cls.from_pretrained(args.model_name_or_path) + + update_params = { + "seqlen": args.max_seq_length, + "token_balance_seqlen": args.max_seq_length * args.per_device_train_batch_size, + "fp16_opt_level": args.fp16_opt_level, + "moe_group": args.moe_group, + "dtype": get_dtype(args), + "pipeline_parallel_degree": args.pipeline_parallel_degree, + "virtual_pp_degree": args.virtual_pp_degree, + "micro_batch_size": args.per_device_train_batch_size, + } + + for key, value in update_params.items(): + setattr(cfg, key, value) - cfg = ErnieConfig.from_pretrained(args.model_name_or_path) - cfg.seqlen = args.max_seq_length - cfg.token_balance_seqlen = args.max_seq_length * args.per_device_train_batch_size - cfg.fp16_opt_level = args.fp16_opt_level - cfg.moe_group = args.moe_group - cfg.dtype = dtype - cfg.pipeline_parallel_degree = args.pipeline_parallel_degree - cfg.virtual_pp_degree = args.virtual_pp_degree if args.tensor_parallel_degree > 1: cfg.sequence_parallel = args.sequence_parallel cfg.tensor_parallel_degree = max( @@ -273,64 +170,164 @@ def compute_metrics(p): cfg.tensor_parallel_degree = 1 cfg.tensor_parallel_rank = 0 - cfg.micro_batch_size = args.per_device_train_batch_size + return update_model_config_from_args(cfg, model_config) + + +def get_dtype(args): + if args.fp16 and args.fp16_opt_level == "O2": + return "float16" + if args.bf16: + return "bfloat16" + return "float32" + + +def set_dtype(args): + if args.fp16 and 
args.fp16_opt_level == "O2": + paddle.set_default_dtype("float16") + if args.bf16: + paddle.set_default_dtype("bfloat16") + return + + +def get_model_class(args): + if args.model_type == "ernie": + return ErnieForCausalLMAuto + if args.model_type == "ernie_pp": + return ErnieForCausalLMAutoPP + raise ValueError(f"Unsupported model_type: {args.model_type}") + + +def setup_tokenizer(args, config): tokenizer = ErnieBotTokenizer.from_pretrained(args.tokenizer_name) - tokenizer.ignored_index = cfg.ignored_index + tokenizer.ignored_index = config.ignored_index logger.info( - f"using tokenizer={type(tokenizer)}, bos:{tokenizer.bos_token_id} " - f"eos:{tokenizer.eos_token_id} pad:{tokenizer.pad_token_id} " + f"Using tokenizer={type(tokenizer)}, bos:{tokenizer.bos_token_id} " + f"eos:{tokenizer.eos_token_id} pad:{tokenizer.pad_token_id}" ) + return tokenizer - cfg = update_model_config_from_args(cfg, model_config) - if args.model_type == "ernie": - model_class = ErnieForCausalLMAuto - elif args.model_type == "ernie_pp": - model_class = ErnieForCausalLMAutoPP - else: - raise ValueError(f"not support model_type: {args.model_type}") +def get_checkpoint(args, output_dir): + if not os.path.isdir(output_dir) or not args.do_train or args.overwrite_output_dir: + return None + + last_checkpoint = get_last_checkpoint(output_dir) + if last_checkpoint is None and len(os.listdir(output_dir)) > 0: + raise ValueError( + f"Output directory ({output_dir}) exists and is not empty. " + "Use --overwrite_output_dir to train from scratch." + ) + if last_checkpoint is not None and args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. " + "To avoid this, change --output_dir or add --overwrite_output_dir." 
+ ) + + return args.resume_from_checkpoint or last_checkpoint + + +def setup_pipeline_config(args): + if "enable_dp_comm_overlap" in args.pipeline_parallel_config: + logger.warning( + "Pipeline dp_comm_overlap and FusedLinearWithGradAdd cannot be used together." + ) + if "enable_timer" in args.pipeline_parallel_config: + from paddle.distributed.fleet.meta_parallel.pipeline_parallel import ( + PipelineParallel, + ) + + PipelineParallel.timer_printer = lambda _: None + if args.strategy.pipeline.enable and args.virtual_pp_degree > 1: + pipeline = args.strategy.pipeline + pipeline.vpp_degree = args.virtual_pp_degree + pipeline.vpp_seg_method = args.virtual_pipeline_seg_method + return args - if args.from_scratch: - with paddle.LazyGuard(): + +def main(): + # 1. init config and parse arg + config = get_config(verbose=True) + if not hasattr(config.trainer_args, "pipeline_parallel_config"): + config.trainer_args.pipeline_parallel_config = "" + os.makedirs(config.model_args.output_dir, exist_ok=True) + + model_args = {k: format_config_value(v) for k, v in dict(config.model_args).items()} + trainer_args = { + k: format_config_value(v) for k, v in dict(config.trainer_args).items() + } + parser = PdArgumentParser(AutoPreTrainingArguments) + (args,) = parser.parse_dict(dict(**model_args, **trainer_args)) + + # 2. check and update + # setup_pipeline_config(config.trainer_args) + if "enable_dp_comm_overlap" in config.trainer_args.pipeline_parallel_config: + logger.warning( + "Pipeline dp_comm_overlap and FusedLinearWithGradAdd cannot be used together." 
+ ) + + if "enable_timer" in config.trainer_args.pipeline_parallel_config: + from paddle.distributed.fleet.meta_parallel.pipeline_parallel import ( + PipelineParallel, + ) + + PipelineParallel.timer_printer = lambda _: None + + if args.strategy.pipeline.enable and args.virtual_pp_degree > 1: + pipeline = args.strategy.pipeline + pipeline.vpp_degree = args.virtual_pp_degree + pipeline.vpp_seg_method = args.virtual_pipeline_seg_method + + args.use_moe = dict(**dict(config.model_args), **dict(config.trainer_args)).get( + "use_moe", False + ) + args.eval_iters = 10 + args.test_iters = args.eval_iters * 10 + args.enable_delay_scale_loss = ( + "enable_delay_scale_loss" in config.trainer_args.pipeline_parallel_config + ) + + # 3. set log and device + setup_logger_output_file(config.model_args.output_dir, args.local_rank) + setup_device_and_seed(args) + check_memory_preallocation(args) + run_fleet_tests() # liyamei not need? + set_dtype(args) + + # 4. init model + model_config = { + k: format_config_value(v) + for k, v in dict(getattr(config.model_args, "model_config", {})).items() + } + logger.info(f"Model config from YAML: {json.dumps(model_config, indent=4)}") + cfg = setup_model_config(args, model_config) + model_class = get_model_class(args) + tokenizer = setup_tokenizer(args, cfg) + + with paddle.LazyGuard(): + if args.from_scratch: model = model_class(cfg) - else: - with paddle.LazyGuard(): - model = model_class.from_pretrained( - args.model_name_or_path, - config=cfg, - ) - - cfg = model.config - logger.info(f"using model type:{type(model)}") - paddle.set_default_dtype("float32") + else: + model = model_class.from_pretrained(args.model_name_or_path, config=cfg) - logger.info(f"using model={type(model)}, cfg={cfg}") + logger.info(f"Using model: {type(model)}, config: {model.config}") + paddle.set_default_dtype("float32") - freeze_config = set(args.freeze_config.split(" ")) + # freeze # liyamei not need? 
+ freeze_config = set(args.freeze_config.split()) if "freeze_vision" in freeze_config and hasattr(model, "freeze_vision"): - logger.info("Freeze model vision module") + logger.info("Freezing model vision module") model.freeze_vision() - # data - logger.info("loading data...") + # 5. dataset + logger.info("Loading datasets...") train_dataset, eval_dataset, test_dataset, data_collator = ( create_pretrained_dataset(args) ) - callbacks = [] - callbacks += [GlobalRNGCallback()] - - if args.batch_size_warmup_steps: - progreesive_batcing_callback = ProgreesiveBatchingCallback( - args.gradient_accumulation_steps, - args.max_gradient_accumulation_steps, - args.batch_size_warmup_steps, - args.batch_size_warmup_increment, - ) - callbacks.append(progreesive_batcing_callback) + # 6. prepare for train/eval + callbacks = [GlobalRNGCallback()] + init_parameters(model) - init_parameter(model) - model.apply(model.init_weights) trainer = AutoPretrainingTrainer( model=model, args=args, @@ -338,17 +335,14 @@ def compute_metrics(p): train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, - compute_metrics=compute_metrics, + compute_metrics=lambda p: compute_metrics(p, tokenizer), callbacks=callbacks, ) + global_training_logs.accumulate = args.gradient_accumulation_steps - checkpoint = None - if args.resume_from_checkpoint is not None: - checkpoint = args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint + checkpoint = get_checkpoint(args, args.output_dir) - # Training + # 7.1 train if args.do_train: train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics @@ -357,11 +351,17 @@ def compute_metrics(p): trainer.save_metrics("train", metrics) trainer.save_state() - # Evaluate and tests model + # 7.2 eval if args.do_eval: eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) if __name__ == "__main__": + log_trainer_start() + assert paddle.version.mkl() == "OFF", ( + "MKL 
is not supported in this version. " + "Please set -DWITH_MKL=OFF when compiling PaddlePaddle." + ) + main() diff --git a/examples/pre-training/models/ernie/modeling_auto.py b/examples/pre-training/models/ernie/modeling_auto.py index f297ced4..78344fb2 100644 --- a/examples/pre-training/models/ernie/modeling_auto.py +++ b/examples/pre-training/models/ernie/modeling_auto.py @@ -20,11 +20,6 @@ import contextlib import inspect -try: - from fast_ln import fast_ln -except ImportError: - fast_ln = None - from copy import deepcopy from dataclasses import dataclass import numpy as np @@ -35,10 +30,6 @@ from paddle.distributed import fleet from paddle.distributed.fleet.utils import recompute from paddle.distributed.fleet.layers.mpu.random import get_rng_state_tracker -from paddle.incubate.nn.memory_efficient_attention import ( - memory_efficient_attention, - BlockDiagonalCausalMask, -) from paddle.distributed import in_auto_parallel_align_mode from models.comm_utils import subbatch @@ -61,7 +52,19 @@ from paddleformers.transformers.model_utils import PretrainedModel, register_base_model -from models.ernie.modeling import FusedDropoutImpl +from models.ernie.modeling import ( + FusedDropoutImpl, + RotaryEmbedding, + RMSNorm, + get_triangle_upper_mask, + mem_eff_attn, + inbatch_pack_offset_to_attn_mask_start_row_indices, + _make_causal_mask, + _expand_mask, +) +from models.ernie.modeling_moe import ( + ErnieMoeMLPFused, +) from models.sequence_parallel_utils_auto import ( sequence_parallel_sparse_mask_labels, ) @@ -93,6 +96,7 @@ class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): logger = logging.getLogger(__name__) + try: from paddle.nn.functional.flash_attention import flash_attention @@ -128,12 +132,12 @@ class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): to_block_diag_causal_mask = None try: - import fused_ln as fused + from fast_ln import fast_ln except ImportError: logger.warning( - "fused-ln not found, run `python 
src/ops/fused_ln_setup.py install` to build fused ln" + "fast-ln not found, run `python src/ops/fast_ln_setup.py install` to build fast ln" ) - fused = None + fast_ln = None try: from paddle.incubate.nn.functional import ( @@ -164,16 +168,10 @@ class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): ) -def is_pp_enable(): - - mesh = fleet.auto.get_mesh() - return "pp" in mesh.dim_names - - def global_mesh_starts_with_pp(): mesh = fleet.auto.get_mesh() - if is_pp_enable(): + if "pp" in mesh.dim_names: return mesh.get_mesh_with_dim("pp") else: return mesh @@ -193,59 +191,6 @@ def is_fleety_func(): IS_FLEETY = is_fleety_func() -def get_triangle_upper_mask(x, mask=None): - - if mask is not None: - return mask - # [bsz, n_head, q_len, kv_seq_len] - shape = x.shape - # [bsz, 1, q_len, kv_seq_len] - shape[1] = 1 - mask = paddle.full(shape, -np.inf, dtype=x.dtype) - mask.stop_gradient = True - mask = paddle.triu(mask, diagonal=1) - mask.stop_gradient = True - return mask - - -def naive_fuse_split_tp( - weight, - tensor_parallel_degree, - tensor_parallel_rank=None, - is_column=True, - fuse_tensor_parts=2, -): - - logging.info(f"spliting fused-ffn: {weight.shape}") - axis = -1 if is_column else 0 - splited = np.split(weight, fuse_tensor_parts * tensor_parallel_degree, axis=axis) - return np.concatenate( - splited[tensor_parallel_rank::tensor_parallel_degree], axis=axis - ) - - -def parallel_matmul( - x, - y, - bias=None, - transpose_y=False, - tensor_parallel_degree=1, - tensor_parallel_output=True, -): - - if transpose_y: - logits = paddle.matmul(x, y, transpose_y=True) - if bias is not None: - logits += bias - else: - logits = F.linear(x, y, bias) - - if tensor_parallel_degree > 1 and not tensor_parallel_output: - logits = dist.reshard(logits, get_mesh(-1), [dist.Shard(0), dist.Replicate()]) - - return logits - - def calc_lm_head_logits( config, hidden_states, @@ -292,105 +237,16 @@ def calc_lm_head_logits( ) if tensor_parallel_output is None: 
tensor_parallel_output = config.tensor_parallel_output - logits = parallel_matmul( - hidden_states, - weight, - bias=bias, - transpose_y=config.tie_word_embeddings, - tensor_parallel_degree=config.tensor_parallel_degree, - tensor_parallel_output=tensor_parallel_output, + logits = paddle.matmul( + hidden_states, weight, transpose_y=config.tie_word_embeddings ) + if bias is not None: + logits += bias - return logits - - -def finfo(dtype: paddle.dtype = None): - - if dtype is None: - dtype = paddle.get_default_dtype() - - if dtype == paddle.bfloat16: - - class BFloatFInfo: - """ - Numpy do not support `np.finfo(np.uint16)`, so try to construct a finfo object to fetch min value - """ - - min = -3.3895313892515355e38 - - return BFloatFInfo - if dtype == paddle.float32: - return np.finfo(np.float32) - if dtype == paddle.float16: - return np.finfo(np.float16) - if dtype == paddle.float64: - return np.finfo(np.float64) - - -def masked_fill(x, mask, value): - - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - -def mem_eff_attn( - query, key, value, pack_offset, drop_prob=0.0, dtype=paddle.bfloat16, training=True -): + if config.tensor_parallel_degree > 1 and not tensor_parallel_output: + logits = dist.reshard(logits, get_mesh(-1), [dist.Shard(0), dist.Replicate()]) - pack_offset = pack_offset.numpy() - shape = pack_offset.shape - assert len(shape) == 2, len(shape) - assert shape[0] == 1, shape[0] - n = pack_offset.size - pack_offset = pack_offset.flatten() - seqlens = [] - assert pack_offset[0] == 0, pack_offset[0] - for i in range(1, n): - if pack_offset[i] < 0: - break - cur = pack_offset[i] - pack_offset[i - 1] - assert cur > 0 - seqlens.append(cur) - - assert drop_prob == 0.0, drop_prob - assert dtype == paddle.bfloat16, dtype - - def cast(x): - return x.astype(dtype) if x.dtype != dtype else x - - if len(seqlens) == 1: - out, _ = flash_attention( - query, key, value, drop_prob, causal=True, training=training - ) - else: - mask = 
BlockDiagonalCausalMask.from_seqlens(seqlens) - out = memory_efficient_attention( - cast(query), - cast(key), - cast(value), - attn_bias=mask, - p=drop_prob, - training=training, - ) - return out - - -def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offset): - """convert inbatch_pack_offset to attn_mask_start_row_indices""" - inbatch_pack_offset = inbatch_pack_offset.numpy() - attn_mask_row_start_indices = [] - min_start_row = np.inf - for bidx in range(inbatch_pack_offset.shape[0]): - item = inbatch_pack_offset[bidx] - cumsum_item = item[item != -1] - record_lens = cumsum_item[1:] - cumsum_item[0:-1] - min_start_row = min(cumsum_item[1], min_start_row) - row_start_indices = np.repeat(cumsum_item[1:], record_lens) - attn_mask_row_start_indices.append(row_start_indices[None, None, ...]) - attn_mask_row_start_indices = np.concatenate(attn_mask_row_start_indices, axis=0) - return paddle.to_tensor(attn_mask_row_start_indices, dtype=paddle.int32), int( - min_start_row - ) + return logits def scaled_dot_product_attention( @@ -529,7 +385,8 @@ def scaled_dot_product_attention( attn_weights = paddle.maximum( attn_weights, paddle.to_tensor( - float(finfo(query_states.dtype).min), dtype=query_states.dtype + float(paddle.finfo(query_states.dtype).min), + dtype=query_states.dtype, ), ) @@ -573,65 +430,6 @@ def scaled_dot_product_attention( return attn_output, None -def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): - """ - Make causal mask used for self-attention. 
- """ - batch_size, target_length = input_ids_shape - - mask = paddle.full((target_length, target_length), float(finfo(dtype).min)) - - mask_cond = paddle.arange(mask.shape[-1]) - mask = masked_fill( - mask, mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]), 0 - ) - - if past_key_values_length > 0: - mask = paddle.concat( - [paddle.zeros([target_length, past_key_values_length]), mask], axis=-1 - ) - - return mask[None, None, :, :].expand( - [batch_size, 1, target_length, target_length + past_key_values_length] - ) - - -def _expand_mask(mask, dtype, tgt_length): - """ - Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. - """ - if mask.ndim == 4: - expanded_mask = mask - elif mask.ndim == 3: - expanded_mask = mask[:, None, :, :] - else: - batch_size, src_length = mask.shape[0], mask.shape[-1] - tgt_length = tgt_length if tgt_length is not None else src_length - - expanded_mask = mask[:, None, None, :].expand( - [batch_size, 1, tgt_length, src_length] - ) - - inverted_mask = 1.0 - expanded_mask - return masked_fill( - inverted_mask, inverted_mask.cast("bool"), float(finfo(dtype).min) - ) - - -def slice_experts(experts, moe_world_size): - moe_num_experts_per_device = len(experts) // moe_world_size - experts_per_device = [[] for _ in range(moe_world_size)] - - for i, expert in enumerate(experts): - ep_group_id = i // moe_num_experts_per_device - experts_per_device[ep_group_id].append(expert) - - lm_experts = nn.LayerList([]) - for experts_list in experts_per_device: - lm_experts.extend(experts_list[: moe_num_experts_per_device // 2]) - return lm_experts - - def get_gate( config: ErnieMoEConfig, expert: Tuple[Tuple[int, nn.Layer]], @@ -680,9 +478,6 @@ def get_gate( config, layer_idx=layer_idx, group=config.moe_group, ipp=ipp ) - lm_gate, lm_experts = None, None - logger.info(f"LM-experts-{lm_experts} -- experts-{experts}") - index = 0 if config.moe_group == "dp" else 1 ep_sub_meshes = 
dist.auto_parallel.api.split_mesh(get_mesh(ipp), index) @@ -694,329 +489,16 @@ def get_gate( ) experts[i].ep_group_id = ep_group_id - return gate, experts, lm_gate, lm_experts - - -def _parse_moe_group(moe_group: str): - moe_group = moe_group.lower() - assert moe_group in { - "dp", - "mp", - "none", - }, f"moe-group not supported, got: {moe_group}" - logger.info(f"using moe-group: {moe_group}") + return gate, experts - return moe_group - -class RMSNorm(nn.Layer): - """ - RMSNorm is a variant of layer normalization. - """ - - def __init__(self, config, ipp=0): - super().__init__() - self.hidden_size = config.hidden_size - self.weight = paddle.create_parameter( - shape=[self.hidden_size], - dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(1.0), - ) - self.variance_epsilon = config.rms_norm_eps - self.config = config - - def forward(self, hidden_states): - - if self.config.fuse_rms_norm: - return fused.fused_rms_norm( - hidden_states, self.weight, self.variance_epsilon - )[0] - if paddle.in_dynamic_mode(): - with paddle.amp.auto_cast(False): - variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) - hidden_states = ( - paddle.rsqrt(variance + self.variance_epsilon) * hidden_states - ) - else: - variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) - hidden_states = ( - paddle.rsqrt(variance + self.variance_epsilon) * hidden_states - ) - - if self.weight.dtype in [paddle.float16, paddle.bfloat16]: - hidden_states = paddle.cast(hidden_states, self.weight.dtype) - return hidden_states * self.weight - - -class LayerNorm(nn.LayerNorm): - """ - layer normalization. 
- """ - - def __init__(self, config, ipp=0): +class FastLayerNorm(nn.LayerNorm): + def __init__(self, config): + assert fast_ln is not None super().__init__(config.hidden_size, epsilon=config.rms_norm_eps) - self.use_fast_ln = config.use_fast_ln - if self.use_fast_ln: - assert fast_ln is not None - self.ipp = ipp - if config.pipeline_parallel_degree > 1: - self.weight = dist.shard_tensor( - self.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] - ) - self.bias = dist.shard_tensor( - self.bias, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] - ) - - def forward(self, hidden_states): - """ - The layer normalization operator. - """ - if self.use_fast_ln: - return fast_ln(hidden_states, self.weight, self.bias, self._epsilon)[0] - else: - return super().forward(hidden_states) - - -class FusedLayerNorm(nn.Layer): - """ - FusedLayerNorm is a variant of layer normalization. - """ - - def __init__(self, config, ipp=0): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.weight = paddle.create_parameter( - shape=[self.hidden_size], - dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(1.0), - ) - self.bias = paddle.create_parameter( - shape=[self.hidden_size], dtype=paddle.get_default_dtype(), is_bias=True - ) - self.variance_epsilon = config.rms_norm_eps - self.ipp = ipp - if config.pipeline_parallel_degree > 1: - self.weight = dist.shard_tensor( - self.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] - ) - self.bias = dist.shard_tensor( - self.bias, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] - ) - def forward(self, hidden_states): - - return fused.fused_ln( - hidden_states, self.weight, self.bias, self.variance_epsilon - )[0] - - -class RotaryEmbedding(nn.Layer): - r""" - RotaryEmbedding Layer - """ - - def __init__(self, dim, max_position_embeddings=4096, base=10000): - - super().__init__() - # dtype = paddle.get_default_dtype() - self.base = base - 
self.max_position_embeddings = max_position_embeddings - inv_freq = 1.0 / ( - base ** (paddle.cast(paddle.arange(0, dim, 2), dtype="float32") / dim) - ) - - # self.register_buffer("inv_freq", inv_freq.cast(dtype)) - - # higher acc using float32 - t = paddle.arange(max_position_embeddings, dtype="float32") - freqs = paddle.einsum("i,j->ij", t, inv_freq.cast("float32")) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = paddle.concat([freqs, freqs], axis=-1) - - # [bs, seqlen, nhead, head_dim] - self.cos_cached = emb.cos() # [None, :, None, :] # .astype(dtype) - self.sin_cached = emb.sin() # [None, :, None, :] # .astype(dtype) - - self._cast_to_low_precision = False # 兼容develop分支paddle - self._cast_to_low_precison = False - - def forward(self, x, seq_len=None): - - return ( - self.cos_cached[:seq_len, :], - self.sin_cached[:seq_len, :], - ) - - @classmethod - def rotate_half(cls, x): - """Rotates half the hidden dims of the input.""" - - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return paddle.concat([-x2, x1], axis=-1) - - @classmethod - def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, position_ids=None): - """doc""" - if position_ids is not None: - # logger.info(f'applying pos:{position_ids}') - assert offset == 0, offset - cos = F.embedding(position_ids, cos) - sin = F.embedding(position_ids, sin) - else: - cos = cos.unsqueeze(0) - sin = sin.unsqueeze(0) - cos = cos[:, offset : q.shape[1] + offset, None, :] - sin = sin[:, offset : q.shape[1] + offset, None, :] - - q_embed = paddle.add( - paddle.multiply(q, cos), paddle.multiply(cls.rotate_half(q), sin) - ) - k_embed = paddle.add( - paddle.multiply(k, cos), paddle.multiply(cls.rotate_half(k), sin) - ) - q_embed = q_embed.astype(q.dtype) # fp32->bf16 - k_embed = k_embed.astype(k.dtype) - return q_embed, k_embed - - -class RopeEmbeddingLegacy(nn.Layer): - - def __init__(self, head_dim, compression_ratio=1.0, base=10000): 
- super().__init__() - self.head_dim = head_dim - self.compression_ratio = compression_ratio - self.base = base - - def forward(self, seq_length, position_ids=None): - - indices = paddle.arange(0, self.head_dim, 2, dtype="float32") - indices = 1 / self.base ** (indices / self.head_dim) - if position_ids is None: - position_ids = paddle.arange(0, seq_length, 1, dtype="float32").unsqueeze(1) - position_ids = position_ids / self.compression_ratio - sinusoid_inp = position_ids * indices.unsqueeze(0) - else: - position_ids = position_ids / self.compression_ratio - seq_length = position_ids.shape[-1] - sinusoid_inp = position_ids.unsqueeze(-1).astype( - "float32" - ) * indices.unsqueeze( - 0 - ) # [b, s, 1] * [1, d/2] -> [b, s, d/2] - pos_emb = paddle.concat( - [paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1 - ) - pos_emb = paddle.reshape(pos_emb, (-1, 1, seq_length, self.head_dim)) - pos_emb.stop_gradient = True - return pos_emb - - def apply_rotary(self, rp, q, k): - - # sin [sequence_length, embed_size_per_head//2] - # cos [sequence_length, embed_size_per_head//2] - sin, cos = paddle.chunk(rp, 2, axis=-1) - # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] - sin_pos = paddle.reshape(paddle.stack([sin, sin], axis=-1), rp.shape) - # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] - cos_pos = paddle.reshape(paddle.stack([cos, cos], axis=-1), rp.shape) - # rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2] - rotate_half_q = paddle.reshape( - paddle.stack([-q[:, :, :, 1::2], q[:, :, :, 0::2]], axis=-1), - paddle.shape(q), - ) - query = paddle.add( - paddle.multiply(q.astype("float32"), cos_pos), - paddle.multiply(rotate_half_q.astype("float32"), sin_pos), - ) - # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2] - rotate_half_k = paddle.reshape( - paddle.stack([-k[:, :, :, 1::2], k[:, :, :, 0::2]], axis=-1), - paddle.shape(k), - ) - key = paddle.add( - paddle.multiply(k.astype("float32"), 
cos_pos), - paddle.multiply(rotate_half_k.astype("float32"), sin_pos), - ) - return query, key - - def forward_single(self, position_ids): - - batch_size, seq_length = position_ids.shape[:2] - rope_emb = paddle.zeros( - (2, batch_size, seq_length, 1, self.head_dim), dtype="float32" - ) - inv_freq = self.base ** ( - -paddle.arange(0, self.head_dim, 2, dtype="float32") / self.head_dim - ) - position_ids = position_ids.cast("float32") - position_ids = position_ids / self.compression_ratio - # shape: [B, S, D/2] - freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) - # shape: [B, S, D] - emb = paddle.stack([freqs, freqs], axis=-1).reshape( - (batch_size, seq_length, self.head_dim) - ) - # shape: [B, S, 1, D] - emb = paddle.unsqueeze(emb, 2) - - rope_emb[0] = paddle.cos(emb) - rope_emb[1] = paddle.sin(emb) - return rope_emb - - @staticmethod - def apply_rotary_single(x, rope_emb): - - rotate_half_x = paddle.reshape( - paddle.stack([-x[:, :, :, 1::2], x[:, :, :, 0::2]], axis=-1), - paddle.shape(x), - ) - return x * rope_emb[0] + rotate_half_x * rope_emb[1] - - -class ErnieLinear(nn.Layer): - - def __init__( - self, - in_features, - out_features, - weight_attr=None, - bias_attr=None, - name=None, - ipp=0, - ): - super(ErnieLinear, self).__init__() - self._dtype = self._helper.get_default_dtype() - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self.weight = self.create_parameter( - shape=[in_features, out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) - self.bias = self.create_parameter( - shape=[out_features], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True, - ) - self.name = name - self.ipp = ipp - - def forward(self, input): - - out = F.linear(x=input, weight=self.weight, bias=None, name=self.name) - out = dist.reshard( - out, - get_mesh(self.ipp), - [dist.Shard(1), dist.Shard(0)], - ) - if self.bias: - out += self.bias - return out + return fast_ln(hidden_states, self.weight, 
self.bias, self._epsilon)[0] class ErnieMLP(nn.Layer): @@ -1028,25 +510,16 @@ def __init__(self, config, ipp=None, do_shard_tensor=True): self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size - LinearFN = nn.Linear - self.gate_proj = LinearFN( + self.gate_proj = nn.Linear( self.hidden_size, self.intermediate_size, bias_attr=config.use_bias ) - self.up_proj = LinearFN( + self.up_proj = nn.Linear( self.hidden_size, self.intermediate_size, bias_attr=config.use_bias ) - if config.sequence_parallel: - self.down_proj = ErnieLinear( - self.intermediate_size, - self.hidden_size, - bias_attr=config.use_bias, - ipp=self.ipp, - ) - else: - self.down_proj = LinearFN( - self.intermediate_size, self.hidden_size, bias_attr=config.use_bias - ) + self.down_proj = nn.Linear( + self.intermediate_size, self.hidden_size, bias_attr=config.use_bias + ) if do_shard_tensor and ( self.config.tensor_parallel_degree > 1 @@ -1095,7 +568,10 @@ def forward(self, x): x = fused_swiglu(self.gate_proj(x), self.up_proj(x)) else: x = F.silu(self.gate_proj(x)) * self.up_proj(x) - return self.down_proj(x) + out = self.down_proj(x) + if self.config.sequence_parallel: + out = dist.reshard(out, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(0)]) + return out class ErnieAttentionAuto(nn.Layer): @@ -1127,36 +603,27 @@ def __init__(self, config, ipp: Optional[int] = None): self.hidden_size // self.num_heads * self.num_key_value_heads ) - LinearFN = nn.Linear - self.q_proj = LinearFN( + self.q_proj = nn.Linear( self.hidden_size, self.hidden_size, bias_attr=config.use_bias, ) - self.k_proj = LinearFN( + self.k_proj = nn.Linear( self.hidden_size, self.hidden_size if not self.is_gqa else kv_hidden_size, bias_attr=config.use_bias, ) - self.v_proj = LinearFN( + self.v_proj = nn.Linear( self.hidden_size, self.hidden_size if not self.is_gqa else kv_hidden_size, bias_attr=config.use_bias, ) - if config.sequence_parallel: - self.o_proj = ErnieLinear( - self.hidden_size, - 
self.hidden_size, - bias_attr=config.use_bias, - ipp=self.ipp, - ) - else: - self.o_proj = LinearFN( - self.hidden_size, - self.hidden_size, - bias_attr=config.use_bias, - ) + self.o_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + bias_attr=config.use_bias, + ) self.config = config @@ -1287,6 +754,10 @@ def forward( attn_output = paddle.transpose(attn_output, [1, 0, 2]) attn_output = self.o_proj(attn_output) + if self.config.sequence_parallel: + attn_output = dist.reshard( + attn_output, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(0)] + ) if not output_attentions: attn_weights = None @@ -1404,9 +875,7 @@ def redistribute_expert(self, mesh, placements): self.gate_proj.weight = dist.shard_tensor( self.gate_proj.weight, mesh, placements ) - # self.gate_proj.bias = dist.shard_tensor(self.gate_proj.bias, mesh, placements) self.up_proj.weight = dist.shard_tensor(self.up_proj.weight, mesh, placements) - # self.up_proj.bias = dist.shard_tensor(self.up_proj.bias, mesh, placements) self.down_proj.weight = dist.shard_tensor( self.down_proj.weight, mesh, placements ) @@ -1453,50 +922,6 @@ def forward(self, x): return paddle.bmm(x, self.weight) -class ErnieMoeMLPFused(nn.Layer): - """Fused Implement of ErnieMoeMLP""" - - def __init__(self, config): - - assert ( - hasattr(config, "disable_ffn_model_parallel") - or config.tensor_parallel_degree == 1 - ), f"fused mlp only suport mp-moe, mp={config.tensor_parallel_degree}" - assert config.fuse_attn_ffn, "fused mlp only support fuse_attn_ffn" - super().__init__() - self.moe_dropout_prob = config.moe_dropout_prob - self.num_local_experts = config.moe_num_experts // config.moe_world_size - logger.info( - f"fused-expert-weight-shape: {[self.num_local_experts, config.hidden_size, config.intermediate_size]}" - ) - - self.up_gate_proj = BMMLinear( - self.num_local_experts, config.hidden_size, config.intermediate_size * 2 - ) - self.down_proj = BMMLinear( - self.num_local_experts, config.intermediate_size, config.hidden_size 
- ) - self.fuse_swiglu = config.fuse_swiglu - if self.fuse_swiglu: - assert fused_swiglu is not None, "fused_swiglu operator is not found." - - def __len__(self): - return self.num_local_experts - - def __iter__(self): - return (self for _ in range(1)) - - def forward(self, x): - """x""" - if self.fuse_swiglu: - x = fused_swiglu(self.up_gate_proj(x)) - else: - gate, x = self.up_gate_proj(x).chunk(2, axis=-1) - x = F.silu(gate) * x - x = self.down_proj(x) - return x - - class ErnieDecoderLayerAuto(nn.Layer): """ ErnieDecoderLayerAuto is a decoder layer in Ernie model. @@ -1540,11 +965,23 @@ def __init__(self, config, layer_idx=0, ipp=0): self.create_moe_mlp_layer(layer_idx, ipp) else: self.mlp = ErnieMLP(config, ipp) - Norm = RMSNorm if config.use_rmsnorm else LayerNorm - if not config.use_rmsnorm and config.fuse_ln: - Norm = FusedLayerNorm - self.input_layernorm = Norm(config, ipp) - self.post_attention_layernorm = Norm(config, ipp) + if config.use_rmsnorm: + Norm = RMSNorm(config) + elif config.use_fast_ln: + Norm = FastLayerNorm(config) + else: + Norm = nn.LayerNorm(config.hidden_size, epsilon=config.rms_norm_eps) + if config.pipeline_parallel_degree > 1: + Norm.weight = dist.shard_tensor( + Norm.weight, get_mesh(ipp), [dist.Replicate(), dist.Replicate()] + ) + if hasattr(Norm, "bias"): + Norm.bias = dist.shard_tensor( + Norm.bias, get_mesh(ipp), [dist.Replicate(), dist.Replicate()] + ) + + self.input_layernorm = Norm + self.post_attention_layernorm = Norm self.residual_add1 = FusedDropoutImpl( config.hidden_dropout_prob, mode="upscale_in_train" ) @@ -1597,9 +1034,7 @@ def create_moe_mlp_layer(self, layer_idx, ipp): fc = [(_ex_cfg.moe_num_experts, fc_cls(_ex_cfg))] else: fc = [(_ex_cfg.moe_num_experts, fc_cls(_ex_cfg))] - gate, experts, lm_gate, lm_experts = get_gate( - self.config, fc, layer_idx, self.ipp - ) + gate, experts = get_gate(self.config, fc, layer_idx, self.ipp) _sh_cfg = deepcopy(self.config) if _sh_cfg.moe_num_shared_experts > 0: @@ -1994,13 +1429,9 
@@ def __init__(self, config: ErnieMoEConfig): ) config.disable_ffn_model_parallel = True - config.moe_group = _parse_moe_group(config.moe_group) - if config.moe_group in fleet.auto.get_mesh().dim_names: - config.moe_world_size = fleet.auto.get_mesh().get_dim_size( - config.moe_group - ) - if config.moe_world_size < 0: - config.moe_world_size = 1 + mesh = fleet.auto.get_mesh() + if config.moe_group in mesh.dim_names: + config.moe_world_size = max(1, mesh.get_dim_size(config.moe_group)) else: config.moe_world_size = 1 @@ -2019,18 +1450,17 @@ def __init__(self, config: ErnieMoEConfig): self.config.tensor_parallel_degree > 1 or self.config.pipeline_parallel_degree > 1 ): - if not in_auto_parallel_align_mode(): - self.embed_tokens.weight = dist.shard_tensor( - self.embed_tokens.weight, - get_mesh(), - [dist.Replicate(), dist.Shard(1)], - ) + self.embed_tokens.weight = dist.shard_tensor( + self.embed_tokens.weight, + get_mesh(), + [dist.Replicate(), dist.Shard(1)], + ) layers_list = [] def get_layer_pp_info(ipp): mesh = fleet.auto.get_mesh() - if is_pp_enable() is False: + if "pp" in mesh.dim_names: return None, False else: pp_degree = mesh.get_dim_size("pp") @@ -2054,10 +1484,14 @@ def get_layer_pp_info(ipp): if input_need_reshard: self.next_pp_stage_indexes.append(layer_idx) self.layers = nn.LayerList(layers_list) - Norm = RMSNorm if config.use_rmsnorm else LayerNorm - if not config.use_rmsnorm and config.fuse_ln: - Norm = FusedLayerNorm - self.norm = Norm(config, -1) + if config.use_rmsnorm: + Norm = RMSNorm(config) + elif config.use_fast_ln: + Norm = FastLayerNorm(config) + else: + Norm = nn.LayerNorm(config.hidden_size, epsilon=config.rms_norm_eps) + + self.norm = Norm self.gradient_checkpointing = False @@ -2099,7 +1533,7 @@ def _prepare_decoder_attention_mask( ) combined_attention_mask = paddle.maximum( combined_attention_mask.astype(dtype), - paddle.to_tensor(float(finfo(dtype).min), dtype=dtype), + paddle.to_tensor(float(paddle.finfo(dtype).min), dtype=dtype), 
) return combined_attention_mask @@ -2258,11 +1692,7 @@ def forward( has_gradient = not hidden_states.stop_gradient ipp = decoder_layer.ipp - if not is_pp_enable(): - position_ids_input = position_ids - attention_mask_input = attention_mask - token_type_ids_input = token_type_ids - else: + if "pp" in fleet.auto.get_mesh().dim_names: if position_ids is not None: position_ids_input = dist.reshard( position_ids, @@ -2289,6 +1719,10 @@ def forward( if token_type_ids is not None else None ) + else: + position_ids_input = position_ids + attention_mask_input = attention_mask + token_type_ids_input = token_type_ids if idx in self.next_pp_stage_indexes: hidden_states = dist.reshard( @@ -2563,8 +1997,6 @@ def forward(self, prediction_scores, masked_lm_labels, router_loss=None): [dist.Replicate() for _ in range(len(global_mesh._shape))], ) loss = loss + router_loss - router_loss.detach() - # if isinstance(router_loss, paddle.Tensor): - # global_training_logs.update(router_loss=router_loss.detach()) return loss, loss_sum @@ -2687,30 +2119,13 @@ def __init__(self, config): config.tensor_parallel_degree > 1 ), f"sequence-parallel needs mp>1, got mp={config.tensor_parallel_degree}" - # initialize-trick for big model, see - # https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/README.md#std-init - new_initializer_range = math.sqrt(0.3333 / config.hidden_size) - logger.info( - f"change initializer-range from {config.initializer_range} to {new_initializer_range}" - ) - config.initializer_range = new_initializer_range + config.initializer_range = math.sqrt(0.3333 / config.hidden_size) self.config = config self.ernie = ErnieModelAuto(config) self.lm_head = ErnieLMHead(config) self.criterion = ErniePretrainingCriterion(config) - self.tie_weights() # maybe weight share - - if self.config.use_rmsnorm: - if self.config.fuse_rms_norm: - logger.info("Use fusedRMSNorm") - else: - logger.info("Use normal RMSNorm") - else: - if self.config.fuse_ln: - 
logger.info("Use fusedLN") - else: - logger.info("Use normal LayerNorm") + self.tie_weights() def _post_init(self, original_init, *args, **kwargs): """ diff --git a/examples/pre-training/models/ernie/modeling_auto_pp.py b/examples/pre-training/models/ernie/modeling_auto_pp.py index 56e66aba..035f13d7 100644 --- a/examples/pre-training/models/ernie/modeling_auto_pp.py +++ b/examples/pre-training/models/ernie/modeling_auto_pp.py @@ -33,13 +33,12 @@ from models.moe.moe_utils_auto import get_mesh +from models.ernie.modeling import RMSNorm + from .modeling_auto import ( - _parse_moe_group, ErnieDecoderLayerAuto, ErniePretrainedModelAuto, - LayerNorm, - RMSNorm, - FusedLayerNorm, + FastLayerNorm, ErniePretrainingCriterion, ErnieLMHead, ) @@ -215,14 +214,13 @@ def __init__(self, config, layer_idx=0, ipp=0): None. """ if hasattr(config, "use_moe") and config.use_moe: - if config.moe_group in {"mp", "model", "tp", "mpdp"}: + if config.moe_group.lower() in {"mp", "model", "tp", "mpdp"}: assert config.sequence_parallel logger.info( f"disable FFN tensor model parallel, moe-group={config.moe_group}" ) config.disable_ffn_model_parallel = True - config.moe_group = _parse_moe_group(config.moe_group) if config.moe_group in fleet.auto.get_mesh().dim_names: config.moe_world_size = fleet.auto.get_mesh().get_dim_size( config.moe_group @@ -236,14 +234,13 @@ def __init__(self, config, layer_idx=0, ipp=0): self.config = config if hasattr(config, "use_moe") and config.use_moe: - if config.moe_group in {"mp", "model", "tp", "mpdp"}: + if config.moe_group.lower() in {"mp", "model", "tp", "mpdp"}: assert config.sequence_parallel logger.info( f"disable FFN tensor model parallel, moe-group={config.moe_group}" ) config.disable_ffn_model_parallel = True - config.moe_group = _parse_moe_group(config.moe_group) if config.moe_group in fleet.auto.get_mesh().dim_names: config.moe_world_size = fleet.auto.get_mesh().get_dim_size( config.moe_group @@ -282,11 +279,14 @@ def __init__(self, config, 
layer_idx=0, ipp=0): ) self.layer = ErnieDecoderLayerAuto(config, layer_idx, ipp) - Norm = RMSNorm if config.use_rmsnorm else LayerNorm - if not config.use_rmsnorm and config.fuse_ln: - Norm = FusedLayerNorm + if config.use_rmsnorm: + Norm = RMSNorm(config) + elif config.use_fast_ln: + Norm = FastLayerNorm(config) + else: + Norm = nn.LayerNorm(config.hidden_size, epsilon=config.rms_norm_eps) if self.layer_idx == self.config.num_hidden_layers - 1: - self.norm = Norm(config, -1) + self.norm = Norm self.lm_head = ErnieLMHead(config) def recompute_training( @@ -575,8 +575,8 @@ def __init__(self, config): else: logger.info("Use normal RMSNorm") else: - if self.config.fuse_ln: - logger.info("Use fusedLN") + if self.config.use_fast_ln: + logger.info("Use FastLN") else: logger.info("Use normal LayerNorm") From b82d6d1eea2efa3db23667b5e6f87f191262c7f3 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Fri, 15 Aug 2025 14:31:55 +0800 Subject: [PATCH 06/15] code organization --- .../ernie/src/callbacks/__init__.py | 2 - .../callbacks/adaptivegradclip_callback.py | 2 - .../progressive_batching_callback.py | 70 - .../ernie/src/datasets/dist_data_loader.py | 20 +- .../ernie/src/trainers/pretraining_trainer.py | 2 +- .../pre-training/models/aadiff_decorator.py | 63 - .../models/ernie/modeling_auto.py | 105 +- .../models/ernie/modeling_auto_pp.py | 19 - .../models/ernie_moe/configuration.py | 13 +- examples/pre-training/models/moe/moe_layer.py | 1 - .../pre-training/models/moe/moe_layer_auto.py | 1426 +++++++++++++--- .../models/moe/moe_layer_auto_utils.py | 1454 ----------------- examples/pre-training/models/moe/moe_utils.py | 229 --- .../pre-training/models/moe/top2_gate_auto.py | 966 ++++++++++- .../models/moe/top2_gate_auto_auto.py | 1036 ------------ .../models/sequence_parallel_utils_auto.py | 116 -- 16 files changed, 2204 insertions(+), 3320 deletions(-) delete mode 100644 examples/pre-training/ernie/src/callbacks/progressive_batching_callback.py delete mode 100644 
examples/pre-training/models/aadiff_decorator.py delete mode 100644 examples/pre-training/models/moe/moe_layer_auto_utils.py delete mode 100644 examples/pre-training/models/moe/moe_utils.py delete mode 100644 examples/pre-training/models/moe/top2_gate_auto_auto.py diff --git a/examples/pre-training/ernie/src/callbacks/__init__.py b/examples/pre-training/ernie/src/callbacks/__init__.py index b63bf18a..3b1384e1 100644 --- a/examples/pre-training/ernie/src/callbacks/__init__.py +++ b/examples/pre-training/ernie/src/callbacks/__init__.py @@ -14,7 +14,6 @@ from .gc_callback import GCCallback from .logging_callback import LoggingCallback -from .progressive_batching_callback import ProgreesiveBatchingCallback from .stopper_callback import StopperCallback from .adaptivegradclip_callback import ClipGradByAdaptiveNormCallback from .moe_correction_bias_adjust_callback import MoECorrectionBiasAdjustCallback @@ -36,5 +35,4 @@ "OrthogonalCallback", "ClipGradByAdaptiveNormCallback", "StopperCallback", - "ProgreesiveBatchingCallback", ] diff --git a/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py b/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py index f05e4500..00188856 100644 --- a/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py +++ b/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py @@ -51,7 +51,6 @@ def on_train_begin(self, args, state, control, **kwargs): resume_from_checkpoint = ( None if not args.resume_from_checkpoint else args.resume_from_checkpoint ) - # Load potential model checkpoint if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: resume_from_checkpoint = get_last_checkpoint(args.output_dir) if resume_from_checkpoint is None: @@ -62,7 +61,6 @@ def on_train_begin(self, args, state, control, **kwargs): if resume_from_checkpoint is None: return - # if use distributed training if args.world_size > 1: process_index = args.process_index path = os.path.join( diff 
--git a/examples/pre-training/ernie/src/callbacks/progressive_batching_callback.py b/examples/pre-training/ernie/src/callbacks/progressive_batching_callback.py deleted file mode 100644 index 79de8beb..00000000 --- a/examples/pre-training/ernie/src/callbacks/progressive_batching_callback.py +++ /dev/null @@ -1,70 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import numpy as np -from paddleformers.trainer.trainer_callback import TrainerCallback - -logger = logging.getLogger(__name__) - - -def progressive_accumulate_steps( - acc_step_begin, acc_step_end, warmup_global_steps, increment, step -): - - assert step >= 0, step - if step >= warmup_global_steps: - return acc_step_end - slope = (acc_step_end - acc_step_begin) / warmup_global_steps - acc_steps = int(slope * step + acc_step_begin) - acc_steps = int(np.ceil(acc_steps / increment) * increment) - return acc_steps - - -class ProgreesiveBatchingCallback(TrainerCallback): - def __init__(self, acc_step_bigin, acc_step_end, warmup_global_steps, increment): - self.acc_step_bigin = acc_step_bigin - self.acc_step_end = acc_step_end - self.warmup_global_steps = warmup_global_steps - self.increment = increment - - def on_train_begin(self, args, state, control, **kwargs): - new_acc_step = progressive_accumulate_steps( - self.acc_step_bigin, - self.acc_step_end, - self.warmup_global_steps, - self.increment, - 
state.global_step, - ) - if new_acc_step != args.gradient_accumulation_steps: - logger.info( - f"updating acc_step{args.gradient_accumulation_steps}->{new_acc_step}, global_step={state.global_step}" - ) - args.gradient_accumulation_steps = new_acc_step - - def on_step_end(self, args, state, control, **kwargs): - new_acc_step = progressive_accumulate_steps( - self.acc_step_bigin, - self.acc_step_end, - self.warmup_global_steps, - self.increment, - state.global_step, - ) - if new_acc_step != args.gradient_accumulation_steps: - logger.info( - f"updating acc_step{args.gradient_accumulation_steps}->{new_acc_step}, global_step={state.global_step}" - ) - args.gradient_accumulation_steps = new_acc_step diff --git a/examples/pre-training/ernie/src/datasets/dist_data_loader.py b/examples/pre-training/ernie/src/datasets/dist_data_loader.py index 1dbd0bf4..0d79250c 100644 --- a/examples/pre-training/ernie/src/datasets/dist_data_loader.py +++ b/examples/pre-training/ernie/src/datasets/dist_data_loader.py @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -DistDataLoader is a wrapper of paddle.io.DataLoader. -It is used to support hybrid parallelism. -It can replace paddle.io.DataLoader in most cases. -""" import logging import hashlib from collections import deque @@ -63,9 +58,6 @@ def __len__(self): class DistDataLoader(paddle.io.DataLoader): - """ - DistDataLoader is a wrapper of paddle.io.DataLoader. 
- """ def __init__( self, @@ -102,13 +94,11 @@ def __init__( self.need_magic_trans = need_magic_trans self._hcg = fleet.get_hybrid_communicate_group() - # init pp data comm group if self._hcg.get_pipe_parallel_world_size() > 1 and pp_broadcast: self._pp_data_group = self._init_dataloader_comm_group() else: self._pp_data_group = None - # tensor parallel message self.mp_rank = self._hcg.get_model_parallel_rank() self.mp_group = self._hcg.get_model_parallel_group() self.mp_src_rank = self._hcg.get_model_parallel_group_src_rank() @@ -248,7 +238,6 @@ def __next__(self): ) get_timers() and get_timers()("read-raw-data").stop() - # broadcast data pp_broadcast = (self._pp_data_group is None) or self.pp_rank == 0 if self.mp_group is not None and self.mp_group.nranks > 1 and pp_broadcast: ( @@ -390,9 +379,9 @@ def broadcast_data_list(data_list, datatype, comm_rank=0, comm_group=None, src_r i += 1 + rank if comm_rank == 0: - assert ( - data.dtype == datatype - ), f"input has data type {data.dtype} which " f"is different than {datatype}" + assert data.dtype == datatype, ( + f"input has data type {data.dtype} which " f"is different than {datatype}" + ) data_b = paddle.concat( [d.to(get_env_device()).reshape([-1]) for d in data_list], 0 ) @@ -403,7 +392,6 @@ def broadcast_data_list(data_list, datatype, comm_rank=0, comm_group=None, src_r else: data_b = paddle.empty([numel], dtype=datatype).to(get_env_device()) - # Broadcast paddle.distributed.broadcast(data_b, src_rank, group=comm_group).wait() ret = [] @@ -557,7 +545,7 @@ def __next__(self): raise NotImplementedError has_images = paddle.full([data_world_size, 1], True, dtype="bool") if image_type_ids is None: - image_type_ids = paddle.zeros_like(token_type_ids) # padding for dy2st + image_type_ids = paddle.zeros_like(token_type_ids) input_list = [ input_ids, labels, diff --git a/examples/pre-training/ernie/src/trainers/pretraining_trainer.py b/examples/pre-training/ernie/src/trainers/pretraining_trainer.py index 
01f20a6e..9308c69a 100644 --- a/examples/pre-training/ernie/src/trainers/pretraining_trainer.py +++ b/examples/pre-training/ernie/src/trainers/pretraining_trainer.py @@ -493,7 +493,7 @@ def load_data_seq_from_cache(self): def gen_data_seq_weighted(self, num_examples, data_type=None): assert ( self.load_data_seq is False - ), "需要保证所有epoch的data_seq都从文件加载,否则下次删data_seq无法控住随机性" + ), "Ensure that the data_seq for all epochs is loaded from the file; otherwise, the randomness cannot be controlled when deleting data_seq next time." logger.info( f"generating data sequence... #non_consecutive_data_chunks={num_examples}," f" num_consecutive={self.num_consecutive}" diff --git a/examples/pre-training/models/aadiff_decorator.py b/examples/pre-training/models/aadiff_decorator.py deleted file mode 100644 index 64b7aa63..00000000 --- a/examples/pre-training/models/aadiff_decorator.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -The AADiff decorator. -""" -import os -import paddle -import decorator - - -def get_md5(tensors): - """ - Get MD5 of tensor, list of tensors or the combination of them. 
- """ - if tensors is None: - return None - elif isinstance(tensors, paddle.Tensor): - return tensors._md5sum() - elif isinstance(tensors, (list, tuple)): - return [get_md5(t) for t in tensors] - else: - raise ValueError(tensors) - - -def check_aadiff(ntimes=None): - """ - The AADiff decorator. - """ - if ntimes is None: - ntimes = int(os.getenv("AADIFF_TIMES", "0")) - - @decorator.decorator - def __impl__(_func, *args, **kwargs): - if ntimes > 0: - with paddle.no_grad(): - old_md5 = None - for idx in range(ntimes): - ret = _func(*args, **kwargs) - print("AADiff Pass {}/{} ...".format(idx, ntimes)) - cur_md5 = get_md5(ret) - del ret - if old_md5 is None: - old_md5 = cur_md5 - else: - assert old_md5 == cur_md5, "Rank {} has aadiff".format( - paddle.distributed.get_rank() - ) - - return _func(*args, **kwargs) - - return __impl__ diff --git a/examples/pre-training/models/ernie/modeling_auto.py b/examples/pre-training/models/ernie/modeling_auto.py index 78344fb2..475b6a32 100644 --- a/examples/pre-training/models/ernie/modeling_auto.py +++ b/examples/pre-training/models/ernie/modeling_auto.py @@ -34,12 +34,10 @@ from models.comm_utils import subbatch -from models.moe.top2_gate_auto_auto import Top2Gate +from models.moe.top2_gate_auto import Top2Gate from models.moe.top2_gate_auto import TopKGateFusedAuto -# from src/ops which is install in build_envs - from paddleformers.transformers.conversion_utils import ( StateDictNameMapping, init_name_mappings, @@ -74,12 +72,6 @@ from .configuration import ErnieMoEConfig from models.moe.moe_utils_auto import get_mesh -# Because param_name is generated based on the class name, -# when changes in distributed strategies result in class modifications, -# there may be mismatches during parameter loading. -# You can achieve class name changes by importing the following environment variables. 
-# Example: `export rowcol_parallel_linear_class_name_convert_map="tpsp->smp"` - @dataclass class BaseModelOutputWithPastAndCrossAttentions(_BaseModelOutput): @@ -206,7 +198,6 @@ def calc_lm_head_logits( not config.use_sparse_head_and_loss_fn ), "use_sparse_head_and_loss_fn is not supported now." - # do all gather hcg = paddle.distributed.fleet.get_hybrid_communicate_group() dp_rank = hcg.get_data_parallel_rank() sharding_rank = hcg.get_sharding_parallel_rank() @@ -222,7 +213,6 @@ def calc_lm_head_logits( get_mesh(-1), [dist.Shard(1), dist.Replicate()], ) - # [S, B, H] to [B, S, H] hidden_states = paddle.transpose(hidden_states, [1, 0, 2]) if not config.using_dynamic_sequence_length: hidden_states = hidden_states.reshape( @@ -359,7 +349,6 @@ def scaled_dot_product_attention( query_states = paddle.transpose(query_states, [0, 2, 1, 3]) / math.sqrt( head_dim ) - # merge with the next tranpose key_states = paddle.transpose(key_states, [0, 2, 1, 3]) value_states = paddle.transpose(value_states, [0, 2, 1, 3]) @@ -371,7 +360,6 @@ def scaled_dot_product_attention( f" {attn_weights.shape}" ) - # Pipeline 的Attention mask不能从外面传。 if attention_mask is None: attention_mask = get_triangle_upper_mask(attn_weights) @@ -399,7 +387,7 @@ def scaled_dot_product_attention( attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype( query_states.dtype ) - else: # use inplace operation to save memory + else: attn_weights = attn_weights.cast(paddle.float32) attention_mask = attention_mask.cast(paddle.float32) attn_weights = attn_weights.add_(attention_mask) @@ -583,7 +571,7 @@ def __init__(self, config, ipp: Optional[int] = None): self.num_heads = config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads self.head_dim = self.hidden_size // self.num_heads - self.use_recompute_attn = config.use_recompute_attn # aka recompute core-attn + self.use_recompute_attn = config.use_recompute_attn self.is_gqa = ( config.num_key_value_heads is not None and 
config.num_key_value_heads != self.num_heads @@ -678,40 +666,29 @@ def forward( use_cache: bool = False, inbatch_pack_offset: Optional[Tuple[paddle.Tensor]] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - """Input shape: Batch x Time x Channel""" if self.config.sequence_parallel: - # do all-gather hidden_states = dist.reshard( hidden_states, get_mesh(self.ipp), [dist.Shard(1), dist.Replicate()] ) - query_states = ( - self.q_proj(hidden_states).reshape( - shape=[0, 0, self.num_heads, self.head_dim] - ) - # .transpose([0, 2, 1, 3]) + query_states = self.q_proj(hidden_states).reshape( + shape=[0, 0, self.num_heads, self.head_dim] ) - key_states = ( - self.k_proj(hidden_states).reshape( - shape=[ - 0, - 0, - self.num_key_value_heads if self.is_gqa else self.num_heads, - self.head_dim, - ] - ) - # .transpose([0, 2, 1, 3]) + key_states = self.k_proj(hidden_states).reshape( + shape=[ + 0, + 0, + self.num_key_value_heads if self.is_gqa else self.num_heads, + self.head_dim, + ] ) - value_states = ( - self.v_proj(hidden_states).reshape( - shape=[ - 0, - 0, - self.num_key_value_heads if self.is_gqa else self.num_heads, - self.head_dim, - ] - ) - # .transpose([0, 2, 1, 3]) + value_states = self.v_proj(hidden_states).reshape( + shape=[ + 0, + 0, + self.num_key_value_heads if self.is_gqa else self.num_heads, + self.head_dim, + ] ) if self.config.sequence_parallel: @@ -802,9 +779,8 @@ def rope_attn( if offset > 0 or position_ids is not None or not self.fuse_rope: cos_sin = self.rotary_emb(kv_seq_len, position_ids).transpose( [0, 2, 1, 3] - ) # [b,h,s,d]->[b,s,h,d] + ) if offset > 0 and position_ids is None: - # position_ids has been sliced in prepare_inputs_for_generation cos_sin = cos_sin[:, offset:] query_states, key_states = self.rotary_emb.apply_rotary( cos_sin, query_states, key_states @@ -857,7 +833,6 @@ def __init__(self, config, ipp=0): config, "disable_ffn_model_parallel", False ) if disable_ffn_model_parallel: - # assert 
config.moe_group == "mp", f"when using mp_moe, expect moe-group == mp, but get {config.moe_group}" config = deepcopy(config) config.tensor_parallel_degree = 1 config.sequence_parallel = False @@ -1051,7 +1026,7 @@ def create_moe_mlp_layer(self, layer_idx, ipp): _sh_cfg.intermediate_size = ( _sh_cfg.intermediate_size * _sh_cfg.moe_num_shared_experts ) - _sh_cfg.disable_ffn_model_parallel = False # split shared epxert + _sh_cfg.disable_ffn_model_parallel = False shared_experts = ErnieMoeMLP(_sh_cfg, ipp) else: shared_experts = None @@ -1090,7 +1065,7 @@ def forward( use_cache: Optional[bool] = False, inbatch_pack_offset: Optional[paddle.Tensor] = None, token_type_ids: Optional[paddle.Tensor] = None, - output_gate_logits=True, # PP model should not output gate logits, + output_gate_logits=True, ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: """ Args: @@ -1109,7 +1084,6 @@ def forward( residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - # Self Attention (hidden_states, self_attn_weights, present_key_value, *router_loss_attn) = ( self.self_attn( hidden_states=hidden_states, @@ -1134,7 +1108,6 @@ def forward( else: hidden_states = self.residual_add1(hidden_states, residual) - # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) @@ -1148,7 +1121,6 @@ def forward( ) else: if self.config.sequence_parallel: - # do all-gather hidden_states = dist.reshard( hidden_states, get_mesh(self.ipp), @@ -1190,7 +1162,6 @@ def forward( if output_gate_logits: outputs += (gate_logits,) - # remove empty tuple for pipeline parallel if type(outputs) is tuple and len(outputs) == 1: outputs = outputs[0] return outputs @@ -1276,14 +1247,12 @@ def _get_tensor_parallel_mappings(cls, config, is_split=True): def get_tensor_parallel_split_mappings(num_layers): final_actions = {} base_actions = { - # Column Linear "layers.0.self_attn.q_proj.weight": partial(fn, is_column=True), 
"layers.0.self_attn.k_proj.weight": partial(fn, is_column=True), "layers.0.self_attn.v_proj.weight": partial(fn, is_column=True), "layers.0.mlp.gate_proj.weight": partial(fn, is_column=True), "layers.0.mlp.up_proj.weight": partial(fn, is_column=True), "lm_head.weight": partial(fn, is_column=not config.tie_word_embeddings), - # Row Linear "embed_tokens.weight": partial(fn, is_column=False), "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), @@ -1291,7 +1260,6 @@ def get_tensor_parallel_split_mappings(num_layers): if config.use_bias: base_actions.update( { - # Column Linear "layers.0.self_attn.q_proj.bias": partial(fn, is_column=True), "layers.0.self_attn.k_proj.bias": partial(fn, is_column=True), "layers.0.self_attn.v_proj.bias": partial(fn, is_column=True), @@ -1361,14 +1329,10 @@ def init_weights(self, layer): inv_freq = 1.0 / ( layer.base ** (np.arange(0, head_dim, 2).astype("float32") / head_dim) ) - # self.register_buffer("inv_freq", inv_freq.cast(dtype)) - # higher acc using float32 t = np.arange(layer.max_position_embeddings, dtype="float32") freqs = np.einsum("i,j->ij", t, inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = np.concatenate([freqs, freqs], axis=-1) - # [bs, seqlen, nhead, head_dim] cos_cached = np.cos(emb)[:, :] sin_cached = np.sin(emb)[:, :] layer.cos_cached.set_value(cos_cached) @@ -1495,8 +1459,6 @@ def get_layer_pp_info(ipp): self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.placements = ( [dist.Shard(1), dist.Shard(0)] if self.config.sequence_parallel @@ -1513,8 +1475,6 @@ def set_input_embeddings(self, value): def _prepare_decoder_attention_mask( cls, attention_mask, input_shape, past_key_values_length, dtype ): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] combined_attention_mask = None if input_shape[-1] > 1: 
combined_attention_mask = _make_causal_mask( @@ -1522,7 +1482,6 @@ def _prepare_decoder_attention_mask( ) if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] expanded_attn_mask = _expand_mask( attention_mask, dtype, tgt_length=input_shape[-1] ) @@ -1631,7 +1590,6 @@ def forward( global_mesh = global_mesh_starts_with_pp() if self.config.sequence_parallel: - # [B, S, H] -> [S, B, H] inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2]) if position_ids is not None: @@ -1670,7 +1628,6 @@ def forward( if self.config.tensor_parallel_degree > 1: hidden_states = dist.reshard(hidden_states, get_mesh(0), self.placements) - # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None next_decoder_cache = () if use_cache else None @@ -1937,7 +1894,7 @@ def forward_impl(self, prediction_scores, masked_lm_labels): masked_lm_loss = self.loss_impl(prediction_scores, masked_lm_labels) lossmask = masked_lm_labels != self.ignored_index - if (~lossmask).all(): # empty span + if (~lossmask).all(): logger.warning( f"encounter empty span when calculate loss, ignored_index={self.ignored_index}" ) @@ -1945,14 +1902,13 @@ def forward_impl(self, prediction_scores, masked_lm_labels): loss_sum = masked_lm_loss.sum().detach() else: lossmask_ = lossmask.reshape([-1]).cast(paddle.float32) - # 逐位对齐, 全精度聚合 masked_lm_loss_ = paddle.sum( masked_lm_loss.cast(paddle.float32).reshape([-1]) * lossmask_ ) loss = masked_lm_loss_ / lossmask_.sum() loss_sum = masked_lm_loss_.sum().detach() - if not self.return_tuple: # only used in pp + if not self.return_tuple: if self.training: return loss return loss_sum @@ -1982,7 +1938,6 @@ def forward(self, prediction_scores, masked_lm_labels, router_loss=None): loss, loss_sum = res else: loss, loss_sum = res, None - # global_training_logs.update(lm_loss=loss.clone().detach()) if router_loss is not None and not in_auto_parallel_align_mode(): global_mesh = 
global_mesh_starts_with_pp() if self.config.pipeline_parallel_degree > 1: @@ -2051,7 +2006,6 @@ def __init__(self, config): else: self.bias = None - # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = ( True if (vocab_size != config.vocab_size) else False ) @@ -2214,7 +2168,6 @@ def prepare_inputs_for_generation( attention_mask = kwargs.get("attention_mask", None) - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} else: @@ -2223,7 +2176,7 @@ def prepare_inputs_for_generation( model_inputs.update( { "past_key_values": past_key_values, - "use_cache": True, # use_cache, + "use_cache": True, "attention_mask": attention_mask, "return_dict": True, } @@ -2234,7 +2187,6 @@ def prepare_inputs_for_generation( def update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=False ): - # update cache if ( isinstance(outputs, tuple) and len(outputs) > 1 @@ -2248,7 +2200,6 @@ def update_model_kwargs_for_generation( ): model_kwargs["past_key_values"] = outputs.past_key_values - # update token_type_ids with last value if ( "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None @@ -2259,7 +2210,6 @@ def update_model_kwargs_for_generation( ) if not is_encoder_decoder: - # update attention mask if "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] model_kwargs["attention_mask"] = paddle.concat( @@ -2269,7 +2219,6 @@ def update_model_kwargs_for_generation( ], axis=-1, ) - # update role_ids if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.concat( @@ -2329,9 +2278,9 @@ def forward( logits = self.lm_head( hidden_states, - ) # tensor_parallel_output=tensor_parallel_output) + ) - if return_dict: # aka Generate Decoding + if return_dict: if labels is not None: 
loss, _ = self.criterion(logits, labels) else: diff --git a/examples/pre-training/models/ernie/modeling_auto_pp.py b/examples/pre-training/models/ernie/modeling_auto_pp.py index 035f13d7..e77caaa2 100644 --- a/examples/pre-training/models/ernie/modeling_auto_pp.py +++ b/examples/pre-training/models/ernie/modeling_auto_pp.py @@ -46,12 +46,6 @@ from paddle.distributed import in_auto_parallel_align_mode -# Because param_name is generated based on the class name, -# when changes in distributed strategies result in class modifications, -# there may be mismatches during parameter loading. -# You can achieve class name changes by importing the following environment variables. -# Example: `export rowcol_parallel_linear_class_name_convert_map="tpsp->smp"` - logger = logging.getLogger(__name__) try: @@ -142,13 +136,11 @@ def forward(self, *args, **kwargs): attention_mask = kwargs.get("attention_mask") position_ids = kwargs.get("position_ids") outputs = tuple([input_ids, attention_mask, position_ids]) - # decoder layers for idx, (decoder_layer) in enumerate(self.layers): outputs = decoder_layer(outputs) return outputs else: outputs = args - # decoder layers for idx, (decoder_layer) in enumerate(self.layers): outputs = decoder_layer(outputs) return outputs @@ -353,7 +345,6 @@ def forward(self, args): return_dict if return_dict is not None else self.config.use_return_dict ) - # retrieve input_ids and inputs_embeds if input_ids is not None: batch_size, seq_length = input_ids.shape else: @@ -372,14 +363,7 @@ def forward(self, args): ) if self.config.sequence_parallel: - # [B, S, H] -> [S, B, H] inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2]) - # if token_type_ids is not None: - # token_type_ids = token_type_ids.reshape([-1, 1]) - # token_type_ids = dist.reshard( - # token_type_ids, global_mesh, [dist.Replicate() for _ in range(len(global_mesh._shape))] - # ) - # token_type_ids = token_type_ids.reshape([-1]) global_mesh = global_mesh_starts_with_pp() if position_ids is 
not None: @@ -421,7 +405,6 @@ def forward(self, args): args = return_args(hidden_states, attention_mask, position_ids) - # decoder layers hidden_states, attention_mask, position_ids = parse_args(args) all_hidden_states = () if output_hidden_states else None @@ -559,8 +542,6 @@ def __init__(self, config): config.tensor_parallel_degree > 1 ), f"sequence-parallel needs mp>1, got mp={config.tensor_parallel_degree}" - # initialize-trick for big model, see - # https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/README.md#std-init new_initializer_range = math.sqrt(0.3333 / config.hidden_size) logger.info( f"change initializer-range from {config.initializer_range} to {new_initializer_range}" diff --git a/examples/pre-training/models/ernie_moe/configuration.py b/examples/pre-training/models/ernie_moe/configuration.py index fdebf8c6..fe37204f 100644 --- a/examples/pre-training/models/ernie_moe/configuration.py +++ b/examples/pre-training/models/ernie_moe/configuration.py @@ -115,7 +115,7 @@ def __init__( num_hidden_layers=2, num_attention_heads=2, head_dim=None, - initializer_range=0.02, # no use + initializer_range=0.02, rms_norm_eps=1e-6, use_cache=False, use_flash_attn=True, @@ -260,7 +260,7 @@ def __init__( self.use_recompute_dnd = use_recompute_dnd self.use_mp_gathered_weight = use_mp_gathered_weight - self.selective_no_recompute_num = selective_no_recompute_num # only PP + self.selective_no_recompute_num = selective_no_recompute_num self.refined_recompute = refined_recompute self.attention_probs_dropout_prob = attention_probs_dropout_prob @@ -285,7 +285,6 @@ def __init__( self.loss_subbatch_seqlen = loss_subbatch_seqlen self.gate_force_zero_padding_grad = gate_force_zero_padding_grad - # 默认的 fp8 设置 default_fp8_configs = { "quant_scheme": "DelayedScaling", "recipe": { @@ -320,7 +319,6 @@ def update_nested_dict(default_dict, update_dict): else: default_dict[key] = value - # 更新默认设置 update_nested_dict(default_fp8_configs, fp8_configs) 
self.fp8_configs = default_fp8_configs self.use_fp8 = use_fp8 @@ -593,10 +591,6 @@ def __init__( self.multi_token_pred_lambda = multi_token_pred_lambda self.enable_mtp_magic_send = enable_mtp_magic_send - # The insert_empty_layer is a list of integer which will be used under pipeline parallel. - # After each layer indicated in the insert_empty_layer, an empty layer will be inserted. - # For example, a model with 4 layers, insert_empty_layer = [1, 3], the model actually passed to - # pp is: transformer, transformer, EMPTY, transformer, transformer, EMPTY self.insert_empty_layer = insert_empty_layer # elastic @@ -664,9 +658,6 @@ def __init__( insert_empty_layer, list ), "pp_no_recompute_layer should be a list" - # Indicating layers not do recompute under pipeline parallel. - # Note that, when insert_empty_layer is not None, the pp_no_recompute_layer should be indicating - # layers number in origin model structure, AKA model before insert empty layers. self.pp_no_recompute_layer = pp_no_recompute_layer self.register_nonsaveable_keys("moe_group") self.register_nonsaveable_keys("pp_no_recompute_layer") diff --git a/examples/pre-training/models/moe/moe_layer.py b/examples/pre-training/models/moe/moe_layer.py index 498c577e..9aa8a2ce 100644 --- a/examples/pre-training/models/moe/moe_layer.py +++ b/examples/pre-training/models/moe/moe_layer.py @@ -200,7 +200,6 @@ def forward(ctx, x, combine_weights, scatter_index): @staticmethod def backward(ctx, grad_y, *_): - # assert moe_combine is not None grad_x, grad_combine_weight_helper = paddle._C_ops.moe_combine_grad( ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y ) diff --git a/examples/pre-training/models/moe/moe_layer_auto.py b/examples/pre-training/models/moe/moe_layer_auto.py index 0b7fc0cf..08d4456d 100644 --- a/examples/pre-training/models/moe/moe_layer_auto.py +++ b/examples/pre-training/models/moe/moe_layer_auto.py @@ -14,13 +14,10 @@ # See the License for the specific language governing permissions and # 
limitations under the License. -"""_summary_ -Returns: - _type_: _description_ -""" -from typing import Any, Tuple, List, Optional, Callable +from typing import Tuple, List, Optional import logging +import inspect from collections import namedtuple from contextlib import contextmanager from functools import partial @@ -34,21 +31,37 @@ from paddle.autograd import PyLayer from paddle.distributed.communication.group import Group +from paddle.distributed.fleet.utils import recompute from paddle.distributed import fleet import paddle.distributed as dist from paddle import Tensor from paddleformers.trainer.plugins.timer import get_timers +from models.moe.top2_gate_auto import TopKGateFusedAuto +from models.moe.top2_gate_auto import ( + TopKGateFused, +) +from models.sequence_parallel_utils_auto import ScatterOp +from models.utils import ( + global_training_logs_enabled, + manual_backward, +) -from models.moe.top2_gate_auto import TopKGateFusedAuto from models.moe.moe_utils_auto import get_flatten_mesh, get_mesh, _reshard -from models.moe.moe_layer_auto_utils import MOELayer +from paddle.incubate.nn.functional import ( + moe_combine, +) try: from src.utils.misc import global_training_logs except ModuleNotFoundError: - global_training_logs = {} + global_training_logs = {} + +try: + import moe_router_loss_ops +except ImportError: + moe_router_loss_ops = None logger = logging.getLogger(__name__) @@ -80,15 +93,19 @@ "`python3 src/ernie_core/ops/moe/setup_auto.py install` to install" ) +try: + from moe_combine import moe_combine_no_weight +except ImportError: + moe_combine_no_weight = None -GateOutput = namedtuple( - "GateOutput", - [ - "aux", - "z", - "logits", - ], -) +try: + import moe_ops_fp8 +except ImportError: + moe_ops_fp8 = None + logger.warning( + "`moe-ops` not found, run " + "`python3 src/ernie_core/ops/moe/setup_fp8.py install` to install" + ) @contextmanager @@ -101,108 +118,44 @@ def profile(name): get_timers()(name).stop() -class GateCombineForStatic(PyLayer): 
- """GateCombine""" - - @staticmethod - def forward(ctx, x, combine_weights, scatter_index): - """ - Input: - x: [seqlen * k, hidden_size] - combine_weights: [seqlen, k] - scatter_index: [seqlen, k] - Output: - y: [seqlen, hidden_size] - """ - ctx.save_for_backward(x, combine_weights, scatter_index) - assert moe_combine_auto is not None - return moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) - - @staticmethod - def backward(ctx, grad_y, *_): - """ - Input: - grad_y: [seqlen, hidden_size] - combine_weights: [seqlen, k] - scatter_index: [seqlen, k] - Output: - grad_x: [seqlen * k, hidden_size] - grad_combine_weight: [seqlen, k] - - """ - x, combine_weights, scatter_index = ctx.saved_tensor() - assert moe_combine_auto is not None - grad_x, grad_combine_weight_helper = moe_combine_auto.moe_combine_bwd_auto( - x, combine_weights, scatter_index, grad_y - ) - # grad_combine_weight_helper is the same shape with grad x [seqlen * K, dim] - # reduce the hidden shape - # TODO: implement reduce in cuda ops - grad_combine_weight = grad_combine_weight_helper.sum(-1) - # NOTE: PyLayer do not support some inputs with stop_gradient=True in static mode, - # this means that there must be a gradient for each input - scatter_index_grad = paddle.zeros_like(scatter_index) - return grad_x, grad_combine_weight, scatter_index_grad +GateOutput = namedtuple( + "GateOutput", + [ + "aux", + "z", + "logits", + ], +) class GateCombine(PyLayer): - """GateCombine""" @staticmethod def forward(ctx, x, combine_weights, scatter_index): - """ - Input: - x: [seqlen * k, hidden_size] - combine_weights: [seqlen, k] - scatter_index: [seqlen, k] - Output: - y: [seqlen, hidden_size] - """ ctx.x = x ctx.combine_weights = combine_weights ctx.scatter_index = scatter_index - assert moe_combine_auto is not None - return moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) + assert moe_combine is not None + ret = moe_combine.moe_combine(x, combine_weights, scatter_index) + 
return ret @staticmethod def backward(ctx, grad_y, *_): - """ - Input: - grad_y: [seqlen, hidden_size] - combine_weights: [seqlen, k] - scatter_index: [seqlen, k] - Output: - grad_x: [seqlen * k, hidden_size] - grad_combine_weight: [seqlen, k] - - """ - - assert moe_combine_auto is not None - grad_x, grad_combine_weight_helper = moe_combine_auto.moe_combine_bwd_auto( + assert moe_combine is not None + grad_x, grad_combine_weight_helper = moe_combine.moe_combine_bwd( ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y ) - # grad_combine_weight_helper is the same shape with grad x [seqlen * K, dim] - # reduce the hidden shape - # TODO: implement reduce in cuda ops + grad_combine_weight = grad_combine_weight_helper.sum(-1) return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None -def combining_fused_auto(x, combine_weights, scatter_index, hard_gate=False): - """ - Args: - x: Tensor[seq, dim] - combine_weights: [s, k] - scatter_index: ** [k, s] ** +def combining_fused(x, combine_weights, scatter_index, hard_gate=False): - Returns: - y: Tensor[s, dim] - """ if hard_gate: - x_gatherd = F.embedding(scatter_index, x) # [s,k,dim] + x_gatherd = F.embedding(scatter_index, x) return x_gatherd.squeeze(-2) - ret = moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) - + ret = GateCombine.apply(x, combine_weights, scatter_index) ret.stop_gradient = False return ret @@ -210,8 +163,6 @@ def combining_fused_auto(x, combine_weights, scatter_index, hard_gate=False): def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): output = None - # init_output = paddle.zeros([num_experts * capacity, x.shape[-1]], dtype='float32') - # output = init_output + 0. 
* x.sum() orig_dtype = x.dtype scatter_index = scatter_index.unbind(1) dispatch_mask = dispatch_mask.unbind(1) @@ -244,59 +195,170 @@ def combining(x, combine_weights, scatter_index): dim = x.shape[-1] scatter_index = scatter_index.reshape([-1]) num_k = combine_weights.shape[-1] - x = dist.reshard(x, get_mesh(0), [dist.Replicate(), dist.Shard(0)]) combine_weights = combine_weights.unsqueeze(1) - # num_k = 2 - x = paddle.gather(x, scatter_index).reshape([-1, num_k, dim]) # [seq,2,dim] - return paddle.matmul(combine_weights, x).squeeze( - 1 - ) # [seq,1,2] @ [seq,2,dim] -> [seq,1,dim] + x = paddle.gather(x, scatter_index).reshape([-1, num_k, dim]) + return paddle.matmul(combine_weights, x).squeeze(1) + + +class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer): + + @staticmethod + def forward( + ctx, x, gate_logtis, corr_bias, k, capacity, use_pad, use_pow2_scale=True + ): + ( + out_fp8, + scale, + combine_weights, + scatter_index, + expert_offset, + expert_id, + ) = moe_ops_fp8.moe_gate_dispatch_and_quant( + x, + gate_logtis, + corr_bias=corr_bias, + k=k, + capacity=capacity, + use_pad=use_pad, + use_pow2_scale=use_pow2_scale, + ) + assert out_fp8.shape[0] == scale.shape[0] + + out_fp8.stop_gradient = False + combine_weights.stop_gradient = False + scatter_index.stop_gradient = True + expert_offset.stop_gradient = True + expert_id.stop_gradient = True + scale.stop_gradient = True + + ctx.k = k + ctx.capacity = capacity + ctx.use_pad = use_pad + ctx.combine_weights = combine_weights + ctx.scatter_index = scatter_index + ctx.expert_id = expert_id + ctx.has_corr_bias = corr_bias is not None + + return ( + out_fp8, + combine_weights, + scatter_index, + expert_offset, + expert_id, + { + "scale": scale, + }, + ) + + @staticmethod + def backward(ctx, *grads): + out_grad, combine_weights_grad = grads[0], grads[1] + x_grad, gate_logits_grad = moe_ops.moe_gate_dispatch_bwd( + ctx.combine_weights, + ctx.scatter_index, + ctx.expert_id, + out_grad, + combine_weights_grad, + 
k=ctx.k, + capacity=ctx.capacity, + use_pad=ctx.use_pad, + ) + if ctx.has_corr_bias: + return x_grad, gate_logits_grad, None + else: + return x_grad, gate_logits_grad class AlltoAll(PyLayer): - """ - AlltoAll w/ backward - """ @staticmethod - def forward(ctx, x, group): - """ - All-to-all communication in the group. - """ + def forward(ctx, x, group, sync_op=True): + ctx.group = group if dist.get_world_size(group) <= 1: return x output = paddle.empty_like(x) output.stop_gradient = False - with profile("moe-all2all"): - stream.alltoall_single(output, x, None, None, group, True, True) - return output + task = stream.alltoall_single( + output, x, None, None, group, sync_op=sync_op, use_calc_stream=sync_op + ) + if not sync_op: + return output, task + else: + return output @staticmethod def backward(ctx, *dx): - """backward""" return AlltoAll.apply(*dx, group=ctx.group) +class AlltoAllExpertOverlap(PyLayer): + + @staticmethod + def forward( + ctx, input, group, num_local_experts, forward_func_dict, is_first_fwd=False + ): + assert ( + dist.get_world_size(group) > 1 + ), "AlltoAllExpertOverlap is not supported for a world size less than or equal to 1." 
+ + ctx.bw_funcs = {} + ctx.group = group + ctx.num_local_experts = num_local_experts + + assert isinstance(forward_func_dict, nn.LayerList) + all2all_tasks = [] + all2all_ins = paddle.unbind(input, axis=0) + for stage_id in range(1): + stage_input = all2all_ins[stage_id] + x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) + all2all_tasks.append((task, x_out)) + + expert_outputs = [] + for stage_id in range(num_local_experts): + if stage_id + 1 != num_local_experts: + stage_input = all2all_ins[stage_id + 1] + x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) + all2all_tasks.append((task, x_out)) + + task, dispatched_input = all2all_tasks[stage_id] + task.wait() + bwf, (expert_outputs_cur_stage,) = manual_backward( + forward_func_dict[stage_id], is_first_fwd, dispatched_input + ) + ctx.bw_funcs[stage_id] = bwf + expert_outputs.append(expert_outputs_cur_stage) + + expert_output = paddle.stack(expert_outputs, axis=1) + return expert_output + + @staticmethod + def backward(ctx, out_grad): + all2all_tasks = [] + expert_outputs = [] + + out_grad_list = paddle.split( + out_grad, num_or_sections=out_grad.shape[1], axis=1 + ) + for stage_id in range(ctx.num_local_experts): + (grad_cur_stage,) = ctx.bw_funcs[stage_id](out_grad_list[stage_id]) + + x_out, task = AlltoAll.apply(grad_cur_stage, group=ctx.group, sync_op=False) + all2all_tasks.append(task) + expert_outputs.append(x_out) + + for task in all2all_tasks: + task.wait() + + expert_output = paddle.stack(expert_outputs, axis=0) + return expert_output + + class AlltoAllAsync(PyLayer): - """ - AlltoAll async w/ backward - """ @staticmethod def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): - """ - All-to-all communication in the group. 
- Args: - x: Tensor - args: List[Any], argument(s) to `fn` - group: ProcessGroup - fn: callable, called while doing alltoall - is_first_fwd: if using recompute, don't record bacward when first forward - Returns: - x: Tensor - fn_out: List[Tensor] - """ + assert fn is not None, "use AlltoAll no async" ctx.group = group if dist.get_world_size(group) <= 1: @@ -304,42 +366,1038 @@ def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): return (x,) + fn_out x_out = paddle.empty_like(x) x_out.stop_gradient = False - with profile("moe-all2all"): - task = stream.alltoall_single( - x_out, - x, - None, - None, - group, - sync_op=False, - ) + task = stream.alltoall_single( + x_out, + x, + None, + None, + group, + sync_op=False, + ) ctx.bwf, fn_out = manual_backward(fn, is_first_fwd, *fn_args) task.wait() return (x_out,) + fn_out @staticmethod def backward(ctx, dx_out, *fn_out_grads): - """backward""" if dist.get_world_size(ctx.group) <= 1: fn_args_grads = ctx.bwf(*fn_out_grads) return (dx_out,) + fn_args_grads dx = paddle.empty_like(dx_out) dx.stop_gradient = False - with profile("moe-all2all"): - task = stream.alltoall_single( - dx, - dx_out, - None, - None, - ctx.group, - sync_op=False, - ) + task = stream.alltoall_single( + dx, + dx_out, + None, + None, + ctx.group, + sync_op=False, + ) fn_args_grads = ctx.bwf(*fn_out_grads) task.wait() return (dx,) + fn_args_grads +class MOELayer(nn.Layer): + + def __init__( + self, + gate: nn.Layer, + experts: List[nn.Layer], + layer_idx, + shared_experts: Optional[List[nn.Layer]] = None, + group: Group = None, + recompute=False, + enable_logging: bool = False, + k=2, + enable_bpr: bool = False, + all_to_all_dropout=0, + group_experts=False, + moe_statics=None, + ): + + super().__init__() + self.gate = gate + self.layer_idx = layer_idx + self.recompute = recompute + logger.info(f"using moe recompute={recompute}") + for p in self.gate.parameters(): + p.is_gate = True + if isinstance(experts, nn.LayerList): + self.experts = 
experts + else: + logger.info(f"using fused experts, type={type(experts)}") + self.experts = experts + self.shared_experts = shared_experts + + self.group = group + self.k = k + self.all_to_all_dropout = all_to_all_dropout + self.enable_logging = enable_logging + self.use_correction_bias = moe_statics is not None + self.moe_statics = moe_statics + if self.use_correction_bias: + logger.info( + f"using correction bias, aux-coef:{self.gate.config.moe_aux_loss_lambda}" + ) + assert self.gate.config.moe_use_aux_free + + self.is_mp_moe = ( + hasattr(fleet.fleet, "_hcg") + and group is fleet.get_hybrid_communicate_group().get_model_parallel_group() + ) + self.is_ep_moe = ( + hasattr(fleet.fleet, "_hcg") + and hasattr( + fleet.get_hybrid_communicate_group(), + "get_moe_sharding_parallel_world_size", + ) + and fleet.get_hybrid_communicate_group().get_moe_sharding_parallel_world_size() + > 0 + ) + is_dummy_moe = dist.get_world_size(group) == 1 + + for p in experts.parameters(): + p.expert = not (self.is_mp_moe or is_dummy_moe) + p.no_sync = not (self.is_mp_moe or is_dummy_moe) + logger.info(f"expert no-sync={p.no_sync}-{p.name}") + if self.is_mp_moe or self.is_ep_moe: + p.is_distributed = True + + expert_color = None + if self.is_ep_moe: + moe_grad_group = ( + fleet.get_hybrid_communicate_group().get_moe_sharding_parallel_group() + ) + expert_color = {"color": "moe_expert", "group": moe_grad_group} + elif ( + self.config.offline_quant_expert_weight + and self.config.clear_origin_weight_when_offline_quant + ): + expert_color = {"color": "moe_expert"} + + if expert_color is not None: + for p in self.experts.parameters(): + setattr(p, "color", expert_color) + + self.world_size = dist.get_world_size(self.group) + self.rank = dist.get_rank(self.group) + if self.world_size < 1: + self.world_size = 1 + if self.rank < 0: + self.rank = 0 + + self.num_local_experts = len(self.experts) + self.dispatch_by_task = ( + hasattr(self.gate, "dispatch_by_task") and self.gate.dispatch_by_task + 
) + + if self.dispatch_by_task: + assert 0, "no supported, checkout earylier code" + assert self.num_local_experts == 1 + + self.input_preprocess = self.output_postprocess = None + self.group_experts = group_experts + self.config = self.gate.config + self.zero = paddle.to_tensor(0, dtype=paddle.float32) + + self._rr_moe_gate_dispatch = None + self._rr_moe_combine = None + self.use_norm_gate_recompute = None + + if self.config.use_recompute and self.config.skip_recompute_ops.get( + "moe_gate_dispatch", False + ): + self._rr_moe_gate_dispatch = None + if self.config.use_recompute and self.config.skip_recompute_ops.get( + "moe_combine", False + ): + self._rr_moe_combine = None + if hasattr(fleet.fleet, "_hcg"): + hcg = fleet.get_hybrid_communicate_group() + if ( + hasattr(hcg, "get_moe_sharding_parallel_world_size") + and hcg.get_moe_sharding_parallel_world_size() > 0 + ): + moe_grad_group = hcg.get_moe_sharding_parallel_group() + for p in self.experts.parameters(): + setattr( + p, "color", {"color": "moe_expert", "group": moe_grad_group} + ) + + def forward_experts(self, dispatched_input): + + with profile("fwd-expert"): + dispatched_input = dispatched_input.reshape( + [ + self.world_size, + self.num_local_experts, + -1, + dispatched_input.shape[-1], + ] + ) + expert_outputs = [] + if isinstance(self.experts, nn.LayerList): + + chunks = dispatched_input.transpose([1, 0, 2, 3]).contiguous().unbind(0) + assert len(chunks) == len(self.experts), ( + len(chunks), + len(self.experts), + ) + for chunk, expert in zip(chunks, self.experts): + expert_outputs += [expert(chunk)] + + expert_output = paddle.stack(expert_outputs, axis=1) + + else: + dispatched_input = dispatched_input.transpose([1, 0, 2, 3]) + dispatched_input.contiguous() + orig_shape = dispatched_input.shape + chunks = dispatched_input.reshape([orig_shape[0], -1, orig_shape[-1]]) + chunks = self.experts(chunks) + chunks = chunks.reshape(orig_shape[:-1] + [chunks.shape[-1]]).unbind(0) + expert_outputs += chunks + 
expert_output = paddle.stack(expert_outputs, axis=1) + return expert_output + + def fused_gate_logits_process( + self, gate_logits, token_type_ids, offload_helper=None + ): + + k = self.k + experts_type_ids = self.gate.experts_type_ids + use_hard_gate = self.config.moe_use_hard_gate + max_prob = None + + if token_type_ids is not None and use_hard_gate: + if offload_helper is None: + offload_helper = dict() + lm_mask = token_type_ids == 0 + is_lm = lm_mask.any() + mm_mask = token_type_ids == 1 + is_mm = mm_mask.any() + seq_lm = lm_mask.sum() + seq_mm = mm_mask.sum() + lm_mask = lm_mask.unsqueeze(1) & (experts_type_ids == 0).unsqueeze(0) + mm_mask = mm_mask.unsqueeze(1) & (experts_type_ids == 1).unsqueeze(0) + offload_helper["lm_mask"] = [lm_mask, is_lm, seq_lm] + offload_helper["mm_mask"] = [mm_mask, is_mm, seq_mm] + + is_lm = offload_helper["lm_mask"][1] + prob = paddle.zeros_like(gate_logits) + if is_lm: + lm_mask = offload_helper["lm_mask"][0] + seq_lm_cpu = offload_helper["lm_mask"][2] + lm_mask_nonzero = lm_mask.nonzero() + lm_partial_gate_logits = gate_logits.gather_nd(lm_mask_nonzero).reshape( + [seq_lm_cpu, -1] + ) + if self.group_experts: + lm_prob = self.gate.act( + lm_partial_gate_logits.reshape( + [lm_partial_gate_logits.shape[0], k, -1] + ) + ) + max_prob = lm_prob.max(-1, keepdim=True) + lm_prob /= max_prob + else: + lm_prob = self.gate.act(lm_partial_gate_logits) + prob = paddle.scatter_nd_add(prob, lm_mask_nonzero, lm_prob.flatten()) + is_mm = offload_helper["mm_mask"][1] + if is_mm: + mm_mask = offload_helper["mm_mask"][0] + seq_mm_cpu = offload_helper["mm_mask"][2] + mm_mask_nonzero = paddle.nonzero(mm_mask) + mm_partial_gate_logits = gate_logits.gather_nd(mm_mask_nonzero).reshape( + [seq_mm_cpu, -1] + ) + mm_prob = self.gate.act(mm_partial_gate_logits) + prob = paddle.scatter_nd_add(prob, mm_mask_nonzero, mm_prob.flatten()) + else: + if self.group_experts: + prob = self.gate.act(gate_logits.reshape([gate_logits.shape[0], k, -1])) + max_prob = 
prob.max(-1, keepdim=True) + prob /= max_prob + prob = prob.reshape([prob.shape[0], -1]) + else: + prob = self.gate.act(gate_logits) + return prob, max_prob + + def gate_distpach_and_quant(self, input, token_type_ids): + + assert isinstance(self.gate, (TopKGateFused)), "Only fused gate is supported." + assert not self.config.use_ep_comm_overlap, "ep_comm_overlap is not supported" + assert ( + self._rr_moe_gate_dispatch is None + ), "rr_moe_gate_dispatch is not supported" + assert moe_ops_fp8 is not None + + args = () + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape([-1]) + args = (token_type_ids,) + + ( + gate_logits, + capacity, + router_loss, + ) = self.gate(input, *args) + + if self.config.moe_multimodal_paired_experts: + assert token_type_ids is not None + input = paddle.concat( + [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 + ) + if self.input_preprocess is not None: + input, gate_logits = self.input_preprocess(input, gate_logits, capacity) + + k = self.k + prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids) + + with profile("dispatch_op"): + corr_bias = ( + self.moe_statics.e_score_correction_bias[0].detach() + if self.use_correction_bias + else None + ) + + ( + dispatched_input, + combine_weights_unnorm, + scatter_index, + dispatch_mask, + _, + fp8_dispatched_handle, + ) = Fp8MoeGateDispatchAndQuant.apply( + input, prob, corr_bias, k=k, capacity=capacity, use_pad=True + ) + + dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0))) + if self.use_correction_bias: + if self.gate.config.multimodel_experts: + for i in range(len(self.moe_statics.expert_usage)): + self.moe_statics.expert_usage[i] += dispatch_mask[ + self.gate.experts_type_mask[i] + ].detach() + else: + self.moe_statics.expert_usage[0] += dispatch_mask.detach() + dispatched_input.stop_gradient = False + combine_weights_unnorm.stop_gradient = False + scatter_index.stop_gradient = True + dispatch_mask.stop_gradient = True + + 
scatter_index = scatter_index.transpose([1, 0]) + if self.group_experts: + if max_prob is not None: + if token_type_ids is not None: + p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1)) + p = paddle.scatter_nd_add( + p, paddle.nonzero(token_type_ids == 0), -1 + max_prob + ) + else: + p = max_prob + combine_weights_unnorm = ( + combine_weights_unnorm.unsqueeze(-1) * p + ).squeeze(-1) + prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape([p.shape[0], -1]) + if self.gate.norm_gate_logits: + combine_weights = combine_weights_unnorm / paddle.clip( + combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12 + ) + else: + combine_weights = combine_weights_unnorm + combine_weights = combine_weights.cast("bfloat16") + + def reshape_for_a2a(tensor): + return tensor.reshape( + [ + self.world_size * self.num_local_experts, + capacity, + -1, + ] + ) + + dispatched_input = reshape_for_a2a(dispatched_input) + fp8_dispatched_handle["scale"] = reshape_for_a2a(fp8_dispatched_handle["scale"]) + dispatch_mask.stop_gradient = True + scatter_index.stop_gradient = True + return ( + dispatched_input, + combine_weights, + dispatch_mask, + scatter_index, + router_loss, + gate_logits, + prob, + fp8_dispatched_handle, + ) + + def gate_and_distpach(self, input, token_type_ids): + + seqlen, d_model = input.shape + args = () + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape([-1]) + args = (token_type_ids,) + + use_fuse = isinstance(self.gate, (TopKGateFused)) + if use_fuse: + if self.use_norm_gate_recompute: + ( + gate_logits, + capacity, + router_loss, + norm_res, + ) = self.fused_norm_gate(input) + input = norm_res + else: + ( + gate_logits, + capacity, + router_loss, + ) = self.gate(input, *args) + else: + ( + capacity, + dispatch_mask, + combine_weights, + scatter_index, + router_loss, + gate_logits, + ) = self.gate( + input, + *args, + correction_bias=( + self.moe_statics.e_score_correction_bias[0] + if self.use_correction_bias + else None + ), + ) + prob = 
None + if self.config.moe_multimodal_paired_experts: + assert token_type_ids is not None + input = paddle.concat( + [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 + ) + if self.input_preprocess is not None: + input, gate_logits = self.input_preprocess(input, gate_logits, capacity) + if use_fuse: + k = self.k + prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids) + + assert moe_ops is not None + with profile("dispatch_op"): + if ( + "corr_bias" + in inspect.signature(moe_ops.moe_gate_dispatch).parameters + ): + if self.use_correction_bias: + compat_args = (self.moe_statics.e_score_correction_bias[0],) + else: + compat_args = (None,) + else: + assert ( + not self.use_correction_bias + ), "correction bias not supported, rebuild moe-ops" + compat_args = () + if not self.config.use_ep_comm_overlap: + if self._rr_moe_gate_dispatch is None: + ( + dispatched_input, + combine_weights_unnorm, + scatter_index, + dispatch_mask, + _, + ) = moe_ops.moe_gate_dispatch( + input, + prob, + *compat_args, + k=k, + capacity=capacity, + use_pad=True, + ) + else: + ( + dispatched_input, + combine_weights_unnorm, + scatter_index, + dispatch_mask, + _, + ) = self._rr_moe_gate_dispatch( + input, + prob, + compat_args, + k=k, + capacity=capacity, + use_pad=True, + ) + else: + ( + dispatched_input, + combine_weights_unnorm, + scatter_index, + dispatch_mask, + _, + ) = moe_ops.moe_gate_dispatch_permute( + input, + prob, + *compat_args, + k=k, + capacity=capacity, + world_size=self.group.nranks, + ) + dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0))) + if self.use_correction_bias and framework._dygraph_tracer()._has_grad: + if self.gate.config.multimodel_experts: + for i in range(len(self.moe_statics.expert_usage)): + self.moe_statics.expert_usage[i] += dispatch_mask[ + self.gate.experts_type_mask[i] + ].detach() + else: + self.moe_statics.expert_usage[0] += dispatch_mask.detach() + dispatched_input.stop_gradient = False + 
combine_weights_unnorm.stop_gradient = False + scatter_index.stop_gradient = True + dispatch_mask.stop_gradient = True + + scatter_index = scatter_index.transpose([1, 0]) + if self.group_experts: + if max_prob is not None: + if token_type_ids is not None: + p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1)) + p = paddle.scatter_nd_add( + p, paddle.nonzero(token_type_ids == 0), -1 + max_prob + ) + else: + p = max_prob + combine_weights_unnorm = ( + combine_weights_unnorm.unsqueeze(-1) * p + ).squeeze(-1) + prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape( + [p.shape[0], -1] + ) + if self.gate.norm_gate_logits: + combine_weights = combine_weights_unnorm / paddle.clip( + combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12 + ) + else: + combine_weights = combine_weights_unnorm + combine_weights = combine_weights.cast(dispatched_input.dtype) + else: + dispatched_input = dispatching( + input, + dispatch_mask, + scatter_index, + num_experts=self.world_size * self.num_local_experts, + capacity=capacity, + ) + if self.use_correction_bias and framework._dygraph_tracer()._has_grad: + usage = paddle.bincount( + scatter_index.reshape([-1]) // capacity, + minlength=self.world_size * self.num_local_experts, + ) + assert ( + not self.config.multimodel_experts + ), "correction bias not supported, use top2-fused gate" + self.moe_statics.expert_usage[0] += usage.detach() + if not self.config.use_ep_comm_overlap: + dispatched_input = dispatched_input.reshape( + [ + self.world_size * self.num_local_experts, + capacity, + ( + d_model + if not self.config.moe_multimodal_paired_experts + else d_model + 1 + ), + ] + ) + else: + assert ( + len(dispatched_input.shape) == 4 + and dispatched_input.shape[1] == self.world_size + and dispatched_input.shape[0] == self.num_local_experts + ), ( + f"When using ep_comm_overlap, moe_gate_dispatch_permute is needed. 
" + f"Expected dispatched_input to have shape[1] == {self.world_size} " + f"and shape[0] == {self.num_local_experts}, " + f"but got shape {dispatched_input.shape}" + ) + dispatched_input = dispatched_input + dispatch_mask.stop_gradient = True + scatter_index.stop_gradient = True + return ( + dispatched_input, + combine_weights, + dispatch_mask, + scatter_index, + router_loss, + gate_logits, + prob, + ) + + def _calc_router_loss( + self, + dispatch_mask, + gate_logits, + gate_prob, + num_experts, + use_group, + layer_idx, + token_type=None, + tokens_type_mask=None, + dispatch_tokens_mask=None, + prefix="", + ): + log = {} + router_loss, l_aux, orthogonal_loss, zloss = 0.0, None, None, None + if self.gate.config.moe_aux_loss_lambda: + l_aux = self.gate._cal_aux_loss( + gate_prob, + dispatch_mask, + num_experts, + use_group, + tokens_type_mask, + dispatch_tokens_mask, + ) + router_loss += self.gate.moe_aux_loss_lambda[token_type or 0] * l_aux + else: + router_loss += self.zero * gate_prob[0, 0] + if self.gate.config.moe_orthogonal_loss_lambda: + orthogonal_loss = self.gate._cal_orthogonal_loss(token_type, use_group) + router_loss += ( + self.gate.moe_orthogonal_loss_lambda[token_type or 0] * orthogonal_loss + ) + if self.gate.config.moe_z_loss_lambda and not in_auto_parallel_align_mode(): + zloss = self.gate._cal_z_loss(gate_logits, tokens_type_mask) + router_loss += self.gate.moe_z_loss_lambda[token_type or 0] * zloss + + tracer = framework._dygraph_tracer() + if self.enable_logging and global_training_logs_enabled() and tracer._has_grad: + if l_aux is not None: + log[f"aux_loss_layer_{self.layer_idx}"] = l_aux + + if orthogonal_loss is not None: + log[f"orthogonal_loss_layer_{self.layer_idx}"] = orthogonal_loss + + if zloss is not None: + log[f"zloss_layer_{self.layer_idx}"] = zloss + + global_training_logs.update( + **log, + **{ + k.replace(f"_layer_{self.layer_idx}", ""): v for k, v in log.items() + }, + ) + global_training_logs.update( + **{ + prefix + "_" + 
k.replace(f"_layer_{self.layer_idx}", ""): v + for k, v in log.items() + } + ) + return router_loss + + def calc_router_loss_and_logging( + self, + router_loss, + combine_weights, + dispatch_mask, + gate_logits, + gate_prob, + token_type_ids, + dispatch_token_type_ids=None, + offload_helper=None, + ): + + use_fuse = isinstance(self.gate, (TopKGateFused)) + if use_fuse: + assert gate_prob is not None + if token_type_ids is not None and self.gate.config.moe_use_hard_gate: + if not self.gate.weight.stop_gradient: + lm_tokens_mask = token_type_ids == 0 + if offload_helper is not None: + is_lm = offload_helper["lm_mask"][1] + else: + is_lm = lm_tokens_mask.any() + if is_lm: + dispatch_tokens_mask = ( + dispatch_token_type_ids == 0 + if dispatch_token_type_ids is not None + else None + ) + router_loss += self._calc_router_loss( + ( + dispatch_mask[self.gate.experts_type_mask[0]] + if hasattr(self.gate, "experts_type_mask") + else dispatch_mask + ), + ( + gate_logits[:, self.gate.experts_type_mask[0]] + if hasattr(self.gate, "experts_type_mask") + else gate_logits + ), + ( + gate_prob[:, self.gate.experts_type_mask[0]] + if hasattr(self.gate, "experts_type_mask") + else gate_prob + ), + ( + self.gate.num_experts_list[0] + if hasattr(self.gate, "num_experts_list") + else self.gate.num_experts_tensor + ), + self.group_experts, + self.layer_idx, + 0, + lm_tokens_mask, + dispatch_tokens_mask, + prefix="lm", + ) + mm_tokens_mask = token_type_ids == 1 + if offload_helper is not None: + is_mm = offload_helper["mm_mask"][1] + else: + is_mm = mm_tokens_mask.any() + if is_mm: + dispatch_tokens_mask = ( + dispatch_token_type_ids == 1 + if dispatch_token_type_ids is not None + else None + ) + router_loss += self._calc_router_loss( + dispatch_mask[self.gate.experts_type_mask[1]], + gate_logits[:, self.gate.experts_type_mask[1]], + gate_prob[:, self.gate.experts_type_mask[1]], + self.gate.num_experts_list[1], + False, + self.layer_idx, + 1, + mm_tokens_mask, + dispatch_tokens_mask, + 
prefix="mm", + ) + + else: + router_loss += self._calc_router_loss( + dispatch_mask, + gate_logits, + gate_prob, + self.gate.num_experts_tensor, + self.group_experts, + self.layer_idx, + ) + + if self.enable_logging and global_training_logs_enabled(): + seqlen = gate_logits.shape[0] + num_active = paddle.count_nonzero(combine_weights) + gate_experts_per_token = num_active.item() / seqlen + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape([-1]) + combine_weights_type_0 = combine_weights[token_type_ids == 0] + if combine_weights_type_0.size: + gate_expert_per_token_type_0 = ( + paddle.count_nonzero(combine_weights_type_0).item() + / combine_weights_type_0.shape[0] + ) + global_training_logs.update( + experts_per_token_text=gate_expert_per_token_type_0, + ) + + combine_weights_type_1 = combine_weights[token_type_ids == 1] + if combine_weights_type_1.size: + gate_expert_per_token_type_1 = ( + paddle.count_nonzero(combine_weights_type_1).item() + / combine_weights_type_1.shape[0] + ) + global_training_logs.update( + experts_per_token_image=gate_expert_per_token_type_1, + ) + + ce = ( + (-F.softmax(gate_logits, -1) * F.log_softmax(gate_logits, -1)) + .sum(-1) + .mean(0) + ) + _log = { + f"gate_prob_ce_layer_{self.layer_idx}": ce.item(), + f"experts_per_token_layer_{self.layer_idx}": gate_experts_per_token, + } + global_training_logs.update( + **_log, + **{ + k.replace(f"_layer_{self.layer_idx}", ""): v + for k, v in _log.items() + }, + ) + else: + seqlen = dispatch_mask.shape[0] + dispatch_mask = dispatch_mask.unbind(-1) + top1_gate_experts_per_token = ( + paddle.cast(dispatch_mask[0], dtype="float32").sum() / seqlen + ) + if ( + self.enable_logging + and global_training_logs_enabled() + and len(dispatch_mask) == 2 + ): + top2_gate_experts_per_token = ( + paddle.cast(dispatch_mask[1], dtype="float32").sum() / seqlen + ) + leakage_experts_per_token = ( + paddle.cast( + (~dispatch_mask[0]) & (~dispatch_mask[1]), dtype="float32" + ).sum() + / 
seqlen + ) + experts_per_token = ( + top1_gate_experts_per_token + top2_gate_experts_per_token + ) + global_training_logs.update( + experts_per_token=experts_per_token.detach(), + top1_experts_per_token=top1_gate_experts_per_token.detach(), + top2_experts_per_token=top2_gate_experts_per_token.detach(), + leakage_experts_per_token=leakage_experts_per_token.detach(), + ) + elif ( + self.enable_logging + and global_training_logs_enabled() + and len(dispatch_mask) == 1 + ): + experts_per_token = top1_gate_experts_per_token + leakage_experts_per_token = ( + paddle.cast(~dispatch_mask[0], dtype="float32").sum() / seqlen + ) + global_training_logs.update( + experts_per_token=experts_per_token.detach(), + top1_experts_per_token=top1_gate_experts_per_token.detach(), + leakage_experts_per_token=leakage_experts_per_token.detach(), + ) + + return router_loss + + def combine_expert_output(self, expert_output, combine_weights, scatter_index): + + expert_output = expert_output.reshape([-1, expert_output.shape[-1]]) + use_fuse = isinstance(self.gate, (TopKGateFused)) + combine_fn = combining_fused if use_fuse else combining + combined_output = combine_fn(expert_output, combine_weights, scatter_index) + + if self.output_postprocess is not None: + combined_output = self.output_postprocess(combined_output) + return combined_output + + def forward_single_stage(self, dispatched_input, stage_id): + assert isinstance(self.experts, nn.LayerList) + return self.experts[stage_id](dispatched_input) + + def forward( + self, + input: Tensor, + token_type_ids=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + + if input.ndim == 3: + orig_shape = input.shape + input = input.reshape([-1, input.shape[-1]]) + else: + orig_shape = None + assert ( + len(input.shape) == 2 + ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}" + hidden_size = input.shape[1] + if token_type_ids is not None: + token_type_ids = token_type_ids.clone()[:, :-1] + if 
self.config.sequence_parallel: + token_type_ids = token_type_ids.reshape([-1]) + token_type_ids = ScatterOp.apply(token_type_ids) + token_type_ids.stop_gradient = True + + assert self.gate is not None + if hasattr(self, "rng") and self.rng.random() < self.all_to_all_dropout: + orig_shape_2 = input.shape + if self.config.moe_multimodal_paired_experts: + assert token_type_ids is not None + input = paddle.concat( + [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 + ) + output = self.forward_experts(input) + output += self.gate.weight.sum() * 0.0 + output = output.reshape(orig_shape or orig_shape_2) + return output, None, 0 + + is_first_fwd = not framework._dygraph_tracer()._has_grad + use_async = self.shared_experts is not None + if in_auto_parallel_align_mode(): + gate_input = paddle.assign(input) + else: + gate_input = input + + use_fp8_fuse_node = ( + self.config.use_combine_before_a2a and self.config.use_fp8_fuse_node + ) + use_fp8_dispatch_a2a = self.config.use_fp8_dispatch_a2a and use_fp8_fuse_node + + with profile("fused_gate_and_dispatch"): + fp8_dispatched_handle = None + if use_fp8_dispatch_a2a: + ( + dispatched_input, + combine_weights, + dispatch_mask, + scatter_index, + router_loss, + gate_logits, + gate_prob, + fp8_dispatched_handle, + ) = self.gate_distpach_and_quant(gate_input, token_type_ids) + else: + ( + dispatched_input, + combine_weights, + dispatch_mask, + scatter_index, + router_loss, + gate_logits, + gate_prob, + ) = self.gate_and_distpach(gate_input, token_type_ids) + + if self.config.use_combine_before_a2a: + assert ( + not self.config.use_ep_comm_overlap + ), "Dont support use_ep_comm_overlap" + assert ( + moe_combine_no_weight is not None + ), "use_combine_before_a2a can only use with moe_combine_no_weight op, please install it first." 
+ cw_shape = combine_weights.shape + si_shape = scatter_index.shape + scatter_index = scatter_index.reshape([-1]) + + token_combine_weights = paddle.zeros( + [cw_shape[0] * cw_shape[1]], dtype=combine_weights.dtype + ) + token_combine_weights = paddle.scatter( + token_combine_weights, + scatter_index, + combine_weights.reshape([-1]), + overwrite=False, + ) + + token_combine_weights = token_combine_weights.reshape( + [cw_shape[0], cw_shape[1], 1] + ) + token_combine_weights = AlltoAll.apply(token_combine_weights, self.group) + + if not self.config.use_ep_comm_overlap: + if use_fp8_dispatch_a2a: + shared_out = ( + self.shared_experts(input) + if self.shared_experts is not None + else None + ) + else: + with profile("moe_comm_and_shared_expert"): + if use_async: + dispatched_input, shared_out = AlltoAllAsync.apply( + dispatched_input, + input, + group=self.group, + fn=self.shared_experts, + is_first_fwd=is_first_fwd, + ) + else: + dispatched_input = AlltoAll.apply(dispatched_input, self.group) + + expert_out = ( + recompute(self.forward_experts, dispatched_input) + if self.recompute and self.training + else self.forward_experts(dispatched_input) + ) + + if self.config.use_combine_before_a2a: + token_combine_weights = token_combine_weights.clone().reshape( + expert_out.shape[:-1] + [1] + ) + expert_out = expert_out * token_combine_weights + else: + assert ( + len(dispatched_input.shape) == 4 + and dispatched_input.shape[1] == self.world_size + and dispatched_input.shape[0] == self.num_local_experts + ), ( + f"When using ep_comm_overlap, moe_gate_dispatch_permute is needed. 
" + f"Expected dispatched_input to have shape[1] == {self.world_size} " + f"and shape[0] == {self.num_local_experts}, " + f"but got shape {dispatched_input.shape}" + ) + with profile("moe_comm_and_forward_expert"): + expert_out = AlltoAllExpertOverlap.apply( + dispatched_input, + self.group, + self.num_local_experts, + self.experts, + is_first_fwd=is_first_fwd, + ) + if self.shared_experts is not None: + shared_out = self.shared_experts(input) + + with profile("moe_comm_and_calc_routerloss"): + expert_out, router_loss2 = AlltoAllAsync.apply( + expert_out, + router_loss, + combine_weights, + dispatch_mask, + gate_logits, + gate_prob, + token_type_ids, + group=self.group, + fn=self.calc_router_loss_and_logging, + is_first_fwd=is_first_fwd, + ) + + with profile("combine"): + if self.config.use_combine_before_a2a: + expert_out = expert_out.reshape([-1, hidden_size]) + + scatter_index = scatter_index.reshape(si_shape) + combined_output = moe_combine_no_weight( + expert_out, combine_weights, scatter_index, epsilon=1e-15 + ) + else: + combined_output = self.combine_expert_output( + expert_out, combine_weights, scatter_index + ) + + if self.shared_experts is not None: + combined_output += shared_out + + if orig_shape: + combined_output = combined_output.clone().reshape( + orig_shape[:-1] + [combined_output.shape[-1]] + ) + return combined_output, combine_weights, router_loss2, gate_logits + + +def combining_fused_auto(x, combine_weights, scatter_index, hard_gate=False): + """ + Args: + x: Tensor[seq, dim] + combine_weights: [s, k] + scatter_index: ** [k, s] ** + + Returns: + y: Tensor[s, dim] + """ + if hard_gate: + x_gatherd = F.embedding(scatter_index, x) + return x_gatherd.squeeze(-2) + ret = moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) + + ret.stop_gradient = False + return ret + + def detach_and_requires_grad_(*args): """detach_and_requires_grad_""" ret = [a.detach() if a is not None else None for a in args] @@ -349,54 +1407,6 @@ def 
detach_and_requires_grad_(*args): return ret -def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]): - """ - Args: - f(callable) - args(*Any) - Returns - bw_f(callable): manual backward fn - out(List[Tensor]): output of f(*args) - """ - tracer = framework._dygraph_tracer() - orig = tracer._has_grad - if not is_first_fwd: - tracer._has_grad = True # turn on grad trace so we can manual backward - - detached_args = detach_and_requires_grad_(*args) - detached_args_clone = [a.clone() if a is not None else None for a in detached_args] - out = f(*detached_args_clone) - for a in detached_args: - if a is not None: - a._clear_dataptr() # free mem - if isinstance(out, list): - out = tuple(out) - elif not isinstance(out, tuple): - out = (out,) - - if is_first_fwd: - tracer._has_grad = orig - return None, out - - out_cached = [ - o.clone() for o in out if o is not None and not o.stop_gradient - ] # do not cache stop_gradient output - for o in out_cached: - o._clear_dataptr() # free mem - tracer._has_grad = orig - - def bwd_f(*grad): - nonlocal out_cached, detached_args, f - grad = list(grad) - grad = [g for g in grad if g is not None] - assert len(grad) == len(out_cached), (len(grad), len(out_cached), f) - # out, grad = zip(*[(o, g) for o, g in zip(out, grad) if g is not None]) - paddle.autograd.backward(out_cached, grad) - return tuple([t.grad if t is not None else None for t in detached_args]) - - return bwd_f, out - - def bpr_preprocess(input, logits, capacity, buffer): """impletment bpr sorting""" assert input.ndim == 2, input.shape @@ -460,7 +1470,7 @@ def __init__( is_dummy_moe = config.moe_world_size == 1 for p in experts.parameters(): - p.expert = not (is_mp_moe or is_dummy_moe) # type: ignore + p.expert = not (is_mp_moe or is_dummy_moe) p.no_sync = not (is_mp_moe or is_dummy_moe) logger.info(f"expert no-sync={p.no_sync}-{p.name}") if is_mp_moe or is_mp_moe: @@ -559,7 +1569,7 @@ def forward_experts(self, dispatched_input): assert len(chunks) == 
len(self.experts), (len(chunks), len(self.experts)) for chunk, expert in zip(chunks, self.experts): expert_outputs += [expert(chunk)] - expert_output = paddle.stack(expert_outputs, axis=1) # [ecm] + expert_output = paddle.stack(expert_outputs, axis=1) return expert_output def gate_and_distpach(self, input, token_type_ids): @@ -600,7 +1610,6 @@ def gate_and_distpach(self, input, token_type_ids): with profile("moe-dispatch"): if use_fuse: - # capacity no use k = self.k prob, max_prob = self.fused_gate_logits_process( gate_logits, token_type_ids @@ -616,12 +1625,9 @@ def gate_and_distpach(self, input, token_type_ids): ) dispatched_input.stop_gradient = False combine_weights_unnorm.stop_gradient = False - # NOTE: PyLayer do not support some inputs with stop_gradient=True in static mode - # it's a bug that will be fixed in the future - # scatter_index.stop_gradient = True dispatch_mask.stop_gradient = True - scatter_index = scatter_index.transpose([1, 0]) # [k,s] ->[s,k] + scatter_index = scatter_index.transpose([1, 0]) if self.group_experts: if max_prob is not None: @@ -680,9 +1686,7 @@ def combine_expert_output(self, expert_output, combine_weights, scatter_index): [dist.Shard(0)], ) else: - expert_output = expert_output.reshape( - [-1, expert_output.shape[-1]] - ) # [e*c,m] + expert_output = expert_output.reshape([-1, expert_output.shape[-1]]) if not self.config.moe_use_all2all: if self.config.moe_group == "mp": @@ -752,15 +1756,14 @@ def forward( token_type_ids = token_type_ids.clone()[:, :-1] if self.config.sequence_parallel: token_type_ids = token_type_ids.reshape([-1]) - # token_type_ids = ScatterOp.apply(token_type_ids) token_type_ids.stop_gradient = True assert self.gate is not None if hasattr(self, "rng") and self.rng.random() < self.all_to_all_dropout: orig_shape_2 = input.shape output = self.forward_experts(input) - output += self.gate.weight.sum() * 0.0 # hack for grad - output = output.reshape(orig_shape or orig_shape_2) # [e*1,c,m] + output += 
self.gate.weight.sum() * 0.0 + output = output.reshape(orig_shape or orig_shape_2) return output, None, 0 ( @@ -777,7 +1780,6 @@ def forward( dispatched_input, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(1)] ) if self.config.moe_group == "mp": - # TODO(zhangyichen): 统一 moe_group 是 mp 和其他情况下的代码 dispatched_input = dist.reshard( dispatched_input, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(0)] ) diff --git a/examples/pre-training/models/moe/moe_layer_auto_utils.py b/examples/pre-training/models/moe/moe_layer_auto_utils.py deleted file mode 100644 index f9ad5995..00000000 --- a/examples/pre-training/models/moe/moe_layer_auto_utils.py +++ /dev/null @@ -1,1454 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Tuple, List, Optional -import logging -from collections import namedtuple -import inspect - -import paddle -from paddle import framework -from paddle import nn -from paddle.distributed.communication import stream -import paddle.nn.functional as F - -from paddle.autograd import PyLayer -from paddle.distributed.communication.group import Group -from paddle.distributed.fleet.utils import recompute -from paddle.distributed import fleet -from paddle.distributed import in_auto_parallel_align_mode - -import paddle.distributed as dist -from paddle import Tensor - -from models.moe.top2_gate_auto_auto import ( - TopKGateFused, - cast_if_needed, -) -from models.sequence_parallel_utils_auto import ScatterOp -from models.utils import ( - global_training_logs_enabled, - manual_backward, -) - -from models.comm_utils import profile - - -from paddle.incubate.nn.functional import ( - moe_combine, -) - - -try: - from src.utils.misc import global_training_logs -except ModuleNotFoundError: - global_training_logs = {} -try: - import moe_router_loss_ops -except ImportError: - moe_router_loss_ops = None - - -logger = logging.getLogger(__name__) - - -try: - import moe_ops -except ImportError: - moe_ops = None - logger.warning( - "`moe-ops` not found, run " - "`python3 src/ernie_core/ops/moe/setup.py install` to install" - ) - -try: - import moe_ops_fp8 -except ImportError: - moe_ops_fp8 = None - logger.warning( - "`moe-ops` not found, run " - "`python3 src/ernie_core/ops/moe/setup_fp8.py install` to install" - ) - -try: - from moe_combine import moe_combine_no_weight -except ImportError: - moe_combine_no_weight = None - - -try: - import fused_ln as fused -except ImportError: - logger.warning( - "fused-ln not found, run `python src/ops/fused_ln_setup.py install` to build fused ln" - ) - fused = None - -try: - from custom_setup_ops import matmul_bwd -except ImportError: - matmul_bwd = None - - -GateOutput = namedtuple( - "GateOutput", - [ - "aux", - "z", - "logits", - 
], -) - - -class GateCombine_ori(PyLayer): - - @staticmethod - def forward(ctx, x, combine_weights, scatter_index): - ctx.x = x - ctx.combine_weights = combine_weights - ctx.scatter_index = scatter_index - assert moe_combine is not None - ret = moe_combine.moe_combine(x, combine_weights, scatter_index) - return ret - - @staticmethod - def backward(ctx, grad_y, *_): - assert moe_combine is not None - grad_x, grad_combine_weight_helper = moe_combine.moe_combine_bwd( - ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y - ) - - grad_combine_weight = grad_combine_weight_helper.sum(-1) - return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None - - -def combining_fused(x, combine_weights, scatter_index, hard_gate=False): - - if hard_gate: - x_gatherd = F.embedding(scatter_index, x) # [s,k,dim] - return x_gatherd.squeeze(-2) - ret = GateCombine_ori.apply(x, combine_weights, scatter_index) - ret.stop_gradient = False - return ret - - -def recompute_fwd_gate_up_func(config, layer_idx): - - if "recompute_fwd_gate_up" in config.fp8_mem_configs: - if isinstance(config.fp8_mem_configs["recompute_fwd_gate_up"], bool): - return config.fp8_mem_configs["recompute_fwd_gate_up"] - if isinstance(config.fp8_mem_configs["recompute_fwd_gate_up"], list): - return layer_idx in config.fp8_mem_configs["recompute_fwd_gate_up"] - - return False - - -def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity): - - output = None - # init_output = paddle.zeros([num_experts * capacity, x.shape[-1]], dtype='float32') - # output = init_output + 0. 
* x.sum() - orig_dtype = x.dtype - scatter_index = scatter_index.unbind(1) - dispatch_mask = dispatch_mask.unbind(1) - for i_scatter_index, i_dispatch_mask in zip(scatter_index, dispatch_mask): - init_output = paddle.zeros( - [num_experts * capacity, x.shape[-1]], dtype="float32" - ) - updates = x * i_dispatch_mask.unsqueeze(-1).cast(x.dtype) - if output is None: - output = paddle.scatter( - init_output, - i_scatter_index, - updates, - overwrite=False, - ) - else: - output = output + paddle.scatter( - init_output, - i_scatter_index, - updates, - overwrite=False, - ) - if output.dtype != orig_dtype: - output = output.cast(orig_dtype) - return output - - -def combining(x, combine_weights, scatter_index): - - dim = x.shape[-1] - scatter_index = scatter_index.reshape([-1]) - num_k = combine_weights.shape[-1] - combine_weights = combine_weights.unsqueeze(1) - # num_k = 2 - x = paddle.gather(x, scatter_index).reshape([-1, num_k, dim]) # [seq,2,dim] - return paddle.matmul(combine_weights, x).squeeze( - 1 - ) # [seq,1,2] @ [seq,2,dim] -> [seq,1,dim] - - -def fuse_logging(gate_logits, combine_weights, token_type_ids): - with paddle.no_grad(): - gate_expert_per_token_type_0, gate_expert_per_token_type_1 = None, None - gate_experts_per_token = None - ce = moe_router_loss_ops.cal_cross_entropy_info(gate_logits).mean(0) - if token_type_ids is not None: - ( - gate_expert_per_token_type_0, - gate_expert_per_token_type_1, - gate_experts_per_token, - ) = moe_router_loss_ops.cal_gate_experts_per_token_info( - combine_weights, token_type_ids - ) - else: - gate_experts_per_token = paddle.count_nonzero(combine_weights) / ( - gate_logits.shape[0] - ) - - return ( - gate_expert_per_token_type_0, - gate_expert_per_token_type_1, - gate_experts_per_token, - ce, - ) - - -class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer): - - @staticmethod - def forward( - ctx, x, gate_logtis, corr_bias, k, capacity, use_pad, use_pow2_scale=True - ): - ( - out_fp8, - scale, - combine_weights, - 
scatter_index, - expert_offset, - expert_id, - ) = moe_ops_fp8.moe_gate_dispatch_and_quant( - x, - gate_logtis, - corr_bias=corr_bias, - k=k, - capacity=capacity, - use_pad=use_pad, - use_pow2_scale=use_pow2_scale, - ) - assert out_fp8.shape[0] == scale.shape[0] - - out_fp8.stop_gradient = False - combine_weights.stop_gradient = False - scatter_index.stop_gradient = True - expert_offset.stop_gradient = True - expert_id.stop_gradient = True - scale.stop_gradient = True - - ctx.k = k - ctx.capacity = capacity - ctx.use_pad = use_pad - ctx.combine_weights = combine_weights - ctx.scatter_index = scatter_index - ctx.expert_id = expert_id - ctx.has_corr_bias = corr_bias is not None - - return ( - out_fp8, - combine_weights, - scatter_index, - expert_offset, - expert_id, - { - "scale": scale, - }, - ) - - @staticmethod - def backward(ctx, *grads): - out_grad, combine_weights_grad = grads[0], grads[1] - x_grad, gate_logits_grad = moe_ops.moe_gate_dispatch_bwd( - ctx.combine_weights, - ctx.scatter_index, - ctx.expert_id, - out_grad, - combine_weights_grad, - k=ctx.k, - capacity=ctx.capacity, - use_pad=ctx.use_pad, - ) - if ctx.has_corr_bias: - return x_grad, gate_logits_grad, None - else: - return x_grad, gate_logits_grad - - -class AlltoAll(PyLayer): - - @staticmethod - def forward(ctx, x, group, sync_op=True): - - ctx.group = group - if dist.get_world_size(group) <= 1: - return x - output = paddle.empty_like(x) - output.stop_gradient = False - task = stream.alltoall_single( - output, x, None, None, group, sync_op=sync_op, use_calc_stream=sync_op - ) - if not sync_op: - return output, task - else: - return output - - @staticmethod - def backward(ctx, *dx): - return AlltoAll.apply(*dx, group=ctx.group) - - -class AlltoAllExpertOverlap(PyLayer): - - @staticmethod - def forward( - ctx, input, group, num_local_experts, forward_func_dict, is_first_fwd=False - ): - assert ( - dist.get_world_size(group) > 1 - ), "AlltoAllExpertOverlap is not supported for a world size less than 
or equal to 1." - - ctx.bw_funcs = {} - ctx.group = group - ctx.num_local_experts = num_local_experts - - assert isinstance(forward_func_dict, nn.LayerList) - all2all_tasks = [] - all2all_ins = paddle.unbind(input, axis=0) - for stage_id in range(1): - stage_input = all2all_ins[stage_id] - x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) - all2all_tasks.append((task, x_out)) - - expert_outputs = [] - for stage_id in range(num_local_experts): - if stage_id + 1 != num_local_experts: - stage_input = all2all_ins[stage_id + 1] - x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) - all2all_tasks.append((task, x_out)) - - task, dispatched_input = all2all_tasks[stage_id] - task.wait() - bwf, (expert_outputs_cur_stage,) = manual_backward( - forward_func_dict[stage_id], is_first_fwd, dispatched_input - ) - ctx.bw_funcs[stage_id] = bwf - expert_outputs.append(expert_outputs_cur_stage) - - expert_output = paddle.stack(expert_outputs, axis=1) - return expert_output - - @staticmethod - def backward(ctx, out_grad): - all2all_tasks = [] - expert_outputs = [] - - out_grad_list = paddle.split( - out_grad, num_or_sections=out_grad.shape[1], axis=1 - ) - for stage_id in range(ctx.num_local_experts): - (grad_cur_stage,) = ctx.bw_funcs[stage_id](out_grad_list[stage_id]) - - x_out, task = AlltoAll.apply(grad_cur_stage, group=ctx.group, sync_op=False) - all2all_tasks.append(task) - expert_outputs.append(x_out) - - for task in all2all_tasks: - task.wait() - - expert_output = paddle.stack(expert_outputs, axis=0) - return expert_output - - -class AlltoAllAsync(PyLayer): - - @staticmethod - def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): - - assert fn is not None, "use AlltoAll no async" - ctx.group = group - if dist.get_world_size(group) <= 1: - ctx.bwf, fn_out = manual_backward(fn, is_first_fwd, *fn_args) - return (x,) + fn_out - x_out = paddle.empty_like(x) - x_out.stop_gradient = False - task = stream.alltoall_single( - 
x_out, - x, - None, - None, - group, - sync_op=False, - ) - ctx.bwf, fn_out = manual_backward(fn, is_first_fwd, *fn_args) - task.wait() - return (x_out,) + fn_out - - @staticmethod - def backward(ctx, dx_out, *fn_out_grads): - if dist.get_world_size(ctx.group) <= 1: - fn_args_grads = ctx.bwf(*fn_out_grads) - return (dx_out,) + fn_args_grads - - dx = paddle.empty_like(dx_out) - dx.stop_gradient = False - task = stream.alltoall_single( - dx, - dx_out, - None, - None, - ctx.group, - sync_op=False, - ) - fn_args_grads = ctx.bwf(*fn_out_grads) - task.wait() - return (dx,) + fn_args_grads - - -class FusedNormGateFunc(paddle.autograd.PyLayer): - - @staticmethod - def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps): - ctx.dtype = paddle.float32 - norm_output, invar = fused.fused_rms_norm(x, rms_norm_weight, eps) - with paddle.amp.auto_cast(False): - gate_logits = F.linear( - cast_if_needed(norm_output, ctx.dtype), - cast_if_needed(moe_gate_weight, ctx.dtype), - ) - - ctx.save_for_backward(x, rms_norm_weight, moe_gate_weight, eps) - return gate_logits, norm_output - - @staticmethod - def backward(ctx, d_gate_logits, d_norm_output): - x, rms_norm_weight, moe_gate_weight, eps = ctx.saved_tensor() - norm_output, invar = fused.fused_rms_norm(x, rms_norm_weight, eps) - d_norm_output_linear, d_moe_gate_weight = matmul_bwd( - cast_if_needed(norm_output, ctx.dtype), - cast_if_needed(moe_gate_weight, ctx.dtype), - d_gate_logits, - False, - False, - ) - d_norm_output_linear, d_moe_gate_weight = cast_if_needed( - d_norm_output_linear, norm_output.dtype - ), cast_if_needed(d_moe_gate_weight, moe_gate_weight.dtype) - d_norm_output = d_norm_output + d_norm_output_linear - dx, d_rms_norm_weight = fused.fused_rms_norm_grad_func( - x, rms_norm_weight, invar, d_norm_output, eps - ) - - return dx, d_rms_norm_weight, d_moe_gate_weight - - -class MOELayer(nn.Layer): - - def __init__( - self, - gate: nn.Layer, - experts: List[nn.Layer], - layer_idx, - shared_experts: 
Optional[List[nn.Layer]] = None, - group: Group = None, - recompute=False, - enable_logging: bool = False, - k=2, - enable_bpr: bool = False, - all_to_all_dropout=0, - group_experts=False, - moe_statics=None, - ): - - super().__init__() - self.gate = gate - self.layer_idx = layer_idx - self.recompute = recompute - logger.info(f"using moe recompute={recompute}") - for p in self.gate.parameters(): - p.is_gate = True - if isinstance(experts, nn.LayerList): - self.experts = experts - else: - logger.info(f"using fused experts, type={type(experts)}") - self.experts = experts - self.shared_experts = shared_experts - - self.group = group - self.k = k - self.all_to_all_dropout = all_to_all_dropout - self.enable_logging = enable_logging - self.use_correction_bias = moe_statics is not None - self.moe_statics = moe_statics - if self.use_correction_bias: - logger.info( - f"using correction bias, aux-coef:{self.gate.config.moe_aux_loss_lambda}" - ) - assert self.gate.config.moe_use_aux_free - - self.is_mp_moe = ( - hasattr(fleet.fleet, "_hcg") - and group is fleet.get_hybrid_communicate_group().get_model_parallel_group() - ) - self.is_ep_moe = ( - hasattr(fleet.fleet, "_hcg") - and hasattr( - fleet.get_hybrid_communicate_group(), - "get_moe_sharding_parallel_world_size", - ) - and fleet.get_hybrid_communicate_group().get_moe_sharding_parallel_world_size() - > 0 - ) - is_dummy_moe = dist.get_world_size(group) == 1 - - for p in experts.parameters(): - p.expert = not (self.is_mp_moe or is_dummy_moe) # type: ignore - p.no_sync = not (self.is_mp_moe or is_dummy_moe) - logger.info(f"expert no-sync={p.no_sync}-{p.name}") - if self.is_mp_moe or self.is_ep_moe: - p.is_distributed = True - - expert_color = None - if self.is_ep_moe: - moe_grad_group = ( - fleet.get_hybrid_communicate_group().get_moe_sharding_parallel_group() - ) - expert_color = {"color": "moe_expert", "group": moe_grad_group} - elif ( - self.config.offline_quant_expert_weight - and 
self.config.clear_origin_weight_when_offline_quant - ): - expert_color = {"color": "moe_expert"} - - if expert_color is not None: - for p in self.experts.parameters(): - setattr(p, "color", expert_color) - - self.world_size = dist.get_world_size(self.group) - # assert self.world_size > 1, f'moe-group not found, world_size {self.world_size}' - self.rank = dist.get_rank(self.group) - if self.world_size < 1: - self.world_size = 1 - if self.rank < 0: - self.rank = 0 - - self.num_local_experts = len(self.experts) - self.dispatch_by_task = ( - hasattr(self.gate, "dispatch_by_task") and self.gate.dispatch_by_task - ) - - if self.dispatch_by_task: - assert 0, "no supported, checkout earylier code" - assert self.num_local_experts == 1 - - self.input_preprocess = self.output_postprocess = None - self.group_experts = group_experts - self.config = self.gate.config - self.zero = paddle.to_tensor(0, dtype=paddle.float32) - - self._rr_moe_gate_dispatch = None - self._rr_moe_combine = None - self.use_norm_gate_recompute = None - - if self.config.use_recompute and self.config.skip_recompute_ops.get( - "moe_gate_dispatch", False - ): - self._rr_moe_gate_dispatch = None - if self.config.use_recompute and self.config.skip_recompute_ops.get( - "moe_combine", False - ): - self._rr_moe_combine = None - if hasattr(fleet.fleet, "_hcg"): - hcg = fleet.get_hybrid_communicate_group() - if ( - hasattr(hcg, "get_moe_sharding_parallel_world_size") - and hcg.get_moe_sharding_parallel_world_size() > 0 - ): - moe_grad_group = hcg.get_moe_sharding_parallel_group() - for p in self.experts.parameters(): - setattr( - p, "color", {"color": "moe_expert", "group": moe_grad_group} - ) - - def forward_experts(self, dispatched_input): - - with profile("fwd-expert"): - dispatched_input = dispatched_input.reshape( - [ - self.world_size, - self.num_local_experts, - -1, - dispatched_input.shape[-1], - ] - ) # [e,1,c,m] - expert_outputs = [] - if isinstance(self.experts, nn.LayerList): - - chunks = 
dispatched_input.transpose([1, 0, 2, 3]).contiguous().unbind(0) - assert len(chunks) == len(self.experts), ( - len(chunks), - len(self.experts), - ) - for chunk, expert in zip(chunks, self.experts): - expert_outputs += [expert(chunk)] - # logger.info( - # f"moe-fwd-expert: {chunk.shape}" - # f'-> {expert_outputs[-1].shape}: {chunk.astype("float32").norm(axis=-1)}' - # ) - expert_output = paddle.stack(expert_outputs, axis=1) # [ecm] - - else: - dispatched_input = dispatched_input.transpose([1, 0, 2, 3]) - dispatched_input.contiguous() - orig_shape = dispatched_input.shape - chunks = dispatched_input.reshape([orig_shape[0], -1, orig_shape[-1]]) - chunks = self.experts(chunks) - chunks = chunks.reshape(orig_shape[:-1] + [chunks.shape[-1]]).unbind(0) - expert_outputs += chunks - expert_output = paddle.stack(expert_outputs, axis=1) # [ecm] - return expert_output - - def fused_gate_logits_process( - self, gate_logits, token_type_ids, offload_helper=None - ): - - k = self.k - experts_type_ids = self.gate.experts_type_ids - use_hard_gate = self.config.moe_use_hard_gate - max_prob = None - - if token_type_ids is not None and use_hard_gate: - if offload_helper is None: - offload_helper = dict() - lm_mask = token_type_ids == 0 - is_lm = lm_mask.any() - mm_mask = token_type_ids == 1 - is_mm = mm_mask.any() - seq_lm = lm_mask.sum() - seq_mm = mm_mask.sum() - lm_mask = lm_mask.unsqueeze(1) & (experts_type_ids == 0).unsqueeze(0) - mm_mask = mm_mask.unsqueeze(1) & (experts_type_ids == 1).unsqueeze(0) - offload_helper["lm_mask"] = [lm_mask, is_lm, seq_lm] - offload_helper["mm_mask"] = [mm_mask, is_mm, seq_mm] - - is_lm = offload_helper["lm_mask"][1] - prob = paddle.zeros_like(gate_logits) - # 处理 lm_prob - if is_lm: - lm_mask = offload_helper["lm_mask"][0] - seq_lm_cpu = offload_helper["lm_mask"][2] - lm_mask_nonzero = lm_mask.nonzero() - lm_partial_gate_logits = gate_logits.gather_nd(lm_mask_nonzero).reshape( - [seq_lm_cpu, -1] - ) - if self.group_experts: - lm_prob = 
self.gate.act( - lm_partial_gate_logits.reshape( - [lm_partial_gate_logits.shape[0], k, -1] - ) - ) - max_prob = lm_prob.max(-1, keepdim=True) # [s_l, k, 1] - lm_prob /= max_prob - else: - lm_prob = self.gate.act(lm_partial_gate_logits) - prob = paddle.scatter_nd_add(prob, lm_mask_nonzero, lm_prob.flatten()) - is_mm = offload_helper["mm_mask"][1] - if is_mm: - mm_mask = offload_helper["mm_mask"][0] - seq_mm_cpu = offload_helper["mm_mask"][2] - mm_mask_nonzero = paddle.nonzero(mm_mask) - mm_partial_gate_logits = gate_logits.gather_nd(mm_mask_nonzero).reshape( - [seq_mm_cpu, -1] - ) - mm_prob = self.gate.act(mm_partial_gate_logits) - prob = paddle.scatter_nd_add(prob, mm_mask_nonzero, mm_prob.flatten()) - else: - if self.group_experts: - prob = self.gate.act(gate_logits.reshape([gate_logits.shape[0], k, -1])) - max_prob = prob.max(-1, keepdim=True) - prob /= max_prob - prob = prob.reshape([prob.shape[0], -1]) - else: - prob = self.gate.act(gate_logits) - return prob, max_prob - - def gate_distpach_and_quant(self, input, token_type_ids): - - assert isinstance(self.gate, (TopKGateFused)), "Only fused gate is supported." 
- assert not self.config.use_ep_comm_overlap, "ep_comm_overlap is not supported" - assert ( - self._rr_moe_gate_dispatch is None - ), "rr_moe_gate_dispatch is not supported" - assert moe_ops_fp8 is not None - - args = () - if token_type_ids is not None: - token_type_ids = token_type_ids.reshape([-1]) - args = (token_type_ids,) - - ( - gate_logits, - capacity, - router_loss, - ) = self.gate(input, *args) - - if self.config.moe_multimodal_paired_experts: - assert token_type_ids is not None - input = paddle.concat( - [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 - ) - if self.input_preprocess is not None: - input, gate_logits = self.input_preprocess(input, gate_logits, capacity) - - k = self.k - prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids) - - with profile("dispatch_op"): - corr_bias = ( - self.moe_statics.e_score_correction_bias[0].detach() - if self.use_correction_bias - else None - ) - - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - fp8_dispatched_handle, - ) = Fp8MoeGateDispatchAndQuant.apply( - input, prob, corr_bias, k=k, capacity=capacity, use_pad=True - ) - - dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0))) - if self.use_correction_bias: - if self.gate.config.multimodel_experts: - for i in range(len(self.moe_statics.expert_usage)): - self.moe_statics.expert_usage[i] += dispatch_mask[ - self.gate.experts_type_mask[i] - ].detach() - else: - self.moe_statics.expert_usage[0] += dispatch_mask.detach() - dispatched_input.stop_gradient = False - combine_weights_unnorm.stop_gradient = False - scatter_index.stop_gradient = True - dispatch_mask.stop_gradient = True - - scatter_index = scatter_index.transpose([1, 0]) - if self.group_experts: - if max_prob is not None: - if token_type_ids is not None: - p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1)) - p = paddle.scatter_nd_add( - p, paddle.nonzero(token_type_ids == 0), -1 + max_prob - ) - else: - p = 
max_prob - combine_weights_unnorm = ( - combine_weights_unnorm.unsqueeze(-1) * p - ).squeeze(-1) - # gate_prob 进行还原 - prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape([p.shape[0], -1]) - if self.gate.norm_gate_logits: - combine_weights = combine_weights_unnorm / paddle.clip( - combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12 - ) - else: - combine_weights = combine_weights_unnorm - combine_weights = combine_weights.cast("bfloat16") - - def reshape_for_a2a(tensor): - return tensor.reshape( - [ - self.world_size * self.num_local_experts, - capacity, - -1, - ] - ) - - dispatched_input = reshape_for_a2a(dispatched_input) - fp8_dispatched_handle["scale"] = reshape_for_a2a(fp8_dispatched_handle["scale"]) - dispatch_mask.stop_gradient = True - scatter_index.stop_gradient = True - return ( - dispatched_input, - combine_weights, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - prob, - fp8_dispatched_handle, - ) - - def gate_and_distpach(self, input, token_type_ids): - - seqlen, d_model = input.shape - args = () - if token_type_ids is not None: - token_type_ids = token_type_ids.reshape([-1]) - args = (token_type_ids,) - - use_fuse = isinstance(self.gate, (TopKGateFused)) - if use_fuse: - if self.use_norm_gate_recompute: - ( - gate_logits, - capacity, - router_loss, - norm_res, - ) = self.fused_norm_gate(input) - input = norm_res - else: - ( - gate_logits, - capacity, - router_loss, - ) = self.gate(input, *args) - else: - ( - capacity, - dispatch_mask, - combine_weights, - scatter_index, - router_loss, - gate_logits, - ) = self.gate( - input, - *args, - correction_bias=( - self.moe_statics.e_score_correction_bias[0] - if self.use_correction_bias - else None - ), - ) - prob = None - if self.config.moe_multimodal_paired_experts: - assert token_type_ids is not None - input = paddle.concat( - [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 - ) - if self.input_preprocess is not None: - input, gate_logits = 
self.input_preprocess(input, gate_logits, capacity) - if use_fuse: - # capacity no use - k = self.k - prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids) - - assert moe_ops is not None - with profile("dispatch_op"): - if ( - "corr_bias" - in inspect.signature(moe_ops.moe_gate_dispatch).parameters - ): - if self.use_correction_bias: - compat_args = (self.moe_statics.e_score_correction_bias[0],) - else: - compat_args = (None,) - else: - assert ( - not self.use_correction_bias - ), "correction bias not supported, rebuild moe-ops" - compat_args = () - if not self.config.use_ep_comm_overlap: - if self._rr_moe_gate_dispatch is None: - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - ) = moe_ops.moe_gate_dispatch( - input, - prob, - *compat_args, - k=k, - capacity=capacity, - use_pad=True, - ) - else: - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - ) = self._rr_moe_gate_dispatch( - input, - prob, - compat_args, - k=k, - capacity=capacity, - use_pad=True, - ) - else: - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - ) = moe_ops.moe_gate_dispatch_permute( - input, - prob, - *compat_args, - k=k, - capacity=capacity, - world_size=self.group.nranks, - ) - dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0))) - if self.use_correction_bias and framework._dygraph_tracer()._has_grad: - if self.gate.config.multimodel_experts: - for i in range(len(self.moe_statics.expert_usage)): - self.moe_statics.expert_usage[i] += dispatch_mask[ - self.gate.experts_type_mask[i] - ].detach() - else: - self.moe_statics.expert_usage[0] += dispatch_mask.detach() - dispatched_input.stop_gradient = False - combine_weights_unnorm.stop_gradient = False - scatter_index.stop_gradient = True - dispatch_mask.stop_gradient = True - - scatter_index = scatter_index.transpose([1, 0]) # [k,s] ->[s,k] - if self.group_experts: - if max_prob is not None: - if 
token_type_ids is not None: - p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1)) - p = paddle.scatter_nd_add( - p, paddle.nonzero(token_type_ids == 0), -1 + max_prob - ) - else: - p = max_prob - combine_weights_unnorm = ( - combine_weights_unnorm.unsqueeze(-1) * p - ).squeeze(-1) - # gate_prob 进行还原 - prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape( - [p.shape[0], -1] - ) - if self.gate.norm_gate_logits: - combine_weights = combine_weights_unnorm / paddle.clip( - combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12 - ) - else: - combine_weights = combine_weights_unnorm - combine_weights = combine_weights.cast(dispatched_input.dtype) - else: - dispatched_input = dispatching( - input, - dispatch_mask, - scatter_index, - num_experts=self.world_size * self.num_local_experts, - capacity=capacity, - ) - if self.use_correction_bias and framework._dygraph_tracer()._has_grad: - usage = paddle.bincount( - scatter_index.reshape([-1]) // capacity, - minlength=self.world_size * self.num_local_experts, - ) - assert ( - not self.config.multimodel_experts - ), "correction bias not supported, use top2-fused gate" - self.moe_statics.expert_usage[0] += usage.detach() - if not self.config.use_ep_comm_overlap: - dispatched_input = dispatched_input.reshape( - [ - self.world_size * self.num_local_experts, - capacity, - ( - d_model - if not self.config.moe_multimodal_paired_experts - else d_model + 1 - ), - ] - ) # .clone() - else: - assert ( - len(dispatched_input.shape) == 4 - and dispatched_input.shape[1] == self.world_size - and dispatched_input.shape[0] == self.num_local_experts - ), ( - f"When using ep_comm_overlap, moe_gate_dispatch_permute is needed. 
" - f"Expected dispatched_input to have shape[1] == {self.world_size} " - f"and shape[0] == {self.num_local_experts}, " - f"but got shape {dispatched_input.shape}" - ) - dispatched_input = dispatched_input # .clone() - dispatch_mask.stop_gradient = True - scatter_index.stop_gradient = True - return ( - dispatched_input, - combine_weights, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - prob, - ) - - def _calc_router_loss( - self, - dispatch_mask, - gate_logits, - gate_prob, - num_experts, - use_group, - layer_idx, - token_type=None, - tokens_type_mask=None, - dispatch_tokens_mask=None, - prefix="", - ): - log = {} - router_loss, l_aux, orthogonal_loss, zloss = 0.0, None, None, None - if self.gate.config.moe_aux_loss_lambda: - l_aux = self.gate._cal_aux_loss( - gate_prob, - dispatch_mask, - num_experts, - use_group, - tokens_type_mask, - dispatch_tokens_mask, - ) - router_loss += self.gate.moe_aux_loss_lambda[token_type or 0] * l_aux - else: - router_loss += ( - self.zero * gate_prob[0, 0] - ) # must use gate prob to avoid zero pointer - if self.gate.config.moe_orthogonal_loss_lambda: - orthogonal_loss = self.gate._cal_orthogonal_loss(token_type, use_group) - router_loss += ( - self.gate.moe_orthogonal_loss_lambda[token_type or 0] * orthogonal_loss - ) - if self.gate.config.moe_z_loss_lambda and not in_auto_parallel_align_mode(): - zloss = self.gate._cal_z_loss(gate_logits, tokens_type_mask) - router_loss += self.gate.moe_z_loss_lambda[token_type or 0] * zloss - - tracer = framework._dygraph_tracer() - if self.enable_logging and global_training_logs_enabled() and tracer._has_grad: - if l_aux is not None: - log[f"aux_loss_layer_{self.layer_idx}"] = l_aux - - if orthogonal_loss is not None: - log[f"orthogonal_loss_layer_{self.layer_idx}"] = orthogonal_loss - - if zloss is not None: - log[f"zloss_layer_{self.layer_idx}"] = zloss - - global_training_logs.update( - **log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v for k, v in log.items() - }, 
- ) - global_training_logs.update( - **{ - prefix + "_" + k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in log.items() - } - ) - return router_loss - - def calc_router_loss_and_logging( - self, - router_loss, - combine_weights, - dispatch_mask, - gate_logits, - gate_prob, - token_type_ids, - dispatch_token_type_ids=None, - offload_helper=None, - ): - - use_fuse = isinstance(self.gate, (TopKGateFused)) - if use_fuse: - assert gate_prob is not None - if token_type_ids is not None and self.gate.config.moe_use_hard_gate: - if not self.gate.weight.stop_gradient: - lm_tokens_mask = token_type_ids == 0 - if offload_helper is not None: - is_lm = offload_helper["lm_mask"][1] - else: - is_lm = lm_tokens_mask.any() - if is_lm: - dispatch_tokens_mask = ( - dispatch_token_type_ids == 0 - if dispatch_token_type_ids is not None - else None - ) - router_loss += self._calc_router_loss( - ( - dispatch_mask[self.gate.experts_type_mask[0]] - if hasattr(self.gate, "experts_type_mask") - else dispatch_mask - ), - ( - gate_logits[:, self.gate.experts_type_mask[0]] - if hasattr(self.gate, "experts_type_mask") - else gate_logits - ), - ( - gate_prob[:, self.gate.experts_type_mask[0]] - if hasattr(self.gate, "experts_type_mask") - else gate_prob - ), - ( - self.gate.num_experts_list[0] - if hasattr(self.gate, "num_experts_list") - else self.gate.num_experts_tensor - ), - self.group_experts, - self.layer_idx, - 0, - lm_tokens_mask, - dispatch_tokens_mask, - prefix="lm", - ) - mm_tokens_mask = token_type_ids == 1 - if offload_helper is not None: - is_mm = offload_helper["mm_mask"][1] - else: - is_mm = mm_tokens_mask.any() - if is_mm: - dispatch_tokens_mask = ( - dispatch_token_type_ids == 1 - if dispatch_token_type_ids is not None - else None - ) - router_loss += self._calc_router_loss( - dispatch_mask[self.gate.experts_type_mask[1]], - gate_logits[:, self.gate.experts_type_mask[1]], - gate_prob[:, self.gate.experts_type_mask[1]], - self.gate.num_experts_list[1], - False, - 
self.layer_idx, - 1, - mm_tokens_mask, - dispatch_tokens_mask, - prefix="mm", - ) - - else: - router_loss += self._calc_router_loss( - dispatch_mask, - gate_logits, - gate_prob, - self.gate.num_experts_tensor, - self.group_experts, - self.layer_idx, - ) - - if self.enable_logging and global_training_logs_enabled(): - seqlen = gate_logits.shape[0] - num_active = paddle.count_nonzero(combine_weights) - gate_experts_per_token = num_active.item() / seqlen - - if token_type_ids is not None: - token_type_ids = token_type_ids.reshape([-1]) - combine_weights_type_0 = combine_weights[token_type_ids == 0] - if combine_weights_type_0.size: - gate_expert_per_token_type_0 = ( - paddle.count_nonzero(combine_weights_type_0).item() - / combine_weights_type_0.shape[0] - ) - global_training_logs.update( - experts_per_token_text=gate_expert_per_token_type_0, - ) - - combine_weights_type_1 = combine_weights[token_type_ids == 1] - if combine_weights_type_1.size: - gate_expert_per_token_type_1 = ( - paddle.count_nonzero(combine_weights_type_1).item() - / combine_weights_type_1.shape[0] - ) - global_training_logs.update( - experts_per_token_image=gate_expert_per_token_type_1, - ) - - ce = ( - (-F.softmax(gate_logits, -1) * F.log_softmax(gate_logits, -1)) - .sum(-1) - .mean(0) - ) - _log = { - f"gate_prob_ce_layer_{self.layer_idx}": ce.item(), - f"experts_per_token_layer_{self.layer_idx}": gate_experts_per_token, - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) - else: - seqlen = dispatch_mask.shape[0] - dispatch_mask = dispatch_mask.unbind(-1) - top1_gate_experts_per_token = ( - paddle.cast(dispatch_mask[0], dtype="float32").sum() / seqlen - ) - if ( - self.enable_logging - and global_training_logs_enabled() - and len(dispatch_mask) == 2 - ): - top2_gate_experts_per_token = ( - paddle.cast(dispatch_mask[1], dtype="float32").sum() / seqlen - ) - leakage_experts_per_token = ( - paddle.cast( - 
(~dispatch_mask[0]) & (~dispatch_mask[1]), dtype="float32" - ).sum() - / seqlen - ) - experts_per_token = ( - top1_gate_experts_per_token + top2_gate_experts_per_token - ) - global_training_logs.update( - experts_per_token=experts_per_token.detach(), - top1_experts_per_token=top1_gate_experts_per_token.detach(), - top2_experts_per_token=top2_gate_experts_per_token.detach(), - leakage_experts_per_token=leakage_experts_per_token.detach(), - ) - elif ( - self.enable_logging - and global_training_logs_enabled() - and len(dispatch_mask) == 1 - ): - experts_per_token = top1_gate_experts_per_token - leakage_experts_per_token = ( - paddle.cast(~dispatch_mask[0], dtype="float32").sum() / seqlen - ) - global_training_logs.update( - experts_per_token=experts_per_token.detach(), - top1_experts_per_token=top1_gate_experts_per_token.detach(), - leakage_experts_per_token=leakage_experts_per_token.detach(), - ) - - return router_loss - - def combine_expert_output(self, expert_output, combine_weights, scatter_index): - - expert_output = expert_output.reshape([-1, expert_output.shape[-1]]) - use_fuse = isinstance(self.gate, (TopKGateFused)) - combine_fn = combining_fused if use_fuse else combining - combined_output = combine_fn(expert_output, combine_weights, scatter_index) - - if self.output_postprocess is not None: - combined_output = self.output_postprocess(combined_output) - return combined_output - - def forward_single_stage(self, dispatched_input, stage_id): - assert isinstance(self.experts, nn.LayerList) - return self.experts[stage_id](dispatched_input) - - def forward( - self, - input: Tensor, - token_type_ids=None, - ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: - - if input.ndim == 3: - orig_shape = input.shape - input = input.reshape([-1, input.shape[-1]]) - else: - orig_shape = None - assert ( - len(input.shape) == 2 - ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}" - hidden_size = input.shape[1] - if token_type_ids is not None: - 
token_type_ids = token_type_ids.clone()[:, :-1] - if self.config.sequence_parallel: - token_type_ids = token_type_ids.reshape([-1]) - token_type_ids = ScatterOp.apply(token_type_ids) - token_type_ids.stop_gradient = True - - assert self.gate is not None - if hasattr(self, "rng") and self.rng.random() < self.all_to_all_dropout: - orig_shape_2 = input.shape - if self.config.moe_multimodal_paired_experts: - assert token_type_ids is not None - input = paddle.concat( - [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 - ) - output = self.forward_experts(input) - output += self.gate.weight.sum() * 0.0 # hack for grad - output = output.reshape(orig_shape or orig_shape_2) # [e*1,c,m] - return output, None, 0 - - is_first_fwd = not framework._dygraph_tracer()._has_grad - use_async = self.shared_experts is not None - if in_auto_parallel_align_mode(): - gate_input = paddle.assign(input) - else: - gate_input = input - - use_fp8_fuse_node = ( - self.config.use_combine_before_a2a and self.config.use_fp8_fuse_node - ) - use_fp8_dispatch_a2a = self.config.use_fp8_dispatch_a2a and use_fp8_fuse_node - - with profile("fused_gate_and_dispatch"): - fp8_dispatched_handle = None - if use_fp8_dispatch_a2a: - ( - dispatched_input, - combine_weights, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - gate_prob, - fp8_dispatched_handle, - ) = self.gate_distpach_and_quant(gate_input, token_type_ids) - else: - ( - dispatched_input, - combine_weights, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - gate_prob, - ) = self.gate_and_distpach(gate_input, token_type_ids) - - # TODO(shenliang03): to fuse one kernel to optimize - if self.config.use_combine_before_a2a: - assert ( - not self.config.use_ep_comm_overlap - ), "Dont support use_ep_comm_overlap" - assert ( - moe_combine_no_weight is not None - ), "use_combine_before_a2a can only use with moe_combine_no_weight op, please install it first." 
- cw_shape = combine_weights.shape - si_shape = scatter_index.shape - scatter_index = scatter_index.reshape([-1]) - - token_combine_weights = paddle.zeros( - [cw_shape[0] * cw_shape[1]], dtype=combine_weights.dtype - ) - token_combine_weights = paddle.scatter( - token_combine_weights, - scatter_index, - combine_weights.reshape([-1]), - overwrite=False, - ) - - token_combine_weights = token_combine_weights.reshape( - [cw_shape[0], cw_shape[1], 1] - ) - token_combine_weights = AlltoAll.apply(token_combine_weights, self.group) - - if not self.config.use_ep_comm_overlap: - if use_fp8_dispatch_a2a: - shared_out = ( - self.shared_experts(input) - if self.shared_experts is not None - else None - ) - else: - with profile("moe_comm_and_shared_expert"): - if use_async: - dispatched_input, shared_out = AlltoAllAsync.apply( - dispatched_input, - input, - group=self.group, - fn=self.shared_experts, - is_first_fwd=is_first_fwd, - ) - else: - dispatched_input = AlltoAll.apply(dispatched_input, self.group) - - expert_out = ( - recompute(self.forward_experts, dispatched_input) - if self.recompute and self.training - else self.forward_experts(dispatched_input) - ) - - if self.config.use_combine_before_a2a: - token_combine_weights = token_combine_weights.clone().reshape( - expert_out.shape[:-1] + [1] - ) - expert_out = expert_out * token_combine_weights - else: - assert ( - len(dispatched_input.shape) == 4 - and dispatched_input.shape[1] == self.world_size - and dispatched_input.shape[0] == self.num_local_experts - ), ( - f"When using ep_comm_overlap, moe_gate_dispatch_permute is needed. 
" - f"Expected dispatched_input to have shape[1] == {self.world_size} " - f"and shape[0] == {self.num_local_experts}, " - f"but got shape {dispatched_input.shape}" - ) - with profile("moe_comm_and_forward_expert"): - expert_out = AlltoAllExpertOverlap.apply( - dispatched_input, - self.group, - self.num_local_experts, - self.experts, - is_first_fwd=is_first_fwd, - ) - if self.shared_experts is not None: - shared_out = self.shared_experts(input) - - with profile("moe_comm_and_calc_routerloss"): - expert_out, router_loss2 = AlltoAllAsync.apply( - expert_out, - router_loss, - combine_weights, - dispatch_mask, - gate_logits, - gate_prob, - token_type_ids, - group=self.group, - fn=self.calc_router_loss_and_logging, - is_first_fwd=is_first_fwd, - ) - - with profile("combine"): - if self.config.use_combine_before_a2a: - expert_out = expert_out.reshape([-1, hidden_size]) - - scatter_index = scatter_index.reshape(si_shape) - combined_output = moe_combine_no_weight( - expert_out, combine_weights, scatter_index, epsilon=1e-15 - ) - else: - combined_output = self.combine_expert_output( - expert_out, combine_weights, scatter_index - ) - - if self.shared_experts is not None: - combined_output += shared_out - - if orig_shape: - combined_output = combined_output.clone().reshape( - orig_shape[:-1] + [combined_output.shape[-1]] - ) - return combined_output, combine_weights, router_loss2, gate_logits diff --git a/examples/pre-training/models/moe/moe_utils.py b/examples/pre-training/models/moe/moe_utils.py deleted file mode 100644 index cd797ab4..00000000 --- a/examples/pre-training/models/moe/moe_utils.py +++ /dev/null @@ -1,229 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" moe utils for allgather dispatcher """ -import paddle -import paddle.distributed as dist -from paddle.distributed import fleet -import paddle.nn.functional as F -from paddle import nn -from paddle.autograd import PyLayer - -from models.sequence_parallel_utils import ( - AllGatherOp, - ReduceScatterOp, -) - - -class MOEGather(PyLayer): - """ - MOE Gather - """ - - @staticmethod - def forward(ctx, input_, map_): - """ - MOE Gather forward - """ - ctx.input_shape = input_.shape - ctx.map = map_ - return paddle.take_along_axis(input_, map_, 0) - - @staticmethod - def backward(ctx, grad_output): - """ - MOE Gather backward - """ - input_shape = ctx.input_shape - map_ = ctx.map - - output = paddle.zeros(input_shape, dtype=grad_output.dtype) - return paddle.put_along_axis(output, map_, grad_output, 0), None - - -class MOEScatter(PyLayer): - """ - MOE Scatter - """ - - @staticmethod - def forward(ctx, input_, map_, output_size=None): - """ - MOE Scatter forward - """ - ctx.map = map_ - - if output_size is not None: - output = paddle.zeros(output_size, dtype=input_.dtype) - else: - output = paddle.zeros_like(input_) - - return paddle.put_along_axis(output, map_, input_, 0) - - @staticmethod - def backward(ctx, grad_output): - """ - MOE Scatter backward - """ - map_ = ctx.map - return paddle.take_along_axis(grad_output, map_, 0), None - - -class AllgatherDispatcherReturn(object): - """ - MOE allgather dispatcher return value - """ - - def __init__( - self, - global_hidden_states, - dispatched_input, - combine_weights, - scatter_index, - gather_scatter_mask, - 
dispatch_mask, - tokens_per_expert, - ): - self.global_hidden_states = global_hidden_states - self.dispatched_input = dispatched_input - self.combine_weights = combine_weights - self.scatter_index = scatter_index - self.gather_scatter_mask = gather_scatter_mask - self.dispatch_mask = dispatch_mask - self.tokens_per_expert = tokens_per_expert - - -class MOEAllGatherDispatcher(nn.Layer): - """ - MOE with allgather dispatcher. - Contains two static methos. - MOEAllGatherDispatcher.token_dispatcher - MOEAllGatherDispatcher.token_combine - """ - - @staticmethod - def token_dispatcher( - hidden_states, - local_gate_logits, - top_k, - local_expert_indices, - num_moe_experts, - num_local_experts, - ): - """ - MOE token dispatcher with allgather - """ - seq_len = local_gate_logits.shape[0] - num_experts = local_gate_logits.shape[-1] - prob = F.softmax(local_gate_logits.reshape([seq_len, top_k, -1]), axis=-1) - max_prob = prob.max(-1, keepdim=True) - prob /= max_prob - prob = prob.reshape([-1, num_experts]) - - probs, scatter_index = paddle.topk(prob, top_k, axis=-1) - dispatch_mask = paddle.cumsum( - paddle.histogram(scatter_index.flatten(), bins=num_experts) - ) - - # dispatch - with paddle.no_grad(): - global_indices = AllGatherOp.apply(scatter_index) - global_local_mask = (global_indices >= local_expert_indices[0]) & ( - global_indices <= local_expert_indices[-1] - ) - local_indices = global_indices.masked_select(global_local_mask) - - global_hidden_states = AllGatherOp.apply(hidden_states) - global_probs = AllGatherOp.apply(probs) - - # get local hidden states - combine_weights = global_probs.masked_select(global_local_mask).cast( - dtype=hidden_states.dtype - ) - gather_scatter_mask = global_local_mask.nonzero()[:, 0] - gather_scatter_mask = paddle.reshape(gather_scatter_mask, shape=[-1, 1]) - gather_scatter_mask = paddle.expand( - gather_scatter_mask, shape=[-1, hidden_states.shape[-1]] - ) - local_hidden_states = MOEGather.apply(global_hidden_states, 
gather_scatter_mask) - - with paddle.no_grad(): - # The indices of local_indices that give its sorted order along dim 0. - scatter_index = paddle.argsort(local_indices, axis=0) - tokens_per_expert = paddle.bincount( - paddle.reshape(local_indices, [-1]), minlength=num_moe_experts - ) - if num_local_experts < num_moe_experts: - start = local_expert_indices[0] - end = local_expert_indices[-1] + 1 - tokens_per_expert = tokens_per_expert[start:end] - - scatter_index = paddle.reshape(scatter_index, shape=[-1, 1]) - scatter_index = paddle.expand( - scatter_index, shape=[-1, hidden_states.shape[-1]] - ) - - dispatched_input = MOEGather.apply(local_hidden_states, scatter_index) - - return AllgatherDispatcherReturn( - global_hidden_states, - dispatched_input, - combine_weights, - scatter_index, - gather_scatter_mask, - dispatch_mask, - tokens_per_expert, - ) - - @staticmethod - def token_combine( - expert_out, - shared_out, - combine_weights, - scatter_index, - gather_scatter_mask, - global_shape, - ): - """ - MOE token combine with reduce scatter - """ - expert_out = MOEScatter.apply(expert_out, scatter_index) - expert_out = expert_out * paddle.reshape(combine_weights, shape=[-1, 1]) - expert_out = MOEScatter.apply(expert_out, gather_scatter_mask, global_shape) - combine_out = expert_out + shared_out - combine_out = ReduceScatterOp.apply(combine_out) - return combine_out - - -def get_flatten_mesh(mesh): - - return dist.ProcessMesh(mesh.process_ids) - - -def get_mesh(pp_idx=0): - - mesh = fleet.auto.get_mesh() - if "pp" in mesh.dim_names: - mesh = mesh.get_mesh_with_dim("pp", pp_idx) - return mesh - - -def _reshard(tensor, mesh, placements): - - dst_tensor = dist.auto_parallel.moe_utils._dist_reshape( - tensor, tensor.shape, mesh, placements - ) - return dst_tensor diff --git a/examples/pre-training/models/moe/top2_gate_auto.py b/examples/pre-training/models/moe/top2_gate_auto.py index a8aee34d..7c50aa5a 100644 --- a/examples/pre-training/models/moe/top2_gate_auto.py +++ 
b/examples/pre-training/models/moe/top2_gate_auto.py @@ -14,21 +14,968 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -top2gate -""" - - from typing import Tuple +from functools import partial import logging +import numpy as np +import paddle from paddle import Tensor import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import nn +from paddle.utils import unique_name +from paddle.nn.clip import _squared_l2_norm +from paddle.distributed import fleet +from models.utils import global_training_logs_enabled +from models.moe.moe_utils_auto import get_mesh, get_flatten_mesh +try: + from src.utils.misc import global_training_logs +except ModuleNotFoundError: + global_training_logs = {} +try: + import moe_router_loss_ops +except ImportError: + moe_router_loss_ops = None + +try: + from custom_setup_ops import matmul_bwd +except ImportError: + matmul_bwd = None + +try: + from bincount_ops import int_bincount +except ImportError: + int_bincount = None logger = logging.getLogger(__name__) -from models.moe.top2_gate_auto_auto import TopKGateFused -from models.moe.moe_utils_auto import get_mesh, get_flatten_mesh + +class CalOrthogonalLossOptEachWeightFunctor(paddle.autograd.PyLayer): + + @staticmethod + def forward(ctx, gate_weight, moe_k, use_group, eps=1e-12): + if gate_weight.dtype != paddle.float32: + gate_weight = gate_weight.astype(paddle.float32) + ( + orthogonal_loss, + wnorm, + weight_scale, + normed_weight, + weight_matmul, + ) = moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight( + gate_weight, moe_k, use_group, eps + ) + ctx.save_for_backward( + gate_weight, wnorm, weight_scale, normed_weight, weight_matmul + ) + ctx.moe_k = moe_k + ctx.use_group = use_group + ctx.eps = eps + return orthogonal_loss + + @staticmethod + def backward(ctx, out_grad): + gate_weight, wnorm, weight_scale, normed_weight, weight_matmul = ( + ctx.saved_tensor() + ) + if gate_weight.stop_gradient: + 
return None + moe_k = ctx.moe_k + use_group = ctx.use_group + eps = ctx.eps + return moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight_grad( + out_grad, + wnorm, + weight_scale, + normed_weight, + weight_matmul, + moe_k, + use_group, + eps, + ) + + +class CalZLossFunctor(paddle.autograd.PyLayer): + + @staticmethod + def forward(ctx, logits, loss_mask=None, clip_min=1e-6): + if loss_mask is not None: + assert loss_mask.stop_gradient + loss, max_logits, safe_sumexp, logsumexp_per_token = ( + moe_router_loss_ops.cal_z_loss(logits, loss_mask, clip_min) + ) + ctx.save_for_backward( + logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token + ) + ctx.clip_min = clip_min + return loss + + @staticmethod + def backward(ctx, out_grad): + logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token = ( + ctx.saved_tensor() + ) + if logits.stop_gradient: + return None + clip_min = ctx.clip_min + return moe_router_loss_ops.cal_z_loss_grad( + out_grad, + logits, + loss_mask, + max_logits, + safe_sumexp, + logsumexp_per_token, + clip_min, + ) + + +class CalAuxLossFunctor(paddle.autograd.PyLayer): + + @staticmethod + def forward( + ctx, + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + moe_k, + clip_min=1e-6, + ): + if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: + tokens_mask = tokens_mask.astype(gate_prob.dtype) + loss, seqlen_float, ce = paddle.incubate.nn.functional.cal_aux_loss( + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + moe_k, + clip_min, + ) + ctx.save_for_backward(gate_prob, seqlen_float, ce) + ctx.num_experts = num_experts + ctx.use_group = use_group + ctx.moe_k = moe_k + return loss + + @staticmethod + def backward(ctx, out_grad): + gate_prob, seqlen_float, ce = ctx.saved_tensor() + num_experts = ctx.num_experts + use_group = ctx.use_group + moe_k = ctx.moe_k + return paddle.incubate.nn.functional.cal_aux_loss_grad( + out_grad, 
gate_prob, seqlen_float, ce, num_experts, use_group, moe_k + ) + + +def cal_orthogonal_loss_opt_each_weight_func( + weight, moe_k, use_group, eps, training=True +): + weight = weight.transpose([1, 0]).contiguous() + wnorm = weight.norm(axis=1) + weight = weight / paddle.maximum(wnorm, eps).unsqueeze(1) + + if use_group: + weight = weight.reshape([moe_k, -1, weight.shape[1]]) + eye_matrix = paddle.eye(weight.shape[1], dtype=weight.dtype).unsqueeze(0) + else: + eye_matrix = paddle.eye(weight.shape[0], dtype=weight.dtype) + + weight_matmul = paddle.matmul(weight, weight, transpose_y=True) + + orthogonal_loss = weight_matmul - eye_matrix + orthogonal_loss = _squared_l2_norm(orthogonal_loss) / orthogonal_loss.size + return orthogonal_loss + + +def cal_z_loss_func(logits, loss_mask): + if loss_mask is not None: + loss_mask = loss_mask.astype(logits.dtype) + l_zloss = (logits.logsumexp(1).square() * loss_mask).sum() / paddle.clip( + loss_mask.sum(), min=1e-6 + ) + else: + l_zloss = logits.logsumexp(1).square().mean() + return l_zloss + + +def cal_aux_loss_func( + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + moe_k, + global_aux_loss=False, + rank=None, + group=None, +): + if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: + tokens_mask = tokens_mask.astype(gate_prob.dtype) + + scale = None + if dispatch_tokens_mask is not None: + seqlen_float = dispatch_tokens_mask.astype(gate_prob.dtype).sum() + if ( + tokens_mask is not None + and gate_prob.shape[0] != dispatch_tokens_mask.shape[0] + ): + scale = seqlen_float / paddle.clip(tokens_mask.sum(), min=1e-6) + elif tokens_mask is not None: + seqlen_float = tokens_mask.sum() + else: + seqlen_float = gate_prob.numel().astype(gate_prob.dtype) / num_experts + seqlen_float = paddle.clip(seqlen_float, min=1e-6) + + if len(dispatch_mask.shape) == 2: + dispatch_mask = dispatch_mask.sum(0) + ce = dispatch_mask.astype(gate_prob.dtype).detach() / seqlen_float + me = 
paddle.sum(gate_prob, axis=0) / seqlen_float + if global_aux_loss: + me_list, ce_list = [], [] + dist.all_gather(me_list, me, group=group) + dist.all_gather(ce_list, ce, group=group) + + me_list[rank] = me + ce_list[rank] = ce + me = paddle.stack(me_list).mean(0) + ce = paddle.stack(ce_list).mean(0) + + l_aux = paddle.sum(me * ce) * num_experts + if use_group: + l_aux = l_aux / moe_k + + if scale is not None: + l_aux = l_aux + (scale - 1) * l_aux.detach() + + return l_aux + + +def masked_fill(x, mask, value): + + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +@paddle.no_grad() +def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters: int = 10): + + n, _ = M.shape + P = F.softmax(-M / lam) + u = paddle.zeros(n, "float32") + for _ in range(max_iters): + if (u - P.sum(1)).abs().max() < epsilon: + break + u = P.sum(1) + P *= (r / (u + 1e-8)).reshape((-1, 1)) + P *= (c / (P.sum(0) + 1e-8)).reshape((1, -1)) + P = paddle.where(~P.isnan(), P, paddle.zeros_like(P)) + return P, _ + + +def cast_if_needed(x, dtype): + + return x.cast(dtype) if x.dtype != dtype else x + + +class FusedGateDetachMatmul(paddle.autograd.PyLayer): + + @staticmethod + def forward(ctx, x, w): + + ctx.dtype = paddle.float32 + ctx.save_for_backward(x, w) + return F.linear(cast_if_needed(x, ctx.dtype), cast_if_needed(w, ctx.dtype)) + + @staticmethod + def backward(ctx, y_grad): + + x, w = ctx.saved_tensor() + assert ctx.dtype == y_grad.dtype, "dtype not match" + x_g, w_g = matmul_bwd( + cast_if_needed(x, ctx.dtype), + cast_if_needed(w, ctx.dtype), + y_grad, + False, + False, + ) + return cast_if_needed(x_g, x.dtype), cast_if_needed(w_g, w.dtype) + + +def gate_detach_matmul(x, weight, use_fuse, use_fake_gate=False): + + if use_fuse: + score = FusedGateDetachMatmul.apply(x, weight) + else: + x = cast_if_needed(x, paddle.float32) + score = F.linear(x, weight) + + if use_fake_gate: + score = paddle.randn(score.shape).astype(score.dtype) + score - score + 
return score + + +class Top2Gate(nn.Layer): + + def __init__(self, config, layer_idx: int, group, gate_weight=None) -> None: + + super().__init__() + self.config = config + self.fuse_gate_detach_matmul = config.fuse_gate_detach_matmul + if self.fuse_gate_detach_matmul: + assert matmul_bwd is not None, "matmul_bwd is not supported" + + self.use_fake_gate = config.use_fake_gate + if self.use_fake_gate: + logging.warning( + "You are use fake_gate, which is just for test, not for real training." + ) + + self.model_dim = config.hidden_size + self.num_experts = config.moe_num_experts + self.num_experts_tensor = ( + sum(config.moe_num_experts) + if config.multimodel_experts + else config.moe_num_experts + ) + + self.cap = config.moe_capacity + self.group = group + + self.layer_idx = layer_idx + self.global_aux_loss = config.global_aux_loss + if self.global_aux_loss: + self.rank = dist.get_rank(self.group) + + self.sinkhorn_2gate = config.sinkhorn_2gate + self.sinkhorn_temp = config.sinkhorn_temp + self.use_token_type_bias = config.moe_use_token_type_bias + self.use_correction_bias = config.moe_use_aux_free + + if config.moe_gate_act == "softmax": + self.act = partial(F.softmax, axis=-1) + elif config.moe_gate_act == "sigmoid": + self.act = F.sigmoid + else: + raise ValueError(f"{config.moe_gate_act} is not supported.") + self.no_jitter = True + self.expert_drop = False + self.eye_matrix = None + self.eye_matrix_size = None + self.enable_logging = config.moe_logging + self.norm_gate_logits = config.moe_norm_gate_logits + self.one = paddle.ones([], dtype="float32") + + self.moe_aux_loss_lambda = paddle.to_tensor( + config.moe_aux_loss_lambda, dtype="float32" + ) + self.moe_z_loss_lambda = paddle.to_tensor( + config.moe_z_loss_lambda, dtype="float32" + ) + self.moe_orthogonal_loss_lambda = paddle.to_tensor( + config.moe_orthogonal_loss_lambda, dtype="float32" + ) + if self.moe_aux_loss_lambda.ndim == 0: + self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.unsqueeze(0) + if 
self.moe_z_loss_lambda.ndim == 0: + self.moe_z_loss_lambda = self.moe_z_loss_lambda.unsqueeze(0) + if self.moe_orthogonal_loss_lambda.ndim == 0: + self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.unsqueeze( + 0 + ) + + self.experts_type_ids = None + if config.moe_orthogonal_loss_lambda: + if hasattr(fleet.fleet, "_user_defined_strategy"): + strategy = fleet.fleet._user_defined_strategy + sharding_configs = strategy.hybrid_configs["sharding_configs"] + pp_config = strategy.hybrid_configs["pp_configs"] + assert ( + not sharding_configs.comm_overlap + and not pp_config.sharding_comm_overlap + ), "orthogonal loss will cause twice gradient accumulate, will break pp/sharding overlap" + + self.eps = paddle.to_tensor([1e-12], dtype="float32") + if config.multimodel_experts: + if config.moe_use_hard_gate: + self.num_experts_list = [] + self.experts_type_mask = [] + experts_ids = paddle.zeros( + [sum(self.num_experts)], dtype="int64" + ).reshape([config.moe_world_size, -1]) + offset = 0 + for i, expert_num in enumerate(self.num_experts): + experts_ids[ + :, offset : offset + expert_num // config.moe_world_size + ] = i + offset += expert_num // config.moe_world_size + self.experts_type_ids = experts_ids.reshape([-1]) + logger.info( + f"use moe_use_hard_gate, experts_ids: {self.experts_type_ids}" + ) + for i, expert_num in enumerate(self.num_experts): + self.experts_type_mask.append( + self.experts_type_ids == i, + ) + self.num_experts_list.append(expert_num) + else: + assert ( + not config.moe_group_experts + ), "group_experts must use hard_gate when multimodel_experts is True" + else: + self.num_experts_list = [self.num_experts] + if gate_weight is not None: + self.weight = gate_weight + assert ( + not self.config.moe_use_token_type_bias + ), "gate_weights is from outside, token_type_bias can't be used" + logger.info("moe use gate_weight from outside") + self._cast_to_low_precision = False + self._cast_to_low_precison = False + else: + 
self._create_gate_parameter() + logger.info( + f"{config.moe_gate}: w/ capacity: {self.cap} experts:{self.num_experts} " + f"use_token_type_bias:{self.use_token_type_bias} gate_act:{config.moe_gate_act} " + f"norm_gate_logits={self.norm_gate_logits} use_correction_bias={self.use_correction_bias}" + ) + + def _create_gate_parameter(self): + + if self.config.multimodel_experts: + self.moe_z_loss_lambda = self.moe_z_loss_lambda.expand( + len(self.num_experts) + ) + self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.expand( + len(self.num_experts) + ) + self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.expand( + len(self.num_experts) + ) + + for i, num_experts in enumerate(self.num_experts): + if i == 1: + with paddle.utils.unique_name.guard(f"mm_gate_{self.layer_idx}_"): + p = self.create_parameter( + shape=[self.model_dim, num_experts], + dtype="float32", + attr=paddle.ParamAttr( + name=unique_name.generate("moe_gate") + ), + ) + else: + p = self.create_parameter( + shape=[self.model_dim, num_experts], + dtype="float32", + attr=paddle.ParamAttr(name=unique_name.generate("moe_gate")), + ) + p.expert_type = f"expert_type_{i}" + self.add_parameter( + ("weight" if i == 0 else f"weight_{i}"), + p, + ) + else: + self.weight = self.create_parameter( + shape=[self.model_dim, self.num_experts], + dtype="float32", + attr=paddle.ParamAttr(name=unique_name.generate("moe_gate")), + ) + logger.info(f"moe-Gate, {self.weight}") + + if self.use_token_type_bias: + if self.config.multimodel_experts: + assert ( + not self.config.moe_use_hard_gate + ), "multimodel_experts with hard_gate is not support token_type_bias." 
+ num_experts = ( + sum(self.num_experts) + if self.config.multimodel_experts + else self.num_experts + ) + bias_type_num = ( + len(self.num_experts) if self.config.multimodel_experts else 1 + ) + self.bias = self.create_parameter( + shape=[bias_type_num, num_experts], + dtype="float32", + attr=paddle.ParamAttr( + name=unique_name.generate("moe_gate_bias"), + initializer=paddle.nn.initializer.Assign( + np.zeros([bias_type_num, num_experts]) + ), + ), + ) + logger.info(f"using token type bias, bias: {self.bias},") + self._cast_to_low_precision = False + self._cast_to_low_precison = False + + def get_gate_weight(self, transform_weight): + if not self.config.multimodel_experts: + return self.weight + if not transform_weight: + return paddle.concat( + [ + getattr(self, "weight" if i == 0 else f"weight_{i}") + for i in range(len(self.num_experts)) + ], + -1, + ) + weight = paddle.zeros( + [ + self.model_dim, + self.config.moe_world_size, + sum(self.num_experts) // self.config.moe_world_size, + ], + dtype="float32", + ) + offset = 0 + for i, num_experts in enumerate(self.num_experts): + weight[ + :, :, offset : offset + num_experts // self.config.moe_world_size + ] = getattr(self, "weight" if i == 0 else f"weight_{i}").reshape( + [self.model_dim, self.config.moe_world_size, -1] + ) + offset += num_experts // self.config.moe_world_size + weight = weight.reshape([self.model_dim, -1]) + + return weight + + def forward( + self, + input: Tensor, + token_type_ids: Tensor = None, + transform_weight: bool = True, + correction_bias: Tensor = None, + ) -> Tuple[Tensor, Tensor, Tensor]: + + orig_dtype = input.dtype + weight = self.get_gate_weight(transform_weight) + with paddle.amp.auto_cast(False): + + logits = gate_detach_matmul( + input, weight, self.fuse_gate_detach_matmul, self.use_fake_gate + ) + + if self.use_token_type_bias: + assert token_type_ids is not None + bias = self.bias[token_type_ids] + logits = logits + bias + ( + capacity, + dispatch_mask, + combine_weights, + 
scatter_index, + l_aux, + l_zloss, + ) = self.top2_gating(logits, correction_bias=correction_bias) + orthogonal_loss = self._cal_orthogonal_loss() + router_loss = ( + l_aux * self.moe_aux_loss_lambda + + l_zloss * self.moe_z_loss_lambda + + orthogonal_loss * self.moe_orthogonal_loss_lambda + ) + router_loss.stop_gradient = False + if self.enable_logging and global_training_logs_enabled(): + _log = { + f"aux_loss_layer_{self.layer_idx}": l_aux.item(), + f"orthogonal_loss_layer_{self.layer_idx}": orthogonal_loss.item(), + f"zloss_layer_{self.layer_idx}": l_zloss.item(), + } + global_training_logs.update( + **_log, + **{ + k.replace(f"_layer_{self.layer_idx}", ""): v + for k, v in _log.items() + }, + ) + if self.use_token_type_bias: + _bias_log = { + f"token_type_bias_layer_{self.layer_idx}_expert{i}_gap": v + for i, v in enumerate((self.bias[0] - self.bias[1]).numpy()) + } + global_training_logs.update(**_bias_log) + + combine_weights = combine_weights.cast(orig_dtype) + return ( + capacity, + dispatch_mask, + combine_weights, + scatter_index, + router_loss, + logits, + ) + + def get_capacity(self, num_tokens, cap_factor=None): + + num_experts = ( + sum(self.num_experts) + if self.config.multimodel_experts + else self.num_experts + ) + if cap_factor is not None: + cap = cap_factor + else: + if self.training: + cap = self.cap[0] + elif num_tokens < num_experts: + cap = self.cap[2] + else: + cap = self.cap[1] + capacity = int(cap * num_tokens // num_experts) + assert ( + capacity > 0 + ), f"requires capacity to >= 0. 
cap={cap}, num_tokens={num_tokens}" + return capacity + + def top2_gating(self, logits, cap=None, correction_bias=None): + + l_zloss = self._cal_z_loss(logits) + gates = self.act(logits) + + assert logits.ndim == 2, logits.shape + num_tokens = gates.shape[0] + num_experts = gates.shape[1] + capacity = self.get_capacity(logits.shape[0], cap) + + score_for_argmax = ( + gates + correction_bias.unsqueeze(0) + if correction_bias is not None + else gates + ) + indices1_s = paddle.argmax(score_for_argmax, axis=1) + mask1 = F.one_hot(indices1_s, num_classes=num_experts).cast(paddle.int64) + + l_aux = self._cal_aux_loss(gates, mask1.sum(axis=0), self.num_experts_tensor) + + if self.training and not self.no_jitter: + gumbels = ( + -paddle.empty_like( + logits, + ) + .exponential_() + .log() + ) + logits_w_noise = logits + gumbels + else: + logits_w_noise = logits + + logits_except1 = masked_fill( + logits_w_noise, mask1.cast(paddle.bool), float("-inf") + ) + score_for_argmax = ( + self.act(logits_except1) + correction_bias.unsqueeze(0) + if correction_bias is not None + else logits_except1 + ) + indices2_s_original = paddle.argmax(score_for_argmax, axis=1) + + if self.training and self.sinkhorn_2gate: + r = paddle.ones(num_tokens, "float32") / num_tokens + + c = capacity - mask1.cast("float32").sum(0) + c = paddle.maximum(c, paddle.zeros_like(c)) + c /= c.sum() + + pi, _ = compute_optimal_transport( + -logits_except1.cast("float32").detach(), r, c, lam=self.sinkhorn_temp + ) + pi = masked_fill(pi, mask1.cast(paddle.bool), float("-inf")) + indices2_s = paddle.argmax(pi, axis=1) + else: + indices2_s = indices2_s_original + + if self.enable_logging and global_training_logs_enabled(): + global_training_logs.update( + **{ + "redispatch_acc": (indices2_s_original == indices2_s) + .cast(paddle.float32) + .mean() + .item(), + f"redispatch_acc_layer_{self.layer_idx}": ( + indices2_s_original == indices2_s + ) + .cast(paddle.float32) + .mean() + .item(), + } + ) + + mask2 = 
F.one_hot(indices2_s, num_classes=self.num_experts).cast(paddle.int64) + + locations1 = paddle.cumsum(mask1, axis=0) - 1 + locations2 = paddle.cumsum(mask2, axis=0) - 1 + locations2 += paddle.sum(mask1, axis=0, keepdim=True) + + mask1 *= (locations1 < capacity).cast(paddle.int64) + mask2 *= (locations2 < capacity).cast(paddle.int64) + + locations1_s = paddle.sum(locations1 * mask1, axis=1) + locations2_s = paddle.sum(locations2 * mask2, axis=1) + + mask1_float = mask1.cast(paddle.float32) + mask2_float = mask2.cast(paddle.float32) + gates1_s = (gates * mask1_float).sum(axis=-1) + gates2_s = (gates * mask2_float).sum(axis=-1) + + if self.norm_gate_logits: + denom_s = gates1_s + gates2_s + denom_s = paddle.clip(denom_s, min=1e-6) + gates1_s /= denom_s + gates2_s /= denom_s + if self.training and self.expert_drop: + gates2_s = paddle.where( + 2 * gates2_s < paddle.rand_like(gates2_s), + paddle.zeros_like(gates2_s), + gates2_s, + ) + + gates1 = gates1_s.unsqueeze(1) * mask1_float + gates2 = gates2_s.unsqueeze(1) * mask2_float + + expert1_index = paddle.argmax(gates1, -1) + combine1_weight = paddle.max(gates1, -1, keepdim=True) + scatter1_index = expert1_index * capacity + locations1_s + scatter1_index = scatter1_index.cast("int64") + dispatch1_mask = combine1_weight.cast(paddle.bool).detach() + + expert2_index = paddle.argmax(gates2, -1) + combine2_weight = paddle.max(gates2, -1, keepdim=True) + scatter2_index = expert2_index * capacity + locations2_s + scatter2_index = scatter2_index.cast("int64") + dispatch2_mask = combine2_weight.cast(paddle.bool).detach() + if self.enable_logging and global_training_logs_enabled(): + global_training_logs.update( + **{ + "top1_gate": ( + combine1_weight.sum() + / (dispatch1_mask.cast("float32").sum() + 1e-9) + ).item(), + "top2_gate": ( + combine2_weight.sum() + / (dispatch2_mask.cast("float32").sum() + 1e-9) + ).item(), + f"top1_gate_layer_{self.layer_idx}": ( + combine1_weight.sum() + / (dispatch1_mask.cast("float32").sum() + 
1e-9) + ).item(), + f"top2_gate_layer_{self.layer_idx}": ( + combine2_weight.sum() + / (dispatch2_mask.cast("float32").sum() + 1e-9) + ).item(), + } + ) + + seqlen = logits.shape[0] + top1_gate_experts_per_token = ( + paddle.cast(dispatch1_mask, dtype="float32").sum() / seqlen + ) + top2_gate_experts_per_token = ( + paddle.cast(dispatch2_mask, dtype="float32").sum() / seqlen + ) + leakage_experts_per_token = ( + paddle.cast( + (~dispatch1_mask) & (~dispatch2_mask), dtype="float32" + ).sum() + / seqlen + ) + + experts_per_token = ( + top1_gate_experts_per_token + top2_gate_experts_per_token + ) + _log = { + f"experts_per_token_layer_{self.layer_idx}": experts_per_token.item(), + f"top1_experts_per_token_layer_{self.layer_idx}": top1_gate_experts_per_token.item(), + f"top2_experts_per_token_layer_{self.layer_idx}": top2_gate_experts_per_token.item(), + f"leakage_experts_per_token_layer_{self.layer_idx}": leakage_experts_per_token.item(), + } + global_training_logs.update( + **_log, + **{ + k.replace(f"_layer_{self.layer_idx}", ""): v + for k, v in _log.items() + }, + ) + + return ( + capacity, + paddle.concat((dispatch1_mask, dispatch2_mask), 1), + paddle.concat((combine1_weight, combine2_weight), 1), + paddle.stack((scatter1_index, scatter2_index), 1), + l_aux, + l_zloss, + ) + + def _cal_aux_loss( + self, + gate_prob, + dispatch_mask, + num_experts=None, + use_group=None, + tokens_mask=None, + dispatch_tokens_mask=None, + ): + + if self.act is F.sigmoid: + gate_prob = gate_prob / gate_prob.sum(-1, keepdim=True) + + if self.use_correction_bias: + if tokens_mask is not None: + gate_prob_this_modality = gate_prob[tokens_mask.astype("bool")] + if gate_prob_this_modality.shape[0]: + _, top_idx = gate_prob_this_modality.topk( + k=self.config.moe_k, axis=-1 + ) + if int_bincount is not None: + dispatch_mask = int_bincount( + top_idx, 0, gate_prob.shape[-1], paddle.int64 + ) + else: + mask = paddle.zeros_like( + gate_prob_this_modality + ).put_along_axis(top_idx, 
paddle.to_tensor(1.0), axis=1) + dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) + else: + dispatch_mask = paddle.zeros(gate_prob.shape[-1], dtype="int64") + dist.stream.all_reduce( + dispatch_mask, + group=self.group, + use_calc_stream=True, + ) + else: + _, top_idx = gate_prob.topk(k=self.config.moe_k, axis=-1) + if int_bincount is not None: + dispatch_mask = int_bincount( + top_idx, 0, gate_prob.shape[-1], paddle.int64 + ) + else: + mask = paddle.zeros_like(gate_prob).put_along_axis( + top_idx, paddle.to_tensor(1.0), axis=1 + ) + dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) + + if num_experts is None: + num_experts = self.num_experts_tensor + if use_group is None: + use_group = self.config.moe_group_experts + + return cal_aux_loss_func( + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + self.config.moe_k, + self.global_aux_loss, + self.rank if self.global_aux_loss else None, + self.group if self.global_aux_loss else None, + ) + + def _cal_z_loss(self, logits, loss_mask=None): + + if ( + (moe_router_loss_ops is not None) + and (loss_mask is None or len(loss_mask.shape) == 1) + and (logits.dtype == paddle.float32) + ): + return CalZLossFunctor.apply(logits, loss_mask) + else: + return cal_z_loss_func(logits, loss_mask) + + def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): + + if weight.dtype != paddle.float32: + weight = weight.astype(paddle.float32) + + if (moe_router_loss_ops is not None) and (weight.dtype == paddle.float32): + return CalOrthogonalLossOptEachWeightFunctor.apply( + weight, self.config.moe_k, use_group + ) + else: + return cal_orthogonal_loss_opt_each_weight_func( + weight, + self.config.moe_k, + use_group, + self.eps, + self.training, + ) + + def _cal_orthogonal_loss(self, weight_id=None, use_group=None): + + if use_group is None: + use_group = ( + self.config.moe_group_experts and self.config.moe_group_orthogonal_loss + ) + + if weight_id is not None: + 
if weight_id == 0: + w_ = self.weight + else: + assert self.config.multimodel_experts + w_ = getattr(self, f"weight_{weight_id}") + return self._cal_orthogonal_loss_opt_each_weight(w_, use_group) + + orthogonal_loss = self._cal_orthogonal_loss_opt_each_weight( + self.weight, use_group + ) + if self.config.multimodel_experts: + for i in range(1, len(self.config.moe_num_experts)): + w_ = getattr(self, f"weight_{i}") + orthogonal_loss += self._cal_orthogonal_loss_opt_each_weight( + w_, use_group=False + ) + return orthogonal_loss + + +class TopKGateFused(Top2Gate): + + def forward( + self, + input: Tensor, + token_type_ids=None, + transform_weight=True, + ) -> Tuple[Tensor, Tensor, Tensor]: + + capacity = self.get_capacity(input.shape[0]) + weight = self.get_gate_weight(transform_weight) + with paddle.amp.auto_cast(False): + + logits = gate_detach_matmul( + input, weight, self.fuse_gate_detach_matmul, self.use_fake_gate + ) + if self.use_token_type_bias: + assert token_type_ids is not None + assert ( + token_type_ids.max() < self.bias.shape[0] + ), f"token_type_ids {token_type_ids.max()} >= bias shape {self.bias.shape[0]}" + bias = self.bias[token_type_ids] + logits = logits + bias + orthogonal_loss = None + router_loss = paddle.zeros([1], dtype="float32") + router_loss.stop_gradient = False + if ( + self.enable_logging + and global_training_logs_enabled() + and orthogonal_loss is not None + ): + _log = { + f"orthogonal_loss_layer_{self.layer_idx}": orthogonal_loss.item(), + } + global_training_logs.update( + **_log, + **{ + k.replace(f"_layer_{self.layer_idx}", ""): v + for k, v in _log.items() + }, + ) + + return logits, capacity, router_loss class TopKGateFusedAuto(TopKGateFused): @@ -45,7 +992,7 @@ def forward( self, input: Tensor, token_type_ids=None, - ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: # type: ignore + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """ Args: input: paddle.Tensor, hidden-states of layer @@ -61,12 +1008,11 @@ def forward( ) if 
self.training: cap = self.cap[0] - elif input.shape[0] < num_experts: # seqlen < num_expert + elif input.shape[0] < num_experts: cap = self.cap[2] else: cap = self.cap[1] num_tokens = input.shape[0] - # capacity = 2S/E global_capacity = int(cap * num_tokens // num_experts) local_num_tokens = input._local_shape[0] local_capacity = int(cap * local_num_tokens // num_experts) diff --git a/examples/pre-training/models/moe/top2_gate_auto_auto.py b/examples/pre-training/models/moe/top2_gate_auto_auto.py deleted file mode 100644 index 6ce094d2..00000000 --- a/examples/pre-training/models/moe/top2_gate_auto_auto.py +++ /dev/null @@ -1,1036 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import Tuple -from functools import partial -import logging -import numpy as np -import paddle -from paddle import Tensor -import paddle.distributed as dist -import paddle.nn.functional as F -from paddle import nn -from paddle.utils import unique_name -from paddle.nn.clip import _squared_l2_norm -from paddle.distributed import fleet -from paddleformers.utils.tools import get_env_device -from models.utils import global_training_logs_enabled - -try: - from src.utils.misc import global_training_logs -except ModuleNotFoundError: - global_training_logs = {} -try: - import moe_router_loss_ops -except ImportError: - moe_router_loss_ops = None - -try: - from custom_setup_ops import matmul_bwd -except ImportError: - matmul_bwd = None - -try: - from bincount_ops import int_bincount -except ImportError: - int_bincount = None - -logger = logging.getLogger(__name__) - - -class CalOrthogonalLossOptEachWeightFunctor(paddle.autograd.PyLayer): - - @staticmethod - def forward(ctx, gate_weight, moe_k, use_group, eps=1e-12): - if gate_weight.dtype != paddle.float32: - gate_weight = gate_weight.astype(paddle.float32) - ( - orthogonal_loss, - wnorm, - weight_scale, - normed_weight, - weight_matmul, - ) = moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight( - gate_weight, moe_k, use_group, eps - ) - ctx.save_for_backward( - gate_weight, wnorm, weight_scale, normed_weight, weight_matmul - ) - ctx.moe_k = moe_k - ctx.use_group = use_group - ctx.eps = eps - return orthogonal_loss - - @staticmethod - def backward(ctx, out_grad): - gate_weight, wnorm, weight_scale, normed_weight, weight_matmul = ( - ctx.saved_tensor() - ) - if gate_weight.stop_gradient: - return None - moe_k = ctx.moe_k - use_group = ctx.use_group - eps = ctx.eps - return moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight_grad( - out_grad, - wnorm, - weight_scale, - normed_weight, - weight_matmul, - moe_k, - use_group, - eps, - ) - - -class CalZLossFunctor(paddle.autograd.PyLayer): - - @staticmethod 
- def forward(ctx, logits, loss_mask=None, clip_min=1e-6): - if loss_mask is not None: - assert loss_mask.stop_gradient - loss, max_logits, safe_sumexp, logsumexp_per_token = ( - moe_router_loss_ops.cal_z_loss(logits, loss_mask, clip_min) - ) - ctx.save_for_backward( - logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token - ) - ctx.clip_min = clip_min - return loss - - @staticmethod - def backward(ctx, out_grad): - logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token = ( - ctx.saved_tensor() - ) - if logits.stop_gradient: - return None - clip_min = ctx.clip_min - return moe_router_loss_ops.cal_z_loss_grad( - out_grad, - logits, - loss_mask, - max_logits, - safe_sumexp, - logsumexp_per_token, - clip_min, - ) - - -class CalAuxLossFunctor(paddle.autograd.PyLayer): - - @staticmethod - def forward( - ctx, - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - moe_k, - clip_min=1e-6, - ): - if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: - tokens_mask = tokens_mask.astype(gate_prob.dtype) - loss, seqlen_float, ce = paddle.incubate.nn.functional.cal_aux_loss( - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - moe_k, - clip_min, - ) - ctx.save_for_backward(gate_prob, seqlen_float, ce) - ctx.num_experts = num_experts - ctx.use_group = use_group - ctx.moe_k = moe_k - return loss - - @staticmethod - def backward(ctx, out_grad): - gate_prob, seqlen_float, ce = ctx.saved_tensor() - num_experts = ctx.num_experts - use_group = ctx.use_group - moe_k = ctx.moe_k - return paddle.incubate.nn.functional.cal_aux_loss_grad( - out_grad, gate_prob, seqlen_float, ce, num_experts, use_group, moe_k - ) - - -def cal_orthogonal_loss_opt_each_weight_func( - weight, moe_k, use_group, eps, xpu_matmul=None, training=True -): - weight = weight.transpose([1, 0]).contiguous() # transpose weight here - wnorm = weight.norm(axis=1) - weight = weight / 
paddle.maximum(wnorm, eps).unsqueeze(1) - - if use_group: - weight = weight.reshape([moe_k, -1, weight.shape[1]]) # [K, E/K, H] - eye_matrix = paddle.eye(weight.shape[1], dtype=weight.dtype).unsqueeze(0) - else: - eye_matrix = paddle.eye(weight.shape[0], dtype=weight.dtype) - - if get_env_device() == "xpu" and xpu_matmul is not None: - weight_matmul = xpu_matmul(weight, weight, transpose_y=True, training=training) - else: - weight_matmul = paddle.matmul(weight, weight, transpose_y=True) - - orthogonal_loss = weight_matmul - eye_matrix - orthogonal_loss = _squared_l2_norm(orthogonal_loss) / orthogonal_loss.size - return orthogonal_loss - - -def cal_z_loss_func(logits, loss_mask): - if loss_mask is not None: - loss_mask = loss_mask.astype(logits.dtype) - l_zloss = (logits.logsumexp(1).square() * loss_mask).sum() / paddle.clip( - loss_mask.sum(), min=1e-6 - ) - else: - l_zloss = logits.logsumexp(1).square().mean() - return l_zloss - - -def cal_aux_loss_func( - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - moe_k, - global_aux_loss=False, - rank=None, - group=None, -): - if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: - tokens_mask = tokens_mask.astype(gate_prob.dtype) - - scale = None - if dispatch_tokens_mask is not None: - seqlen_float = dispatch_tokens_mask.astype(gate_prob.dtype).sum() - if ( - tokens_mask is not None - and gate_prob.shape[0] != dispatch_tokens_mask.shape[0] - ): - scale = seqlen_float / paddle.clip(tokens_mask.sum(), min=1e-6) - elif tokens_mask is not None: - seqlen_float = tokens_mask.sum() - else: - seqlen_float = gate_prob.numel().astype(gate_prob.dtype) / num_experts - seqlen_float = paddle.clip(seqlen_float, min=1e-6) - - if len(dispatch_mask.shape) == 2: - dispatch_mask = dispatch_mask.sum(0) - ce = dispatch_mask.astype(gate_prob.dtype).detach() / seqlen_float - me = paddle.sum(gate_prob, axis=0) / seqlen_float - # me = paddle.mean(gate_prob, axis=0) - # ce = 
paddle.mean(dispatch_mask.cast("float32"), axis=0) - if global_aux_loss: - me_list, ce_list = [], [] - dist.all_gather(me_list, me, group=group) - dist.all_gather(ce_list, ce, group=group) - - me_list[rank] = me - ce_list[rank] = ce - me = paddle.stack(me_list).mean(0) - ce = paddle.stack(ce_list).mean(0) - - l_aux = paddle.sum(me * ce) * num_experts - if use_group: - l_aux = l_aux / moe_k - - if scale is not None: - l_aux = l_aux + (scale - 1) * l_aux.detach() - - return l_aux - - -def masked_fill(x, mask, value): - - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - -@paddle.no_grad() -def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters: int = 10): - - n, _ = M.shape - P = F.softmax(-M / lam) - u = paddle.zeros(n, "float32") - for _ in range(max_iters): - if (u - P.sum(1)).abs().max() < epsilon: - break - u = P.sum(1) - P *= (r / (u + 1e-8)).reshape((-1, 1)) - P *= (c / (P.sum(0) + 1e-8)).reshape((1, -1)) - P = paddle.where(~P.isnan(), P, paddle.zeros_like(P)) - return P, _ - - -def cast_if_needed(x, dtype): - - return x.cast(dtype) if x.dtype != dtype else x - - -class FusedGateDetachMatmul(paddle.autograd.PyLayer): - - @staticmethod - def forward(ctx, x, w): - - ctx.dtype = paddle.float32 - ctx.save_for_backward(x, w) - return F.linear(cast_if_needed(x, ctx.dtype), cast_if_needed(w, ctx.dtype)) - - @staticmethod - def backward(ctx, y_grad): - - x, w = ctx.saved_tensor() - assert ctx.dtype == y_grad.dtype, "dtype not match" - x_g, w_g = matmul_bwd( - cast_if_needed(x, ctx.dtype), - cast_if_needed(w, ctx.dtype), - y_grad, - False, - False, - ) - return cast_if_needed(x_g, x.dtype), cast_if_needed(w_g, w.dtype) - - -def gate_detach_matmul(x, weight, use_fuse, use_fake_gate=False): - - if use_fuse: - score = FusedGateDetachMatmul.apply(x, weight) - else: - x = cast_if_needed(x, paddle.float32) - score = F.linear(x, weight) - - if use_fake_gate: - score = paddle.randn(score.shape).astype(score.dtype) + score - score - 
return score - - -class Top2Gate(nn.Layer): - - def __init__(self, config, layer_idx: int, group, gate_weight=None) -> None: - - super().__init__() - if get_env_device() == "xpu": - try: - from paddle_xpu.layers.nn import xpu_matmul - - self.xpu_matmul = xpu_matmul() - except ImportError: - self.xpu_matmul = None - else: - self.xpu_matmul = None - self.config = config - self.fuse_gate_detach_matmul = config.fuse_gate_detach_matmul - if self.fuse_gate_detach_matmul: - assert matmul_bwd is not None, "matmul_bwd is not supported" - - self.use_fake_gate = config.use_fake_gate - if self.use_fake_gate: - logging.warning( - "You are use fake_gate, which is just for test, not for real training." - ) - - self.model_dim = config.hidden_size - self.num_experts = config.moe_num_experts - self.num_experts_tensor = ( - sum(config.moe_num_experts) - if config.multimodel_experts - else config.moe_num_experts - ) # paddle.to_tensor(config.moe_num_experts, dtype="float32").sum() - - self.cap = config.moe_capacity - self.group = group - - self.layer_idx = layer_idx - self.global_aux_loss = config.global_aux_loss - if self.global_aux_loss: - self.rank = dist.get_rank(self.group) - - self.sinkhorn_2gate = config.sinkhorn_2gate - self.sinkhorn_temp = config.sinkhorn_temp - self.use_token_type_bias = config.moe_use_token_type_bias - self.use_correction_bias = config.moe_use_aux_free - - if config.moe_gate_act == "softmax": - self.act = partial(F.softmax, axis=-1) # [S,E] - elif config.moe_gate_act == "sigmoid": - self.act = F.sigmoid - else: - raise ValueError(f"{config.moe_gate_act} is not supported.") - self.no_jitter = True - self.expert_drop = False - self.eye_matrix = None - self.eye_matrix_size = None - self.enable_logging = config.moe_logging - self.norm_gate_logits = config.moe_norm_gate_logits - self.one = paddle.ones([], dtype="float32") - - self.moe_aux_loss_lambda = paddle.to_tensor( - config.moe_aux_loss_lambda, dtype="float32" - ) - self.moe_z_loss_lambda = 
paddle.to_tensor( - config.moe_z_loss_lambda, dtype="float32" - ) - self.moe_orthogonal_loss_lambda = paddle.to_tensor( - config.moe_orthogonal_loss_lambda, dtype="float32" - ) - if self.moe_aux_loss_lambda.ndim == 0: - self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.unsqueeze(0) - if self.moe_z_loss_lambda.ndim == 0: - self.moe_z_loss_lambda = self.moe_z_loss_lambda.unsqueeze(0) - if self.moe_orthogonal_loss_lambda.ndim == 0: - self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.unsqueeze( - 0 - ) - - self.experts_type_ids = None - if config.moe_orthogonal_loss_lambda: - if hasattr(fleet.fleet, "_user_defined_strategy"): - strategy = fleet.fleet._user_defined_strategy - sharding_configs = strategy.hybrid_configs["sharding_configs"] - pp_config = strategy.hybrid_configs["pp_configs"] - assert ( - not sharding_configs.comm_overlap - and not pp_config.sharding_comm_overlap - ), "orthogonal loss will cause twice gradient accumulate, will break pp/sharding overlap" - - self.eps = paddle.to_tensor([1e-12], dtype="float32") - if config.multimodel_experts: - if config.moe_use_hard_gate: - self.num_experts_list = [] - self.experts_type_mask = [] - experts_ids = paddle.zeros( - [sum(self.num_experts)], dtype="int64" - ).reshape([config.moe_world_size, -1]) - offset = 0 - for i, expert_num in enumerate(self.num_experts): - experts_ids[ - :, offset : offset + expert_num // config.moe_world_size - ] = i - offset += expert_num // config.moe_world_size - self.experts_type_ids = experts_ids.reshape([-1]) - logger.info( - f"use moe_use_hard_gate, experts_ids: {self.experts_type_ids}" - ) - for i, expert_num in enumerate(self.num_experts): - self.experts_type_mask.append( - self.experts_type_ids == i, - ) - self.num_experts_list.append(expert_num) - else: - assert ( - not config.moe_group_experts - ), "group_experts must use hard_gate when multimodel_experts is True" - else: - self.num_experts_list = [self.num_experts] - if gate_weight is not None: - self.weight = 
gate_weight - assert ( - not self.config.moe_use_token_type_bias - ), "gate_weights is from outside, token_type_bias can't be used" - logger.info("moe use gate_weight from outside") - self._cast_to_low_precision = False - self._cast_to_low_precison = False - else: - self._create_gate_parameter() - logger.info( - f"{config.moe_gate}: w/ capacity: {self.cap} experts:{self.num_experts} " - f"use_token_type_bias:{self.use_token_type_bias} gate_act:{config.moe_gate_act} " - f"norm_gate_logits={self.norm_gate_logits} use_correction_bias={self.use_correction_bias}" - ) - - def _create_gate_parameter(self): - - if self.config.multimodel_experts: - # support setting lambda for each expert group - self.moe_z_loss_lambda = self.moe_z_loss_lambda.expand( - len(self.num_experts) - ) - self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.expand( - len(self.num_experts) - ) - self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.expand( - len(self.num_experts) - ) - - for i, num_experts in enumerate(self.num_experts): - if i == 1: - with paddle.utils.unique_name.guard(f"mm_gate_{self.layer_idx}_"): - p = self.create_parameter( - shape=[self.model_dim, num_experts], - dtype="float32", - attr=paddle.ParamAttr( - name=unique_name.generate("moe_gate") - ), - ) - else: - p = self.create_parameter( - shape=[self.model_dim, num_experts], - dtype="float32", - attr=paddle.ParamAttr(name=unique_name.generate("moe_gate")), - ) - p.expert_type = f"expert_type_{i}" - self.add_parameter( - ("weight" if i == 0 else f"weight_{i}"), - p, - ) - else: - self.weight = self.create_parameter( - shape=[self.model_dim, self.num_experts], - dtype="float32", - attr=paddle.ParamAttr(name=unique_name.generate("moe_gate")), - ) - logger.info(f"moe-Gate, {self.weight}") - - if self.use_token_type_bias: - if self.config.multimodel_experts: - assert ( - not self.config.moe_use_hard_gate - ), "multimodel_experts with hard_gate is not support token_type_bias." 
- num_experts = ( - sum(self.num_experts) - if self.config.multimodel_experts - else self.num_experts - ) - bias_type_num = ( - len(self.num_experts) if self.config.multimodel_experts else 1 - ) - self.bias = self.create_parameter( - shape=[bias_type_num, num_experts], - dtype="float32", - attr=paddle.ParamAttr( - name=unique_name.generate("moe_gate_bias"), - initializer=paddle.nn.initializer.Assign( - np.zeros([bias_type_num, num_experts]) - ), - ), - ) - logger.info(f"using token type bias, bias: {self.bias},") - self._cast_to_low_precision = False - self._cast_to_low_precison = False - - def get_gate_weight(self, transform_weight): - if not self.config.multimodel_experts: - return self.weight - if not transform_weight: - return paddle.concat( - [ - getattr(self, "weight" if i == 0 else f"weight_{i}") - for i in range(len(self.num_experts)) - ], - -1, - ) - weight = paddle.zeros( - [ - self.model_dim, - self.config.moe_world_size, - sum(self.num_experts) // self.config.moe_world_size, - ], - dtype="float32", - ) - offset = 0 - for i, num_experts in enumerate(self.num_experts): - weight[ - :, :, offset : offset + num_experts // self.config.moe_world_size - ] = getattr(self, "weight" if i == 0 else f"weight_{i}").reshape( - [self.model_dim, self.config.moe_world_size, -1] - ) - offset += num_experts // self.config.moe_world_size - weight = weight.reshape([self.model_dim, -1]) - - return weight - - def forward( - self, - input: Tensor, - token_type_ids: Tensor = None, - transform_weight: bool = True, # [seq] - correction_bias: Tensor = None, # [seq] - ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore - - orig_dtype = input.dtype - weight = self.get_gate_weight(transform_weight) - with paddle.amp.auto_cast(False): - if get_env_device() == "xpu" and self.xpu_matmul is not None: - assert not self.fuse_gate_detach_matmul, "not supported on XPU" - input_32 = input.cast("float32") - logits = self.xpu_matmul( - input_32, - weight, - training=self.training, - ) - else: - 
logits = gate_detach_matmul( - input, weight, self.fuse_gate_detach_matmul, self.use_fake_gate - ) - - if self.use_token_type_bias: - assert token_type_ids is not None - bias = self.bias[token_type_ids] # [seq] - # logger.info(f"adding bias: {bias}") - logits = logits + bias - ( - capacity, - dispatch_mask, - combine_weights, - scatter_index, - l_aux, - l_zloss, - ) = self.top2_gating(logits, correction_bias=correction_bias) - orthogonal_loss = self._cal_orthogonal_loss() - router_loss = ( - l_aux * self.moe_aux_loss_lambda - + l_zloss * self.moe_z_loss_lambda - + orthogonal_loss * self.moe_orthogonal_loss_lambda - ) - router_loss.stop_gradient = False - if self.enable_logging and global_training_logs_enabled(): - _log = { - f"aux_loss_layer_{self.layer_idx}": l_aux.item(), - f"orthogonal_loss_layer_{self.layer_idx}": orthogonal_loss.item(), - f"zloss_layer_{self.layer_idx}": l_zloss.item(), - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) - if self.use_token_type_bias: - _bias_log = { - f"token_type_bias_layer_{self.layer_idx}_expert{i}_gap": v - for i, v in enumerate((self.bias[0] - self.bias[1]).numpy()) - } - global_training_logs.update(**_bias_log) - - combine_weights = combine_weights.cast(orig_dtype) - return ( - capacity, - dispatch_mask, - combine_weights, - scatter_index, - router_loss, - logits, - ) - - def get_capacity(self, num_tokens, cap_factor=None): - - num_experts = ( - sum(self.num_experts) - if self.config.multimodel_experts - else self.num_experts - ) - if cap_factor is not None: - cap = cap_factor - else: - if self.training: - cap = self.cap[0] - elif num_tokens < num_experts: # seqlen < num_expert - cap = self.cap[2] - else: - cap = self.cap[1] - # capacity = 2S/E - capacity = int(cap * num_tokens // num_experts) - assert ( - capacity > 0 - ), f"requires capacity to >= 0. 
cap={cap}, num_tokens={num_tokens}" - return capacity - - def top2_gating(self, logits, cap=None, correction_bias=None): - - # logger.info(f'gate-input: {logits}') - l_zloss = self._cal_z_loss(logits) - gates = self.act(logits) - - # gates has shape of SE - assert logits.ndim == 2, logits.shape - num_tokens = gates.shape[0] - num_experts = gates.shape[1] - # capacity = 2S/E - capacity = self.get_capacity(logits.shape[0], cap) - - # Create a mask for 1st's expert per token - score_for_argmax = ( - gates + correction_bias.unsqueeze(0) - if correction_bias is not None - else gates - ) - indices1_s = paddle.argmax(score_for_argmax, axis=1) - mask1 = F.one_hot(indices1_s, num_classes=num_experts).cast( - paddle.int64 - ) # [0,1] - - l_aux = self._cal_aux_loss(gates, mask1.sum(axis=0), self.num_experts_tensor) - - if self.training and not self.no_jitter: - gumbels = ( - -paddle.empty_like( - logits, - ) - .exponential_() - .log() - ) # ~Gumbel(0,1) - logits_w_noise = logits + gumbels - else: - logits_w_noise = logits - - logits_except1 = masked_fill( - logits_w_noise, mask1.cast(paddle.bool), float("-inf") - ) - score_for_argmax = ( - self.act(logits_except1) + correction_bias.unsqueeze(0) - if correction_bias is not None - else logits_except1 - ) - indices2_s_original = paddle.argmax(score_for_argmax, axis=1) - - if self.training and self.sinkhorn_2gate: - r = paddle.ones(num_tokens, "float32") / num_tokens - - c = capacity - mask1.cast("float32").sum(0) - c = paddle.maximum(c, paddle.zeros_like(c)) - c /= c.sum() - - pi, _ = compute_optimal_transport( - -logits_except1.cast("float32").detach(), r, c, lam=self.sinkhorn_temp - ) - pi = masked_fill(pi, mask1.cast(paddle.bool), float("-inf")) - indices2_s = paddle.argmax(pi, axis=1) - else: - indices2_s = indices2_s_original - - if self.enable_logging and global_training_logs_enabled(): - global_training_logs.update( - **{ - "redispatch_acc": (indices2_s_original == indices2_s) - .cast(paddle.float32) - .mean() - .item(), 
- f"redispatch_acc_layer_{self.layer_idx}": ( - indices2_s_original == indices2_s - ) - .cast(paddle.float32) - .mean() - .item(), - } - ) - - mask2 = F.one_hot(indices2_s, num_classes=self.num_experts).cast(paddle.int64) - - # Compute locations in capacity buffer - locations1 = ( - paddle.cumsum(mask1, axis=0) - 1 - ) # [0,1,1,0,1,0,0] -> [0,0,0,0,1,1,1,] - locations2 = paddle.cumsum(mask2, axis=0) - 1 - # Update 2nd's location by accounting for locations of 1st - locations2 += paddle.sum(mask1, axis=0, keepdim=True) - - # Remove locations outside capacity from mask - mask1 *= (locations1 < capacity).cast(paddle.int64) # [0,1,1,0,0,0,0] - mask2 *= (locations2 < capacity).cast(paddle.int64) - - # Store the capacity location for each token - locations1_s = paddle.sum(locations1 * mask1, axis=1) - locations2_s = paddle.sum(locations2 * mask2, axis=1) - - # Normalize gate probabilities - mask1_float = mask1.cast(paddle.float32) - mask2_float = mask2.cast(paddle.float32) - gates1_s = (gates * mask1_float).sum(axis=-1) - gates2_s = (gates * mask2_float).sum(axis=-1) - # logger.info(f'gates1_s:{gates1_s} gates2_s:{gates2_s} logits:{logits}') - - if self.norm_gate_logits: - denom_s = gates1_s + gates2_s # [0.2, 0.3] - # Avoid divide-by-zero - denom_s = paddle.clip(denom_s, min=1e-6) - gates1_s /= denom_s - gates2_s /= denom_s - if self.training and self.expert_drop: - # log.debug(gates2_s) - gates2_s = paddle.where( - 2 * gates2_s < paddle.rand_like(gates2_s), - paddle.zeros_like(gates2_s), - gates2_s, - ) - - # Calculate combine_weights and dispatch_mask - gates1 = gates1_s.unsqueeze(1) * mask1_float - gates2 = gates2_s.unsqueeze(1) * mask2_float - - expert1_index = paddle.argmax(gates1, -1) - combine1_weight = paddle.max(gates1, -1, keepdim=True) - scatter1_index = expert1_index * capacity + locations1_s - scatter1_index = scatter1_index.cast("int64") - dispatch1_mask = combine1_weight.cast(paddle.bool).detach() - - expert2_index = paddle.argmax(gates2, -1) - 
combine2_weight = paddle.max(gates2, -1, keepdim=True) - scatter2_index = expert2_index * capacity + locations2_s - scatter2_index = scatter2_index.cast("int64") - dispatch2_mask = combine2_weight.cast(paddle.bool).detach() - # logger.info(f'expert-id: {expert1_index} vs {expert2_index}, mask:{mask1_float} vs {mask2_float}') - if self.enable_logging and global_training_logs_enabled(): - global_training_logs.update( - **{ - "top1_gate": ( - combine1_weight.sum() - / (dispatch1_mask.cast("float32").sum() + 1e-9) - ).item(), - "top2_gate": ( - combine2_weight.sum() - / (dispatch2_mask.cast("float32").sum() + 1e-9) - ).item(), - f"top1_gate_layer_{self.layer_idx}": ( - combine1_weight.sum() - / (dispatch1_mask.cast("float32").sum() + 1e-9) - ).item(), - f"top2_gate_layer_{self.layer_idx}": ( - combine2_weight.sum() - / (dispatch2_mask.cast("float32").sum() + 1e-9) - ).item(), - } - ) - - seqlen = logits.shape[0] - top1_gate_experts_per_token = ( - paddle.cast(dispatch1_mask, dtype="float32").sum() / seqlen - ) - top2_gate_experts_per_token = ( - paddle.cast(dispatch2_mask, dtype="float32").sum() / seqlen - ) - leakage_experts_per_token = ( - paddle.cast( - (~dispatch1_mask) & (~dispatch2_mask), dtype="float32" - ).sum() - / seqlen - ) - - experts_per_token = ( - top1_gate_experts_per_token + top2_gate_experts_per_token - ) - _log = { - f"experts_per_token_layer_{self.layer_idx}": experts_per_token.item(), - f"top1_experts_per_token_layer_{self.layer_idx}": top1_gate_experts_per_token.item(), - f"top2_experts_per_token_layer_{self.layer_idx}": top2_gate_experts_per_token.item(), - f"leakage_experts_per_token_layer_{self.layer_idx}": leakage_experts_per_token.item(), - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) - - return ( - capacity, - paddle.concat((dispatch1_mask, dispatch2_mask), 1), - paddle.concat((combine1_weight, combine2_weight), 1), - paddle.stack((scatter1_index, 
scatter2_index), 1), - l_aux, - l_zloss, - ) - - def _cal_aux_loss( - self, - gate_prob, - dispatch_mask, - num_experts=None, - use_group=None, - tokens_mask=None, - dispatch_tokens_mask=None, - ): - - if self.act is F.sigmoid: - gate_prob = gate_prob / gate_prob.sum(-1, keepdim=True) - - if self.use_correction_bias: - if tokens_mask is not None: - gate_prob_this_modality = gate_prob[tokens_mask.astype("bool")] - if gate_prob_this_modality.shape[0]: - _, top_idx = gate_prob_this_modality.topk( - k=self.config.moe_k, axis=-1 - ) - if int_bincount is not None: - dispatch_mask = int_bincount( - top_idx, 0, gate_prob.shape[-1], paddle.int64 - ) - else: - mask = paddle.zeros_like( - gate_prob_this_modality - ).put_along_axis(top_idx, paddle.to_tensor(1.0), axis=1) - dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) - else: - dispatch_mask = paddle.zeros(gate_prob.shape[-1], dtype="int64") - dist.stream.all_reduce( - dispatch_mask, - group=self.group, - use_calc_stream=True, - ) - else: - _, top_idx = gate_prob.topk(k=self.config.moe_k, axis=-1) - if int_bincount is not None: - dispatch_mask = int_bincount( - top_idx, 0, gate_prob.shape[-1], paddle.int64 - ) - else: - mask = paddle.zeros_like(gate_prob).put_along_axis( - top_idx, paddle.to_tensor(1.0), axis=1 - ) - dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) - - if num_experts is None: - num_experts = self.num_experts_tensor - if use_group is None: - use_group = self.config.moe_group_experts - - return cal_aux_loss_func( - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - self.config.moe_k, - self.global_aux_loss, - self.rank if self.global_aux_loss else None, - self.group if self.global_aux_loss else None, - ) - - def _cal_z_loss(self, logits, loss_mask=None): - - if ( - (moe_router_loss_ops is not None) - and (loss_mask is None or len(loss_mask.shape) == 1) - and (get_env_device() != "xpu") - and (logits.dtype == paddle.float32) - ): - return 
CalZLossFunctor.apply(logits, loss_mask) - else: - return cal_z_loss_func(logits, loss_mask) - - def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): - - if weight.dtype != paddle.float32: - weight = weight.astype(paddle.float32) - - if ( - (moe_router_loss_ops is not None) - and (get_env_device() != "xpu") - and (weight.dtype == paddle.float32) - ): - return CalOrthogonalLossOptEachWeightFunctor.apply( - weight, self.config.moe_k, use_group - ) - else: - return cal_orthogonal_loss_opt_each_weight_func( - weight, - self.config.moe_k, - use_group, - self.eps, - self.xpu_matmul, - self.training, - ) - - def _cal_orthogonal_loss(self, weight_id=None, use_group=None): - - if use_group is None: - use_group = ( - self.config.moe_group_experts and self.config.moe_group_orthogonal_loss - ) - - if weight_id is not None: - if weight_id == 0: - w_ = self.weight - else: - assert self.config.multimodel_experts - w_ = getattr(self, f"weight_{weight_id}") - return self._cal_orthogonal_loss_opt_each_weight(w_, use_group) - - orthogonal_loss = self._cal_orthogonal_loss_opt_each_weight( - self.weight, use_group - ) - if self.config.multimodel_experts: - for i in range(1, len(self.config.moe_num_experts)): - w_ = getattr(self, f"weight_{i}") - orthogonal_loss += self._cal_orthogonal_loss_opt_each_weight( - w_, use_group=False - ) - return orthogonal_loss - - -class TopKGateFused(Top2Gate): - - def forward( - self, - input: Tensor, - token_type_ids=None, - transform_weight=True, - ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore - - capacity = self.get_capacity(input.shape[0]) - weight = self.get_gate_weight(transform_weight) - with paddle.amp.auto_cast(False): - if get_env_device() == "xpu" and self.xpu_matmul is not None: - assert not self.fuse_gate_detach_matmul, "not supported on XPU" - input_32 = input.cast("float32") - logits = self.xpu_matmul( - input_32, - weight, - training=self.training, - ) - else: - logits = gate_detach_matmul( - input, weight, 
self.fuse_gate_detach_matmul, self.use_fake_gate - ) - if self.use_token_type_bias: - assert token_type_ids is not None - assert ( - token_type_ids.max() < self.bias.shape[0] - ), f"token_type_ids {token_type_ids.max()} >= bias shape {self.bias.shape[0]}" - bias = self.bias[token_type_ids] # [seq] - logits = logits + bias - orthogonal_loss = None - router_loss = paddle.zeros([1], dtype="float32") - router_loss.stop_gradient = False - if ( - self.enable_logging - and global_training_logs_enabled() - and orthogonal_loss is not None - ): - _log = { - f"orthogonal_loss_layer_{self.layer_idx}": orthogonal_loss.item(), - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) - - return logits, capacity, router_loss diff --git a/examples/pre-training/models/sequence_parallel_utils_auto.py b/examples/pre-training/models/sequence_parallel_utils_auto.py index 408a7227..ea0e52b2 100644 --- a/examples/pre-training/models/sequence_parallel_utils_auto.py +++ b/examples/pre-training/models/sequence_parallel_utils_auto.py @@ -14,9 +14,7 @@ # !/usr/bin/env python3 -import hashlib import numpy as np -import logging import paddle from paddle import distributed as dist @@ -27,45 +25,9 @@ from models.comm_utils import ( scatter, all_gather, - reduce_scatter, ) -from paddle.distributed import in_auto_parallel_align_mode - - -try: - from paddle.nn.functional import gemm_reduce_scatter, all_gather_gemm -except ImportError: - gemm_reduce_scatter = None - all_gather_gemm = None - flux = None - -logger = logging.getLogger(__name__) - -if not hasattr(paddle.Tensor, "contiguous"): - - def contiguous(self): - - return self - - setattr(paddle.Tensor, "contiguous", contiguous) - - -if not hasattr(paddle.Tensor, "_md5sum"): - - def _md5sum(self): - numpy_array = np.array(self) - array_bytes = numpy_array.tobytes() - return hashlib.md5(array_bytes).hexdigest() - - setattr(paddle.Tensor, "_md5sum", _md5sum) - - -def 
get_hcg(): - return fleet.get_hybrid_communicate_group() - - class ScatterOp(PyLayer): @staticmethod @@ -79,50 +41,6 @@ def backward(ctx, grad): return all_gather(grad, axis=ctx.axis, group=ctx.group) -class GatherOp(PyLayer): - - @staticmethod - def forward(ctx, input, axis=0, group=None): - ctx.axis = axis - ctx.group = group - return all_gather(input, axis=axis, group=group) - - @staticmethod - def backward(ctx, grad): - return scatter(grad, axis=ctx.axis, group=ctx.group) - - -class AllGatherOp(PyLayer): - - @staticmethod - def forward(ctx, input, group=None): - ctx.group = group - return all_gather(input, group=group) - - @staticmethod - def backward(ctx, grad): - if in_auto_parallel_align_mode(): - group = ctx.group - if group is None: - group = get_hcg().get_model_parallel_group() - pg = group.process_group - pg.allreduce(grad).wait() - return paddle.split(grad, group.nranks, axis=0)[group.rank] - else: - return reduce_scatter(grad, group=ctx.group) - - -class ReduceScatterOp(PyLayer): - @staticmethod - def forward(ctx, input, group=None): - ctx.group = group - return reduce_scatter(input, group=group) - - @staticmethod - def backward(ctx, grad): - return all_gather(grad, group=ctx.group) - - class AllGatherVarlenOp(PyLayer): @staticmethod @@ -179,40 +97,6 @@ def backward(ctx, grad): return grad -class GemmReduceScatterOp(PyLayer): - - @staticmethod - def forward(ctx, input, weight, group): - - ctx.save_for_backward(input, weight) - ctx.group = group - output = gemm_reduce_scatter(input, weight, group) - return output - - @staticmethod - def backward(ctx, grad): - input, weight = ctx.saved_tensor() - group = ctx.group - if input.stop_gradient and weight.stop_gradient: - return None, None - - if input.stop_gradient: - input_grad = None - grad_parallel = None - else: - input_grad, grad_parallel = all_gather_gemm( - grad, weight, group, deepcopy_input_parallel=False - ) - - if weight.stop_gradient: - weight_grad = None - else: - if grad_parallel is None: - 
grad_parallel = all_gather(grad) - weight_grad = paddle.matmul(input, grad_parallel, transpose_x=True) - return input_grad, weight_grad - - def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): hcg = fleet.get_hybrid_communicate_group() group = hcg.get_model_parallel_group() From 8e5ada0191ae007273912e214522f5e55d6556a0 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Mon, 18 Aug 2025 04:47:29 +0800 Subject: [PATCH 07/15] revert refactor modeling_auto --- .../models/ernie/modeling_auto.py | 720 +++++++++++++++--- .../models/ernie/modeling_auto_pp.py | 28 +- 2 files changed, 639 insertions(+), 109 deletions(-) diff --git a/examples/pre-training/models/ernie/modeling_auto.py b/examples/pre-training/models/ernie/modeling_auto.py index 475b6a32..6246434a 100644 --- a/examples/pre-training/models/ernie/modeling_auto.py +++ b/examples/pre-training/models/ernie/modeling_auto.py @@ -20,6 +20,7 @@ import contextlib import inspect + from copy import deepcopy from dataclasses import dataclass import numpy as np @@ -30,12 +31,15 @@ from paddle.distributed import fleet from paddle.distributed.fleet.utils import recompute from paddle.distributed.fleet.layers.mpu.random import get_rng_state_tracker +from paddle.incubate.nn.memory_efficient_attention import ( + memory_efficient_attention, + BlockDiagonalCausalMask, +) from paddle.distributed import in_auto_parallel_align_mode from models.comm_utils import subbatch -from models.moe.top2_gate_auto import Top2Gate -from models.moe.top2_gate_auto import TopKGateFusedAuto +from models.moe.top2_gate_auto import Top2Gate, TopKGateFusedAuto from paddleformers.transformers.conversion_utils import ( @@ -50,19 +54,7 @@ from paddleformers.transformers.model_utils import PretrainedModel, register_base_model -from models.ernie.modeling import ( - FusedDropoutImpl, - RotaryEmbedding, - RMSNorm, - get_triangle_upper_mask, - mem_eff_attn, - inbatch_pack_offset_to_attn_mask_start_row_indices, - _make_causal_mask, - _expand_mask, -) -from 
models.ernie.modeling_moe import ( - ErnieMoeMLPFused, -) +from models.ernie.modeling import FusedDropoutImpl from models.sequence_parallel_utils_auto import ( sequence_parallel_sparse_mask_labels, ) @@ -125,11 +117,16 @@ class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): try: from fast_ln import fast_ln +except ImportError: + fast_ln = None + +try: + import fused_ln as fused except ImportError: logger.warning( - "fast-ln not found, run `python src/ops/fast_ln_setup.py install` to build fast ln" + "fused-ln not found, run `python src/ops/fused_ln_setup.py install` to build fused ln" ) - fast_ln = None + fused = None try: from paddle.incubate.nn.functional import ( @@ -160,10 +157,16 @@ class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): ) +def is_pp_enable(): + + mesh = fleet.auto.get_mesh() + return "pp" in mesh.dim_names + + def global_mesh_starts_with_pp(): mesh = fleet.auto.get_mesh() - if "pp" in mesh.dim_names: + if is_pp_enable(): return mesh.get_mesh_with_dim("pp") else: return mesh @@ -183,6 +186,57 @@ def is_fleety_func(): IS_FLEETY = is_fleety_func() +def get_triangle_upper_mask(x, mask=None): + + if mask is not None: + return mask + shape = x.shape + shape[1] = 1 + mask = paddle.full(shape, -np.inf, dtype=x.dtype) + mask.stop_gradient = True + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def naive_fuse_split_tp( + weight, + tensor_parallel_degree, + tensor_parallel_rank=None, + is_column=True, + fuse_tensor_parts=2, +): + + logging.info(f"spliting fused-ffn: {weight.shape}") + axis = -1 if is_column else 0 + splited = np.split(weight, fuse_tensor_parts * tensor_parallel_degree, axis=axis) + return np.concatenate( + splited[tensor_parallel_rank::tensor_parallel_degree], axis=axis + ) + + +def parallel_matmul( + x, + y, + bias=None, + transpose_y=False, + tensor_parallel_degree=1, + tensor_parallel_output=True, +): + + if transpose_y: + logits = 
paddle.matmul(x, y, transpose_y=True) + if bias is not None: + logits += bias + else: + logits = F.linear(x, y, bias) + + if tensor_parallel_degree > 1 and not tensor_parallel_output: + logits = dist.reshard(logits, get_mesh(-1), [dist.Shard(0), dist.Replicate()]) + + return logits + + def calc_lm_head_logits( config, hidden_states, @@ -213,6 +267,7 @@ def calc_lm_head_logits( get_mesh(-1), [dist.Shard(1), dist.Replicate()], ) + # [S, B, H] to [B, S, H] hidden_states = paddle.transpose(hidden_states, [1, 0, 2]) if not config.using_dynamic_sequence_length: hidden_states = hidden_states.reshape( @@ -227,18 +282,102 @@ def calc_lm_head_logits( ) if tensor_parallel_output is None: tensor_parallel_output = config.tensor_parallel_output - logits = paddle.matmul( - hidden_states, weight, transpose_y=config.tie_word_embeddings + logits = parallel_matmul( + hidden_states, + weight, + bias=bias, + transpose_y=config.tie_word_embeddings, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_output=tensor_parallel_output, ) - if bias is not None: - logits += bias - - if config.tensor_parallel_degree > 1 and not tensor_parallel_output: - logits = dist.reshard(logits, get_mesh(-1), [dist.Shard(0), dist.Replicate()]) return logits +def finfo(dtype: paddle.dtype = None): + + if dtype is None: + dtype = paddle.get_default_dtype() + + if dtype == paddle.bfloat16: + + class BFloatFInfo: + min = -3.3895313892515355e38 + + return BFloatFInfo + if dtype == paddle.float32: + return np.finfo(np.float32) + if dtype == paddle.float16: + return np.finfo(np.float16) + if dtype == paddle.float64: + return np.finfo(np.float64) + + +def masked_fill(x, mask, value): + + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def mem_eff_attn( + query, key, value, pack_offset, drop_prob=0.0, dtype=paddle.bfloat16, training=True +): + + pack_offset = pack_offset.numpy() + shape = pack_offset.shape + assert len(shape) == 2, len(shape) + assert shape[0] == 
1, shape[0] + n = pack_offset.size + pack_offset = pack_offset.flatten() + seqlens = [] + assert pack_offset[0] == 0, pack_offset[0] + for i in range(1, n): + if pack_offset[i] < 0: + break + cur = pack_offset[i] - pack_offset[i - 1] + assert cur > 0 + seqlens.append(cur) + + assert drop_prob == 0.0, drop_prob + assert dtype == paddle.bfloat16, dtype + + def cast(x): + return x.astype(dtype) if x.dtype != dtype else x + + if len(seqlens) == 1: + out, _ = flash_attention( + query, key, value, drop_prob, causal=True, training=training + ) + else: + mask = BlockDiagonalCausalMask.from_seqlens(seqlens) + out = memory_efficient_attention( + cast(query), + cast(key), + cast(value), + attn_bias=mask, + p=drop_prob, + training=training, + ) + return out + + +def inbatch_pack_offset_to_attn_mask_start_row_indices(inbatch_pack_offset): + inbatch_pack_offset = inbatch_pack_offset.numpy() + attn_mask_row_start_indices = [] + min_start_row = np.inf + for bidx in range(inbatch_pack_offset.shape[0]): + item = inbatch_pack_offset[bidx] + cumsum_item = item[item != -1] + record_lens = cumsum_item[1:] - cumsum_item[0:-1] + min_start_row = min(cumsum_item[1], min_start_row) + row_start_indices = np.repeat(cumsum_item[1:], record_lens) + attn_mask_row_start_indices.append(row_start_indices[None, None, ...]) + attn_mask_row_start_indices = np.concatenate(attn_mask_row_start_indices, axis=0) + return paddle.to_tensor(attn_mask_row_start_indices, dtype=paddle.int32), int( + min_start_row + ) + + def scaled_dot_product_attention( query_states, key_states, @@ -373,8 +512,7 @@ def scaled_dot_product_attention( attn_weights = paddle.maximum( attn_weights, paddle.to_tensor( - float(paddle.finfo(query_states.dtype).min), - dtype=query_states.dtype, + float(finfo(query_states.dtype).min), dtype=query_states.dtype ), ) @@ -418,6 +556,59 @@ def scaled_dot_product_attention( return attn_output, None +def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): + batch_size, target_length 
= input_ids_shape + + mask = paddle.full((target_length, target_length), float(finfo(dtype).min)) + + mask_cond = paddle.arange(mask.shape[-1]) + mask = masked_fill( + mask, mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]), 0 + ) + + if past_key_values_length > 0: + mask = paddle.concat( + [paddle.zeros([target_length, past_key_values_length]), mask], axis=-1 + ) + + return mask[None, None, :, :].expand( + [batch_size, 1, target_length, target_length + past_key_values_length] + ) + + +def _expand_mask(mask, dtype, tgt_length): + if mask.ndim == 4: + expanded_mask = mask + elif mask.ndim == 3: + expanded_mask = mask[:, None, :, :] + else: + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + expanded_mask = mask[:, None, None, :].expand( + [batch_size, 1, tgt_length, src_length] + ) + + inverted_mask = 1.0 - expanded_mask + return masked_fill( + inverted_mask, inverted_mask.cast("bool"), float(finfo(dtype).min) + ) + + +def slice_experts(experts, moe_world_size): + moe_num_experts_per_device = len(experts) // moe_world_size + experts_per_device = [[] for _ in range(moe_world_size)] + + for i, expert in enumerate(experts): + ep_group_id = i // moe_num_experts_per_device + experts_per_device[ep_group_id].append(expert) + + lm_experts = nn.LayerList([]) + for experts_list in experts_per_device: + lm_experts.extend(experts_list[: moe_num_experts_per_device // 2]) + return lm_experts + + def get_gate( config: ErnieMoEConfig, expert: Tuple[Tuple[int, nn.Layer]], @@ -466,6 +657,9 @@ def get_gate( config, layer_idx=layer_idx, group=config.moe_group, ipp=ipp ) + lm_gate, lm_experts = None, None + logger.info(f"LM-experts-{lm_experts} -- experts-{experts}") + index = 0 if config.moe_group == "dp" else 1 ep_sub_meshes = dist.auto_parallel.api.split_mesh(get_mesh(ipp), index) @@ -477,16 +671,293 @@ def get_gate( ) experts[i].ep_group_id = ep_group_id - return gate, experts + return gate, experts, 
lm_gate, lm_experts -class FastLayerNorm(nn.LayerNorm): - def __init__(self, config): - assert fast_ln is not None +def _parse_moe_group(moe_group: str): + moe_group = moe_group.lower() + assert moe_group in { + "dp", + "mp", + "none", + }, f"moe-group not supported, got: {moe_group}" + logger.info(f"using moe-group: {moe_group}") + + return moe_group + + +class RMSNorm(nn.Layer): + + def __init__(self, config, ipp=0): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + def forward(self, hidden_states): + + if self.config.fuse_rms_norm: + return fused.fused_rms_norm( + hidden_states, self.weight, self.variance_epsilon + )[0] + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = ( + paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + ) + else: + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = ( + paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + ) + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +class LayerNorm(nn.LayerNorm): + + def __init__(self, config, ipp=0): super().__init__(config.hidden_size, epsilon=config.rms_norm_eps) + self.use_fast_ln = config.use_fast_ln + if self.use_fast_ln: + assert fast_ln is not None + self.ipp = ipp + if config.pipeline_parallel_degree > 1: + self.weight = dist.shard_tensor( + self.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] + ) + self.bias = dist.shard_tensor( + self.bias, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] + ) + + def forward(self, hidden_states): + if 
self.use_fast_ln: + return fast_ln(hidden_states, self.weight, self.bias, self._epsilon)[0] + else: + return super().forward(hidden_states) + + +class FusedLayerNorm(nn.Layer): + + def __init__(self, config, ipp=0): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.bias = paddle.create_parameter( + shape=[self.hidden_size], dtype=paddle.get_default_dtype(), is_bias=True + ) + self.variance_epsilon = config.rms_norm_eps + self.ipp = ipp + if config.pipeline_parallel_degree > 1: + self.weight = dist.shard_tensor( + self.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] + ) + self.bias = dist.shard_tensor( + self.bias, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] + ) + def forward(self, hidden_states): - return fast_ln(hidden_states, self.weight, self.bias, self._epsilon)[0] + + return fused.fused_ln( + hidden_states, self.weight, self.bias, self.variance_epsilon + )[0] + + +class RotaryEmbedding(nn.Layer): + + def __init__(self, dim, max_position_embeddings=4096, base=10000): + + super().__init__() + self.base = base + self.max_position_embeddings = max_position_embeddings + inv_freq = 1.0 / ( + base ** (paddle.cast(paddle.arange(0, dim, 2), dtype="float32") / dim) + ) + t = paddle.arange(max_position_embeddings, dtype="float32") + freqs = paddle.einsum("i,j->ij", t, inv_freq.cast("float32")) + emb = paddle.concat([freqs, freqs], axis=-1) + + self.cos_cached = emb.cos() + self.sin_cached = emb.sin() + + self._cast_to_low_precision = False + self._cast_to_low_precison = False + + def forward(self, x, seq_len=None): + + return ( + self.cos_cached[:seq_len, :], + self.sin_cached[:seq_len, :], + ) + + @classmethod + def rotate_half(cls, x): + + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], 
axis=-1) + + @classmethod + def apply_rotary_pos_emb(cls, q, k, cos, sin, offset: int = 0, position_ids=None): + if position_ids is not None: + assert offset == 0, offset + cos = F.embedding(position_ids, cos) + sin = F.embedding(position_ids, sin) + else: + cos = cos.unsqueeze(0) + sin = sin.unsqueeze(0) + cos = cos[:, offset : q.shape[1] + offset, None, :] + sin = sin[:, offset : q.shape[1] + offset, None, :] + + q_embed = paddle.add( + paddle.multiply(q, cos), paddle.multiply(cls.rotate_half(q), sin) + ) + k_embed = paddle.add( + paddle.multiply(k, cos), paddle.multiply(cls.rotate_half(k), sin) + ) + q_embed = q_embed.astype(q.dtype) + k_embed = k_embed.astype(k.dtype) + return q_embed, k_embed + + +class RopeEmbeddingLegacy(nn.Layer): + + def __init__(self, head_dim, compression_ratio=1.0, base=10000): + super().__init__() + self.head_dim = head_dim + self.compression_ratio = compression_ratio + self.base = base + + def forward(self, seq_length, position_ids=None): + + indices = paddle.arange(0, self.head_dim, 2, dtype="float32") + indices = 1 / self.base ** (indices / self.head_dim) + if position_ids is None: + position_ids = paddle.arange(0, seq_length, 1, dtype="float32").unsqueeze(1) + position_ids = position_ids / self.compression_ratio + sinusoid_inp = position_ids * indices.unsqueeze(0) + else: + position_ids = position_ids / self.compression_ratio + seq_length = position_ids.shape[-1] + sinusoid_inp = position_ids.unsqueeze(-1).astype( + "float32" + ) * indices.unsqueeze(0) + pos_emb = paddle.concat( + [paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1 + ) + pos_emb = paddle.reshape(pos_emb, (-1, 1, seq_length, self.head_dim)) + pos_emb.stop_gradient = True + return pos_emb + + def apply_rotary(self, rp, q, k): + + sin, cos = paddle.chunk(rp, 2, axis=-1) + sin_pos = paddle.reshape(paddle.stack([sin, sin], axis=-1), rp.shape) + cos_pos = paddle.reshape(paddle.stack([cos, cos], axis=-1), rp.shape) + rotate_half_q = paddle.reshape( + 
paddle.stack([-q[:, :, :, 1::2], q[:, :, :, 0::2]], axis=-1), + paddle.shape(q), + ) + query = paddle.add( + paddle.multiply(q.astype("float32"), cos_pos), + paddle.multiply(rotate_half_q.astype("float32"), sin_pos), + ) + rotate_half_k = paddle.reshape( + paddle.stack([-k[:, :, :, 1::2], k[:, :, :, 0::2]], axis=-1), + paddle.shape(k), + ) + key = paddle.add( + paddle.multiply(k.astype("float32"), cos_pos), + paddle.multiply(rotate_half_k.astype("float32"), sin_pos), + ) + return query, key + + def forward_single(self, position_ids): + + batch_size, seq_length = position_ids.shape[:2] + rope_emb = paddle.zeros( + (2, batch_size, seq_length, 1, self.head_dim), dtype="float32" + ) + inv_freq = self.base ** ( + -paddle.arange(0, self.head_dim, 2, dtype="float32") / self.head_dim + ) + position_ids = position_ids.cast("float32") + position_ids = position_ids / self.compression_ratio + freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) + emb = paddle.stack([freqs, freqs], axis=-1).reshape( + (batch_size, seq_length, self.head_dim) + ) + emb = paddle.unsqueeze(emb, 2) + + rope_emb[0] = paddle.cos(emb) + rope_emb[1] = paddle.sin(emb) + return rope_emb + + @staticmethod + def apply_rotary_single(x, rope_emb): + + rotate_half_x = paddle.reshape( + paddle.stack([-x[:, :, :, 1::2], x[:, :, :, 0::2]], axis=-1), + paddle.shape(x), + ) + return x * rope_emb[0] + rotate_half_x * rope_emb[1] + + +class ErnieLinear(nn.Layer): + + def __init__( + self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None, + ipp=0, + ): + super(ErnieLinear, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.weight = self.create_parameter( + shape=[in_features, out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + self.bias = self.create_parameter( + shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True, + ) 
+ self.name = name + self.ipp = ipp + + def forward(self, input): + + out = F.linear(x=input, weight=self.weight, bias=None, name=self.name) + out = dist.reshard( + out, + get_mesh(self.ipp), + [dist.Shard(1), dist.Shard(0)], + ) + if self.bias: + out += self.bias + return out class ErnieMLP(nn.Layer): @@ -498,16 +969,25 @@ def __init__(self, config, ipp=None, do_shard_tensor=True): self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear( + LinearFN = nn.Linear + self.gate_proj = LinearFN( self.hidden_size, self.intermediate_size, bias_attr=config.use_bias ) - self.up_proj = nn.Linear( + self.up_proj = LinearFN( self.hidden_size, self.intermediate_size, bias_attr=config.use_bias ) - self.down_proj = nn.Linear( - self.intermediate_size, self.hidden_size, bias_attr=config.use_bias - ) + if config.sequence_parallel: + self.down_proj = ErnieLinear( + self.intermediate_size, + self.hidden_size, + bias_attr=config.use_bias, + ipp=self.ipp, + ) + else: + self.down_proj = LinearFN( + self.intermediate_size, self.hidden_size, bias_attr=config.use_bias + ) if do_shard_tensor and ( self.config.tensor_parallel_degree > 1 @@ -556,10 +1036,7 @@ def forward(self, x): x = fused_swiglu(self.gate_proj(x), self.up_proj(x)) else: x = F.silu(self.gate_proj(x)) * self.up_proj(x) - out = self.down_proj(x) - if self.config.sequence_parallel: - out = dist.reshard(out, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(0)]) - return out + return self.down_proj(x) class ErnieAttentionAuto(nn.Layer): @@ -591,27 +1068,36 @@ def __init__(self, config, ipp: Optional[int] = None): self.hidden_size // self.num_heads * self.num_key_value_heads ) - self.q_proj = nn.Linear( + LinearFN = nn.Linear + self.q_proj = LinearFN( self.hidden_size, self.hidden_size, bias_attr=config.use_bias, ) - self.k_proj = nn.Linear( + self.k_proj = LinearFN( self.hidden_size, self.hidden_size if not self.is_gqa else kv_hidden_size, bias_attr=config.use_bias, ) - 
self.v_proj = nn.Linear( + self.v_proj = LinearFN( self.hidden_size, self.hidden_size if not self.is_gqa else kv_hidden_size, bias_attr=config.use_bias, ) - self.o_proj = nn.Linear( - self.hidden_size, - self.hidden_size, - bias_attr=config.use_bias, - ) + if config.sequence_parallel: + self.o_proj = ErnieLinear( + self.hidden_size, + self.hidden_size, + bias_attr=config.use_bias, + ipp=self.ipp, + ) + else: + self.o_proj = LinearFN( + self.hidden_size, + self.hidden_size, + bias_attr=config.use_bias, + ) self.config = config @@ -731,10 +1217,6 @@ def forward( attn_output = paddle.transpose(attn_output, [1, 0, 2]) attn_output = self.o_proj(attn_output) - if self.config.sequence_parallel: - attn_output = dist.reshard( - attn_output, get_mesh(self.ipp), [dist.Shard(1), dist.Shard(0)] - ) if not output_attentions: attn_weights = None @@ -897,6 +1379,48 @@ def forward(self, x): return paddle.bmm(x, self.weight) +class ErnieMoeMLPFused(nn.Layer): + + def __init__(self, config): + + assert ( + hasattr(config, "disable_ffn_model_parallel") + or config.tensor_parallel_degree == 1 + ), f"fused mlp only suport mp-moe, mp={config.tensor_parallel_degree}" + assert config.fuse_attn_ffn, "fused mlp only support fuse_attn_ffn" + super().__init__() + self.moe_dropout_prob = config.moe_dropout_prob + self.num_local_experts = config.moe_num_experts // config.moe_world_size + logger.info( + f"fused-expert-weight-shape: {[self.num_local_experts, config.hidden_size, config.intermediate_size]}" + ) + + self.up_gate_proj = BMMLinear( + self.num_local_experts, config.hidden_size, config.intermediate_size * 2 + ) + self.down_proj = BMMLinear( + self.num_local_experts, config.intermediate_size, config.hidden_size + ) + self.fuse_swiglu = config.fuse_swiglu + if self.fuse_swiglu: + assert fused_swiglu is not None, "fused_swiglu operator is not found." 
+ + def __len__(self): + return self.num_local_experts + + def __iter__(self): + return (self for _ in range(1)) + + def forward(self, x): + if self.fuse_swiglu: + x = fused_swiglu(self.up_gate_proj(x)) + else: + gate, x = self.up_gate_proj(x).chunk(2, axis=-1) + x = F.silu(gate) * x + x = self.down_proj(x) + return x + + class ErnieDecoderLayerAuto(nn.Layer): """ ErnieDecoderLayerAuto is a decoder layer in Ernie model. @@ -940,23 +1464,11 @@ def __init__(self, config, layer_idx=0, ipp=0): self.create_moe_mlp_layer(layer_idx, ipp) else: self.mlp = ErnieMLP(config, ipp) - if config.use_rmsnorm: - Norm = RMSNorm(config) - elif config.use_fast_ln: - Norm = FastLayerNorm(config) - else: - Norm = nn.LayerNorm(config.hidden_size, epsilon=config.rms_norm_eps) - if config.pipeline_parallel_degree > 1: - Norm.weight = dist.shard_tensor( - Norm.weight, get_mesh(ipp), [dist.Replicate(), dist.Replicate()] - ) - if hasattr(Norm, "bias"): - Norm.bias = dist.shard_tensor( - Norm.bias, get_mesh(ipp), [dist.Replicate(), dist.Replicate()] - ) - - self.input_layernorm = Norm - self.post_attention_layernorm = Norm + Norm = RMSNorm if config.use_rmsnorm else LayerNorm + if not config.use_rmsnorm and config.fuse_ln: + Norm = FusedLayerNorm + self.input_layernorm = Norm(config, ipp) + self.post_attention_layernorm = Norm(config, ipp) self.residual_add1 = FusedDropoutImpl( config.hidden_dropout_prob, mode="upscale_in_train" ) @@ -1009,7 +1521,9 @@ def create_moe_mlp_layer(self, layer_idx, ipp): fc = [(_ex_cfg.moe_num_experts, fc_cls(_ex_cfg))] else: fc = [(_ex_cfg.moe_num_experts, fc_cls(_ex_cfg))] - gate, experts = get_gate(self.config, fc, layer_idx, self.ipp) + gate, experts, lm_gate, lm_experts = get_gate( + self.config, fc, layer_idx, self.ipp + ) _sh_cfg = deepcopy(self.config) if _sh_cfg.moe_num_shared_experts > 0: @@ -1393,9 +1907,13 @@ def __init__(self, config: ErnieMoEConfig): ) config.disable_ffn_model_parallel = True - mesh = fleet.auto.get_mesh() - if config.moe_group in 
mesh.dim_names: - config.moe_world_size = max(1, mesh.get_dim_size(config.moe_group)) + config.moe_group = _parse_moe_group(config.moe_group) + if config.moe_group in fleet.auto.get_mesh().dim_names: + config.moe_world_size = fleet.auto.get_mesh().get_dim_size( + config.moe_group + ) + if config.moe_world_size < 0: + config.moe_world_size = 1 else: config.moe_world_size = 1 @@ -1414,17 +1932,18 @@ def __init__(self, config: ErnieMoEConfig): self.config.tensor_parallel_degree > 1 or self.config.pipeline_parallel_degree > 1 ): - self.embed_tokens.weight = dist.shard_tensor( - self.embed_tokens.weight, - get_mesh(), - [dist.Replicate(), dist.Shard(1)], - ) + if not in_auto_parallel_align_mode(): + self.embed_tokens.weight = dist.shard_tensor( + self.embed_tokens.weight, + get_mesh(), + [dist.Replicate(), dist.Shard(1)], + ) layers_list = [] def get_layer_pp_info(ipp): mesh = fleet.auto.get_mesh() - if "pp" in mesh.dim_names: + if is_pp_enable() is False: return None, False else: pp_degree = mesh.get_dim_size("pp") @@ -1448,14 +1967,10 @@ def get_layer_pp_info(ipp): if input_need_reshard: self.next_pp_stage_indexes.append(layer_idx) self.layers = nn.LayerList(layers_list) - if config.use_rmsnorm: - Norm = RMSNorm(config) - elif config.use_fast_ln: - Norm = FastLayerNorm(config) - else: - Norm = nn.LayerNorm(config.hidden_size, epsilon=config.rms_norm_eps) - - self.norm = Norm + Norm = RMSNorm if config.use_rmsnorm else LayerNorm + if not config.use_rmsnorm and config.fuse_ln: + Norm = FusedLayerNorm + self.norm = Norm(config, -1) self.gradient_checkpointing = False @@ -1492,7 +2007,7 @@ def _prepare_decoder_attention_mask( ) combined_attention_mask = paddle.maximum( combined_attention_mask.astype(dtype), - paddle.to_tensor(float(paddle.finfo(dtype).min), dtype=dtype), + paddle.to_tensor(float(finfo(dtype).min), dtype=dtype), ) return combined_attention_mask @@ -1649,7 +2164,11 @@ def forward( has_gradient = not hidden_states.stop_gradient ipp = decoder_layer.ipp - if 
"pp" in fleet.auto.get_mesh().dim_names: + if not is_pp_enable(): + position_ids_input = position_ids + attention_mask_input = attention_mask + token_type_ids_input = token_type_ids + else: if position_ids is not None: position_ids_input = dist.reshard( position_ids, @@ -1676,10 +2195,6 @@ def forward( if token_type_ids is not None else None ) - else: - position_ids_input = position_ids - attention_mask_input = attention_mask - token_type_ids_input = token_type_ids if idx in self.next_pp_stage_indexes: hidden_states = dist.reshard( @@ -2073,7 +2588,11 @@ def __init__(self, config): config.tensor_parallel_degree > 1 ), f"sequence-parallel needs mp>1, got mp={config.tensor_parallel_degree}" - config.initializer_range = math.sqrt(0.3333 / config.hidden_size) + new_initializer_range = math.sqrt(0.3333 / config.hidden_size) + logger.info( + f"change initializer-range from {config.initializer_range} to {new_initializer_range}" + ) + config.initializer_range = new_initializer_range self.config = config self.ernie = ErnieModelAuto(config) self.lm_head = ErnieLMHead(config) @@ -2081,6 +2600,17 @@ def __init__(self, config): self.tie_weights() + if self.config.use_rmsnorm: + if self.config.fuse_rms_norm: + logger.info("Use fusedRMSNorm") + else: + logger.info("Use normal RMSNorm") + else: + if self.config.fuse_ln: + logger.info("Use fusedLN") + else: + logger.info("Use normal LayerNorm") + def _post_init(self, original_init, *args, **kwargs): """ Initialize weights and apply final processing diff --git a/examples/pre-training/models/ernie/modeling_auto_pp.py b/examples/pre-training/models/ernie/modeling_auto_pp.py index e77caaa2..5f74b76f 100644 --- a/examples/pre-training/models/ernie/modeling_auto_pp.py +++ b/examples/pre-training/models/ernie/modeling_auto_pp.py @@ -33,12 +33,13 @@ from models.moe.moe_utils_auto import get_mesh -from models.ernie.modeling import RMSNorm - from .modeling_auto import ( + _parse_moe_group, ErnieDecoderLayerAuto, ErniePretrainedModelAuto, - 
FastLayerNorm, + LayerNorm, + RMSNorm, + FusedLayerNorm, ErniePretrainingCriterion, ErnieLMHead, ) @@ -206,13 +207,14 @@ def __init__(self, config, layer_idx=0, ipp=0): None. """ if hasattr(config, "use_moe") and config.use_moe: - if config.moe_group.lower() in {"mp", "model", "tp", "mpdp"}: + if config.moe_group in {"mp", "model", "tp", "mpdp"}: assert config.sequence_parallel logger.info( f"disable FFN tensor model parallel, moe-group={config.moe_group}" ) config.disable_ffn_model_parallel = True + config.moe_group = _parse_moe_group(config.moe_group) if config.moe_group in fleet.auto.get_mesh().dim_names: config.moe_world_size = fleet.auto.get_mesh().get_dim_size( config.moe_group @@ -226,13 +228,14 @@ def __init__(self, config, layer_idx=0, ipp=0): self.config = config if hasattr(config, "use_moe") and config.use_moe: - if config.moe_group.lower() in {"mp", "model", "tp", "mpdp"}: + if config.moe_group in {"mp", "model", "tp", "mpdp"}: assert config.sequence_parallel logger.info( f"disable FFN tensor model parallel, moe-group={config.moe_group}" ) config.disable_ffn_model_parallel = True + config.moe_group = _parse_moe_group(config.moe_group) if config.moe_group in fleet.auto.get_mesh().dim_names: config.moe_world_size = fleet.auto.get_mesh().get_dim_size( config.moe_group @@ -271,14 +274,11 @@ def __init__(self, config, layer_idx=0, ipp=0): ) self.layer = ErnieDecoderLayerAuto(config, layer_idx, ipp) - if config.use_rmsnorm: - Norm = RMSNorm(config) - elif config.use_fast_ln: - Norm = FastLayerNorm(config) - else: - Norm = nn.LayerNorm(config.hidden_size, epsilon=config.rms_norm_eps) + Norm = RMSNorm if config.use_rmsnorm else LayerNorm + if not config.use_rmsnorm and config.fuse_ln: + Norm = FusedLayerNorm if self.layer_idx == self.config.num_hidden_layers - 1: - self.norm = Norm + self.norm = Norm(config, -1) self.lm_head = ErnieLMHead(config) def recompute_training( @@ -556,8 +556,8 @@ def __init__(self, config): else: logger.info("Use normal RMSNorm") 
else: - if self.config.use_fast_ln: - logger.info("Use FastLN") + if self.config.fuse_ln: + logger.info("Use fusedLN") else: logger.info("Use normal LayerNorm") From a74c379bfc721d38116ab0472da12b742d1b7a8c Mon Sep 17 00:00:00 2001 From: xuexixi Date: Mon, 18 Aug 2025 20:59:32 +0800 Subject: [PATCH 08/15] remove useless config --- examples/pre-training/ernie/pretrain_auto.py | 14 +- .../ernie/src/callbacks/__init__.py | 2 - .../callbacks/adaptivegradclip_callback.py | 120 ---- .../pre-training/ernie/src/clip/__init__.py | 6 +- examples/pre-training/ernie/src/clip/clip.py | 316 --------- .../src/trainers/pretraining_trainer_auto.py | 606 ++---------------- examples/pre-training/models/moe/moe_layer.py | 1 + .../pre-training/scripts/train_96_auto.sh | 71 +- .../pre-training/yamls/pretrain_96_auto.yaml | 52 +- 9 files changed, 51 insertions(+), 1137 deletions(-) delete mode 100644 examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py delete mode 100644 examples/pre-training/ernie/src/clip/clip.py diff --git a/examples/pre-training/ernie/pretrain_auto.py b/examples/pre-training/ernie/pretrain_auto.py index ab4299bc..04ad80b8 100644 --- a/examples/pre-training/ernie/pretrain_auto.py +++ b/examples/pre-training/ernie/pretrain_auto.py @@ -259,7 +259,6 @@ def main(): (args,) = parser.parse_dict(dict(**model_args, **trainer_args)) # 2. check and update - # setup_pipeline_config(config.trainer_args) if "enable_dp_comm_overlap" in config.trainer_args.pipeline_parallel_config: logger.warning( "Pipeline dp_comm_overlap and FusedLinearWithGradAdd cannot be used together." @@ -290,7 +289,7 @@ def main(): setup_logger_output_file(config.model_args.output_dir, args.local_rank) setup_device_and_seed(args) check_memory_preallocation(args) - run_fleet_tests() # liyamei not need? + run_fleet_tests() set_dtype(args) # 4. 
init model @@ -304,20 +303,11 @@ def main(): tokenizer = setup_tokenizer(args, cfg) with paddle.LazyGuard(): - if args.from_scratch: - model = model_class(cfg) - else: - model = model_class.from_pretrained(args.model_name_or_path, config=cfg) + model = model_class(cfg) logger.info(f"Using model: {type(model)}, config: {model.config}") paddle.set_default_dtype("float32") - # freeze # liyamei not need? - freeze_config = set(args.freeze_config.split()) - if "freeze_vision" in freeze_config and hasattr(model, "freeze_vision"): - logger.info("Freezing model vision module") - model.freeze_vision() - # 5. dataset logger.info("Loading datasets...") train_dataset, eval_dataset, test_dataset, data_collator = ( diff --git a/examples/pre-training/ernie/src/callbacks/__init__.py b/examples/pre-training/ernie/src/callbacks/__init__.py index 3b1384e1..684af4a9 100644 --- a/examples/pre-training/ernie/src/callbacks/__init__.py +++ b/examples/pre-training/ernie/src/callbacks/__init__.py @@ -15,7 +15,6 @@ from .gc_callback import GCCallback from .logging_callback import LoggingCallback from .stopper_callback import StopperCallback -from .adaptivegradclip_callback import ClipGradByAdaptiveNormCallback from .moe_correction_bias_adjust_callback import MoECorrectionBiasAdjustCallback from .moe_logging_callback import GlobalRNGCallback, MoeLoggingCallback from .sp_grad_sync_callback import SPGradSyncCallback @@ -33,6 +32,5 @@ "MoECorrectionBiasAdjustCallback", "FP8QuantWeightCallback", "OrthogonalCallback", - "ClipGradByAdaptiveNormCallback", "StopperCallback", ] diff --git a/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py b/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py deleted file mode 100644 index 00188856..00000000 --- a/examples/pre-training/ernie/src/callbacks/adaptivegradclip_callback.py +++ /dev/null @@ -1,120 +0,0 @@ -# !/usr/bin/env python3 - -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" ClipGradByAdaptiveNormCallback """ - -import os -import paddle -from paddleformers.trainer.trainer_callback import TrainerCallback -from paddleformers.trainer.trainer_utils import ( - PREFIX_CHECKPOINT_DIR, - get_last_checkpoint, -) -from src.utils import logger - - -class ClipGradByAdaptiveNormCallback(TrainerCallback): - """ - Load and save adaptive norm state hook, hack version - """ - - def on_train_begin(self, args, state, control, **kwargs): - """ - load adaptive norm state at the beginning of training. 
- """ - optimizer = kwargs.get("optimizer", None) - assert optimizer is not None - if optimizer._grad_clip is None: - logger.info("grad_clip is None.") - return - elif not hasattr(optimizer._grad_clip, "state_dict"): - logger.info("grad_clip {optimizer._grad_clip} has not state_dict method.") - return - - if args.adaptive_norm_force_clear_state: - logger.info("force clear ClipGradByAdaptiveNorm state dict.") - return - - resume_from_checkpoint = ( - None if not args.resume_from_checkpoint else args.resume_from_checkpoint - ) - if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: - resume_from_checkpoint = get_last_checkpoint(args.output_dir) - if resume_from_checkpoint is None: - raise ValueError( - f"No valid checkpoint found in output directory ({args.output_dir})" - ) - - if resume_from_checkpoint is None: - return - - if args.world_size > 1: - process_index = args.process_index - path = os.path.join( - resume_from_checkpoint, f"adaptivenorm_clip_state_{process_index}.pth" - ) - if not os.path.isfile(path): - logger.info( - f"Didn't find an adaptivenorm clip state file for process {process_index}, if you are resuming " - "a training that wasn't launched in a distributed fashion, reproducibility is not guaranteed." - ) - return - else: - path = os.path.join(resume_from_checkpoint, "adaptivenorm_clip_state.pth") - if not os.path.isfile(path): - logger.info( - "Didn't find an adaptivenorm clip state file, if you are resuming a training that was " - "launched in a distributed fashion, reproducibility is not guaranteed." - ) - return - - logger.info(f"Loading adaptivenorm clip state state to {path}") - state_dict = paddle.load(path) - - optimizer._grad_clip.set_state_dict(state_dict) - logger.info("load ClipGradByAdaptiveNorm state dict success.") - - def on_save(self, args, state, control, **kwargs): - """ - Event called after a checkpoint save. 
- """ - optimizer = kwargs.get("optimizer", None) - assert optimizer is not None - - if optimizer._grad_clip is None or not hasattr( - optimizer._grad_clip, "state_dict" - ): - return - - # Save model checkpoint - checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" - - run_dir = args.output_dir - - output_dir = os.path.join(run_dir, checkpoint_folder) - - os.makedirs(output_dir, exist_ok=True) - - if args.world_size > 1: - # use global process_index to save - process_index = args.process_index - path = os.path.join( - output_dir, f"adaptivenorm_clip_state_{process_index}.pth" - ) - else: - path = os.path.join(output_dir, "adaptivenorm_clip_state.pth") - logger.info(f"Saving randompos rng state to {path}") - paddle.save(optimizer._grad_clip.state_dict(), path) diff --git a/examples/pre-training/ernie/src/clip/__init__.py b/examples/pre-training/ernie/src/clip/__init__.py index 215b5156..6484ef44 100644 --- a/examples/pre-training/ernie/src/clip/__init__.py +++ b/examples/pre-training/ernie/src/clip/__init__.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .clip import ClipGradByAdaptiveNorm from .moe_clip import ClipGradForMOEByGlobalNorm -__all__ = [ - "ClipGradForMOEByGlobalNorm", - "ClipGradByAdaptiveNorm", -] +__all__ = ['ClipGradForMOEByGlobalNorm'] diff --git a/examples/pre-training/ernie/src/clip/clip.py b/examples/pre-training/ernie/src/clip/clip.py deleted file mode 100644 index d795061f..00000000 --- a/examples/pre-training/ernie/src/clip/clip.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -from collections import defaultdict -import paddle -import paddle.distributed as dist -from paddle.distributed import fleet - -try: - from paddle.base import framework -except ImportError: - from paddle.fluid import framework -from paddle.nn.clip import ClipGradBase, _squared_l2_norm -from src.utils import logger - - -class ClipGradByAdaptiveNorm(ClipGradBase): - - def __init__( - self, - clip_ratio=1.03, - start_clip_steps=100, - beta=0.98, - epsilon=1e-8, - shard_clip=False, - enable_record=False, - enable_record_clip_history=False, - verbose=False, - ): - super().__init__() - self.clip_ratio = clip_ratio - self.beta = beta - self.epsilon = epsilon - self.state = defaultdict(dict) - self.start_clip_steps = start_clip_steps - self.shard_clip = shard_clip - self.enable_record = enable_record - self.steps = 0 - self.enable_record_clip_history = enable_record_clip_history - self.verbose = verbose - self.keys = [ - "clip_ratio", - "beta", - "epsilon", - "start_clip_steps", - "shard_clip", - "enable_record", - "steps", - "enable_record_clip_history", - ] - - if start_clip_steps < 0: - raise ValueError( - "start_clip_steps {}, please start_clip_steps >= 0.".format( - start_clip_steps - ) - ) - - def __str__(self): - return "ClipGradByAdaptiveNorm, clip_ratio={}, beta={}, start_clip_steps={}, \ - shard_clip={}, enable_record={}".format( - self.clip_ratio, - self.beta, - self.start_clip_steps, - self.shard_clip, - self.enable_record, - ) - - def clip_by_norm(self, param, grad, norm_value, global_norm): - - state = self.state[param.name] - - if 
"norm_value" not in state: - state["norm_value"] = norm_value - - if "clip_times" not in state: - state["clip_times"] = 0 - - if self.enable_record_clip_history: - if "clip_history" not in state: - state["clip_history"] = {} - - avg_norm_value = state["norm_value"] - - if self.enable_record: - if "norm_history" not in state: - state["norm_history"] = {} - state["norm_history"][self.steps] = [ - float(norm_value), - float(avg_norm_value), - ] - - if self.steps <= self.start_clip_steps: - clip_coeff = 1.0 / (global_norm + self.epsilon) - if clip_coeff < 1.0: - grad.multiply_(clip_coeff) - param._reset_grad_inplace_version(True) - - if norm_value < state["norm_value"]: - state["norm_value"] = norm_value - else: - if norm_value > self.clip_ratio * avg_norm_value: - # clip grad - coef = (self.clip_ratio * avg_norm_value) / (norm_value + self.epsilon) - grad.multiply_(coef) - param._reset_grad_inplace_version(True) - norm_value_old = norm_value - norm_value = self.clip_ratio * avg_norm_value - state["clip_times"] = state["clip_times"] + 1 - if self.enable_record_clip_history: - state["clip_history"][self.steps] = [ - float(norm_value_old), - float(norm_value), - ] - if self.verbose: - logger.info( - "{} gradclip {} times, clip from {} to {}".format( - param.name, - state["clip_times"], - float(norm_value_old), - float(norm_value), - ) - ) - - logger.info( - "{} steps {}, gradclip {} times, clip_ratio {}, clip from {} to {}".format( - param.name, - self.steps, - state["clip_times"], - self.clip_ratio, - float(norm_value_old), - float(norm_value), - ) - ) - state["norm_value"] = avg_norm_value * self.beta + norm_value * ( - 1.0 - self.beta - ) - - return grad - - @paddle.no_grad() - def _dygraph_clip(self, params_grads): - global_norm_tensor = None - if self.steps <= self.start_clip_steps: - hcg = fleet.get_hybrid_communicate_group() - mp_size = hcg.get_model_parallel_world_size() - mp_group = hcg.get_model_parallel_group() - pp_size = hcg.get_pipe_parallel_world_size() - 
pp_group = hcg.get_pipe_parallel_group() - sharding_size = hcg.get_sharding_parallel_world_size() - sharding_group = hcg.get_sharding_parallel_group() - - norm_squared_values = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, "need_clip", True) is False: - continue - norm_squared_value = _squared_l2_norm(g) - if not p.is_distributed and mp_size > 1: - norm_squared_value = norm_squared_value / mp_size - norm_squared_values.append(norm_squared_value) - - global_norm_squared_tensor = paddle.stack(norm_squared_values).sum() - - if mp_size > 1: - dist.all_reduce(global_norm_squared_tensor, group=mp_group) - if pp_size > 1: - dist.all_reduce(global_norm_squared_tensor, group=pp_group) - if sharding_size > 1: - dist.all_reduce(global_norm_squared_tensor, group=sharding_group) - global_norm_tensor = paddle.sqrt(global_norm_squared_tensor) - - if self.verbose and global_norm_tensor is not None: - logger.info( - "step: {}, global norm: {}".format( - self.steps, float(global_norm_tensor) - ) - ) - - if hasattr(self, "sharding_stage1_v2") and self.sharding_stage1_v2: - need_sync = False - if not self.shard_clip: - hcg = fleet.get_hybrid_communicate_group() - mp_size = hcg.get_model_parallel_world_size() - mp_group = hcg.get_model_parallel_group() - sharding_size = hcg.get_sharding_parallel_world_size() - sharding_group = hcg.get_sharding_parallel_group() - if mp_size > 1 or sharding_size > 1: - need_sync = True - - norm_squared_values = [ - paddle.zeros([1], dtype=params_grads[0][1].dtype) - for _ in range(self.num_params) - ] - - for p, g in params_grads: - if g is None: - continue - if getattr(p, "need_clip", True) is False: - continue - norm_squared_value = _squared_l2_norm(g) - if need_sync and not p.is_distributed: - norm_squared_values[self.pname_to_paramindex[p.name]] = ( - 1 / mp_size - ) * norm_squared_value - else: - norm_squared_values[self.pname_to_paramindex[p.name]] = ( - norm_squared_value - ) - - num_has_grad = len(norm_squared_values) 
- norm_squared_tensor = paddle.concat(norm_squared_values, axis=0) - if need_sync: - if mp_size > 1: - dist.all_reduce(norm_squared_tensor, group=mp_group) - if sharding_size > 1: - dist.all_reduce(norm_squared_tensor, group=sharding_group) - - norm_tensor = paddle.sqrt(norm_squared_tensor) - norm_values = paddle.split(norm_tensor, num_has_grad, axis=0) - - params_and_grads = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, "need_clip", True) is False: - params_and_grads.append((p, g)) - continue - new_grad = self.clip_by_norm( - p, - g, - norm_values[self.pname_to_paramindex[p.name]], - global_norm_tensor, - ) - params_and_grads.append((p, new_grad)) - else: - need_sync = False - if not self.shard_clip: - hcg = fleet.get_hybrid_communicate_group() - mp_size = hcg.get_model_parallel_world_size() - mp_group = hcg.get_model_parallel_group() - if mp_size > 1: - need_sync = True - - norm_squared_values = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, "need_clip", True) is False: - continue - norm_squared_value = _squared_l2_norm(g) - if need_sync and not p.is_distributed: - norm_squared_values.append((1 / mp_size) * norm_squared_value) - else: - norm_squared_values.append(norm_squared_value) - - num_has_grad = len(norm_squared_values) - norm_squared_tensor = paddle.concat(norm_squared_values, axis=0) - if need_sync: - dist.all_reduce(norm_squared_tensor, group=mp_group) - - norm_tensor = paddle.sqrt(norm_squared_tensor) - norm_values = paddle.split(norm_tensor, num_has_grad, axis=0) - - params_and_grads = [] - idx = 0 - for p, g in params_grads: - if g is None: - continue - if getattr(p, "need_clip", True) is False: - params_and_grads.append((p, g)) - continue - new_grad = self.clip_by_norm(p, g, norm_values[idx], global_norm_tensor) - params_and_grads.append((p, new_grad)) - idx += 1 - - self.steps += 1 - return params_and_grads - - @framework.dygraph_only - def state_dict(self): - - state_dict = {k: v for k, v in 
self.state.items()} - for key in self.keys: - state_dict[key] = self.__dict__[key] - return state_dict - - @framework.dygraph_only - def set_state_dict(self, state_dict): - - if len(state_dict) == 0 or state_dict is None: - logger.info("state_dict is empty, please check if it is right.") - - for key in self.keys: - if key in state_dict: - self.__dict__[key] = state_dict[key] - else: - logger.info("Can't find [ {} ] in state_dict".format(key)) - - for k in state_dict: - if k in self.keys: - continue - self.state[k] = copy.deepcopy(state_dict[k]) diff --git a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py index b583fa14..ab95e149 100644 --- a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py +++ b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py @@ -25,7 +25,6 @@ import json import contextlib from typing import Optional -from collections import OrderedDict from dataclasses import dataclass, field import time import math @@ -59,18 +58,12 @@ DistributedBatchSampler as PaddleNLPDistributedBatchSampler, ) -try: - from paddleformers.trainer.trainer import ( - PADDLE_WEIGHT_FILE_NAME as PADDLE_WEIGHTS_NAME, - ) -except ImportError: - from paddleformers.utils.env import PADDLE_WEIGHTS_NAME + from paddleformers.trainer.utils import add_start_docstrings from paddleformers.trainer.trainer_callback import PrinterCallback from paddle.distributed import fleet import paddle.distributed as dist -from paddleformers.transformers.model_utils import _add_variant from src.lr_schedulers import get_cosine_schedule_with_warmup from src.utils.training_utils import ( @@ -80,25 +73,11 @@ TensorBoardCallback, LoggingCallback, StopperCallback, - ClipGradByAdaptiveNormCallback, ) from src.datasets.dist_data_loader import ( DistDataLoaderAuto, ) -from paddle.distributed import in_auto_parallel_align_mode -from src.clip import ClipGradByAdaptiveNorm, 
ClipGradForMOEByGlobalNorm - -try: - from paddleformers.trainer.trainer import ( - is_dp_group_support_in_group_sharded_parallel, - ) -except Exception: - - def is_dp_group_support_in_group_sharded_parallel(): - """ - hack for paddlenlp develop branch. - """ - return True +from src.clip import ClipGradForMOEByGlobalNorm logger = logging.getLogger(__name__) @@ -119,10 +98,6 @@ def is_dp_group_support_in_group_sharded_parallel(): @add_start_docstrings(AutoTrainingArguments.__doc__) class AutoPreTrainingArguments(AutoTrainingArguments): - vocab_path: str = field( - default=None, metadata={"help": "eb35 streaming data vocab"} - ) - task_need_convert: str = field(default=None, metadata={"help": "glm task id"}) multimodal: bool = field( default=False, metadata={"help": "whether training with multimodal"} ) @@ -133,20 +108,7 @@ class AutoPreTrainingArguments(AutoTrainingArguments): "https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" }, ) - vision_model_name_or_path: str = field( - default=None, - metadata={ - "help": "Path to pretrained model or model identifier from " - "https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" - }, - ) - inception_model_name_or_path: str = field( - default=None, - metadata={ - "help": "Path to pretrained model or model identifier from " - "https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" - }, - ) + prefetch_factor: int = field( default=2, metadata={"help": "global random seed factor."}, @@ -155,65 +117,17 @@ class AutoPreTrainingArguments(AutoTrainingArguments): default=-1, metadata={"help": "eval iteration for every evaluation."}, ) - num_consecutive: int = field( - default=1, - metadata={ - "help": "H5文件连续采样。为了保证AFS性能,在读取AFS H5文件的时候需要尽量读取一片ID" - ",这个参数指定了一次连续读取的`样本`大小" - }, - ) - train_emb_only: int = field( - default=0, - metadata={"help": "是否只训练embedding,通常用于热启换词表"}, - ) - use_train_part_sharding: Optional[int] = field( - default=1, - metadata={"help": 
"根据file进行数据切片,只在预训练时候使用。否则会很慢"}, - ) + min_lr: float = field( default=0.0, metadata={"help": "minus learning rate"}, ) - use_map_style_data: int = field( - default=0, - metadata={ - "help": "以为HF dataset为中心的 MapStyle SFT数据流(支持ShareGPT/DistillGPT)等数据", - }, - ) - use_streaming_data: int = field( - default=0, - metadata={ - "help": "标准线上明文数据流", - }, - ) - dataset: str = field( - default=None, - metadata={"help": "The name of the dataset to use (via the datasets library)."}, - ) - data_load_process_num: int = field( - default=10, - metadata={ - "help": "是否使用多进程加速原始数据读取,与DataLoader的num_workers意义不同" - }, - ) input_dir: str = field(default=None, metadata={"help": "data path"}) split: str = field( default="949,50,1", metadata={"help": "Train/valid/test data split ratio"} ) - data_dir: str = field(default=None, metadata={"help": "数据路径(指向一个目录)"}) - - data_filelist: str = field( - default=None, metadata={"help": "数据文件列表,与`args.data_dir`互斥"} - ) - data_weights: str = field(default=None, metadata={"help": "数据配比权重"}) - - dev_data: str = field( - default=None, - metadata={"help": "The name of the dataset to use (via the datasets library)."}, - ) - max_seq_length: int = field( default=512, metadata={ @@ -228,126 +142,29 @@ class AutoPreTrainingArguments(AutoTrainingArguments): "`gradient_accumulation_steps` will be ignored" }, ) - init_global_batch_size: int = field( - default=-1, - metadata={ - "help": "开启动态Batching。必须提供`global_batch_size`, " - "global_batch_size 会在 `batch_size_warumup_steps` 步内从 " - "`init_global_batch_size` 提升到 `global_batch_size`, " - "每次 `batchsize` 的提升量为`batch_size_warmup_increment`" - }, - ) - batch_size_warmup_steps: int = field( - default=-1, - metadata={ - "help": "开启动态Batching。必须提供`global_batch_size`, " - "global_batch_size 会在 `batch_size_warumup_steps` 步内从 " - "`init_global_batch_size` 提升到 `global_batch_size`, " - "每次 `batchsize` 的提升量为`batch_size_warmup_increment`" - }, - ) - batch_size_warmup_increment: int = field( - default=1, - metadata={ - 
"help": "开启动态Batching。必须提供`global_batch_size`, " - "global_batch_size 会在 `batch_size_warumup_steps` 步内从 " - "`init_global_batch_size` 提升到 `global_batch_size`, " - "每次 `batchsize` 的提升量为`batch_size_warmup_increment`" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - config_name: Optional[str] = field( - default=None, - metadata={ - "help": "Pretrained config name or path if not the same as model_name" - }, - ) + tokenizer_name: Optional[str] = field( default=None, metadata={ "help": "Pretrained tokenizer name or path if not the same as model_name" }, ) - init_ckpt: Optional[str] = field( - default=None, - metadata={}, - ) + sequence_parallel: Optional[int] = field( default=0, metadata={}, ) - config_file: Optional[str] = field( - default=None, - metadata={"help": "config file (YAML) to update hyper-parameters"}, - ) virtual_pp_degree: Optional[int] = field( default=1, metadata={ "help": "vpp", }, ) - from_scratch: Optional[int] = field(default=1, metadata={"help": "是否重头训练"}) - no_shuffle: Optional[int] = field(default=0, metadata={"help": "不要shuffle数据"}) - no_part_shuffle: Optional[int] = field( - default=0, metadata={"help": "不进行part内数据shuffle"} - ) - record_optimizer_stat: Optional[bool] = field( - default=False, metadata={"help": "是否记录优化器momentum信息"} - ) - skip_optimizer_badcases: Optional[bool] = field( - default=False, metadata={"help": "是否跳过optimizer badcase很多的step"} - ) - same_data: Optional[bool] = field( - default=False, - metadata={"help": "热启时,数据、配比、DP数是否完全一致, 支持续线"}, - ) - base_seq_length: Optional[int] = field( - default=4096, metadata={"help": "reeao最小seq_length"} - ) - shuffle_consecutive: Optional[bool] = field( - default=False, - metadata={ - "help": "是否对num_consecutive片段进行shuffle, same_data=True热启时,该值需与上一次保持一致" - }, - ) - global_shuffle_num_examples: Optional[int] = field( - default=0, - metadata={ - "help": 
"part间shuffle的num_example总数限制,默认不做限制, " - "这个值与最小配比的积 必须大于1, 改变该值时,需要设置same_data=False" - }, - ) - adaptive_norm_clip: Optional[bool] = field( - default=False, metadata={"help": "是否启用 AdaptiveNormClip 梯度裁剪策略"} - ) - adaptive_norm_clip_ratio: Optional[float] = field( - default=1.03, - metadata={"help": "AdaptiveNormClip 裁剪阈值, 大于设定的阈值才会启动裁剪"}, - ) - adaptive_norm_force_clear_state: Optional[bool] = field( - default=False, metadata={"help": "AdaptiveNormClip 强制清空 state dict"} - ) - adaptive_norm_shard_clip: Optional[bool] = field( - default=False, metadata={"help": "AdaptiveNormClip 在切分参数上是否在局部clip"} - ) - adaptive_norm_enable_record: Optional[bool] = field( - default=False, metadata={"help": "AdaptiveNormClip 是否启用统计历史norm值"} - ) - adaptive_norm_start_clip_steps: Optional[int] = field( - default=100, metadata={"help": "AdaptiveNormClip 开始裁剪的step"} - ) - adaptive_norm_enable_record_clip_history: Optional[bool] = field( - default=False, metadata={"help": "AdaptiveNormClip 是否启用统计历史裁剪的记录"} - ) - adaptive_norm_verbose: Optional[bool] = field( - default=False, metadata={"help": "AdaptiveNormClip 是否开启裁剪日志打印"} - ) + use_async_save: Optional[bool] = field( - default=False, metadata={"help": "是否开启异步保存功能"} + default=False, + metadata={"help": "Whether to use async_save instead of paddle.save."}, ) pre_alloc_memory: float = field( default=0.0, @@ -356,28 +173,24 @@ class AutoPreTrainingArguments(AutoTrainingArguments): "and release it for avoiding memory fragmentation" }, ) - enable_global_training_logs: bool = field( - default=False, metadata={"help": "是否启用global_training_logs"} - ) - use_dummy_dataset: Optional[bool] = field( - default=False, metadata={"help": "是否使用DummyDataSet, 仅用于Debug"} - ) - reshard_save_then_exit: Optional[bool] = field( - default=False, metadata={"help": "是否在reshard后直接退出程序"} - ) + moe_group: Optional[str] = field( - default="dp", metadata={"help": "moe 的通信组,目前支持“dp|sharding|mp|dummy”"} + default="dp", + metadata={ + "help": "The communication group of moe 
currently supports `dp|sharding|mp|dummy`" + }, ) use_moe: Optional[bool] = field( - default=False, metadata={"help": "expert parallel 临时替代"} + default=False, metadata={"help": "Temporary alternative to expert parallelism."} ) moe_use_all2all: Optional[bool] = field( - default=False, metadata={"help": "是否使用all2all通信方式"} + default=False, + metadata={"help": "Whether to use the all2all communication method."}, ) log_global_grad_norm: Optional[bool] = field( default=False, metadata={ - "help": "打印全局grad-norm, 只有在开启`enable_global_training_logs`时生效" + "help": "Print the global gradient norm, which only takes effect when `enable_global_training_logs` is enabled.." }, ) @@ -392,37 +205,22 @@ class AutoPreTrainingArguments(AutoTrainingArguments): "help": "The scheduler type to use. suppor linear, cosine, constant, constant_with_warmup" }, ) - image_token_len: int = field( - default=64, - metadata={"help": "number of images tokens from resampler per image"}, - ) - freeze_config: str = field( - default="", + + moe_gate_lr_ratio: float = field( + default=None, metadata={ "help": ( - "Some additional config for freeze params, we provide some option to config it." - "following config is support: freeze_vision,freeze_lm" + "When enabling MoE, apply special handling to the learning rate (LR) of the gate/router." 
) }, ) - moe_gate_lr_ratio: float = field( - default=None, - metadata={"help": ("启用 moe 时,对 gate/router 的 LR 做特殊处理")}, - ) vit_lr_ratio: float = field( default=None, - metadata={"help": ("启用vit训练时,对 vit 的 LR 做特殊处理")}, - ) - modality_interleave: str = field(default="acc", metadata={"help": "acc"}) - modality_ratio: tuple = field( - default=None, - metadata={"help": "ratio of modality tokens to be masked out"}, - ) - bos_retry_max_time: int = field( - default=0, metadata={"help": "when bos download failed, #retry times"} - ) - bos_retry_interval: float = field( - default=1, metadata={"help": "when bos download failed, interval between retry"} + metadata={ + "help": ( + "When enabling ViT training, apply special handling to the learning rate (LR) of ViT." + ) + }, ) pipeline_schedule_mode: str = field( @@ -433,17 +231,7 @@ class AutoPreTrainingArguments(AutoTrainingArguments): default="ErnieDecoderLayerAuto", metadata={"help": "The seg method of spliting pp layer for virtual pipeline."}, ) - pp_need_data_degree: int = field( - default=0, - metadata={ - "help": "pipline 并行中的机器也需要 fetch 数据,提升吞吐,搭配 `ErniemmMoEForCausalPipe` 使用" - }, - ) - pp_need_data: bool = field(default=False, metadata={"help": "向前兼容"}) - custom_data_status: str = field( - default=None, - metadata={"help": "load data status from custom trainer_state.json"}, - ) + model_type: Optional[str] = field( default="ernie", metadata={"help": "Only support for ernie pre-training for now."}, @@ -455,67 +243,14 @@ class AutoPreTrainingArguments(AutoTrainingArguments): @property def need_data(self): - - if self.pp_need_data_degree: - assert self.pipeline_parallel_degree > 1 - assert ( - self.pp_need_data_degree >= 2 - and self.pp_need_data_degree <= self.pipeline_parallel_degree - ), ( - self.pp_need_data_degree, - self.pipeline_parallel_degree, - ) - no_need_data_range = list( - range(self.pp_need_data_degree - 1, self.pipeline_parallel_degree - 1) - ) - return self.tensor_parallel_rank == 0 and ( - 
self.pipeline_parallel_rank not in no_need_data_range - ) return self.pipeline_parallel_rank == 0 and self.tensor_parallel_rank == 0 - @property - def combine_batch(self): - return self.max_seq_length // self.base_seq_length - - @property - def reeao_dataset_rank(self): - if not self.pp_need_data_degree: - return super().dataset_rank - no_need_data_range = list( - range(self.pp_need_data_degree - 1, self.pipeline_parallel_degree - 1) - ) - ranks = [ - i - for i in range(self.pipeline_parallel_degree) - if i not in no_need_data_range - ] - if self.pipeline_parallel_rank not in ranks: - return None - reeao_pp_rank = ranks.index(self.pipeline_parallel_rank) - - assert not (self.sharding_parallel_degree > 1 and self.data_parallel_rank > 1) - return ( - max(self.pp_need_data_degree, 1) * self.sharding_parallel_rank - + reeao_pp_rank - ) - @property def reeao_dataset_world_size(self): - if not self.pp_need_data: - return super().dataset_world_size - return ( - max(self.sharding_parallel_degree, 1) - * max(self.data_parallel_degree, 1) - * max(self.pipeline_parallel_degree, 1) - ) + return super().dataset_world_size def __post_init__(self): super().__post_init__() - if in_auto_parallel_align_mode(): - self.adaptive_norm_clip = False - self.adaptive_norm_clip_ratio = 0.0 - self.no_shuffle = 1 - self.no_part_shuffle = 1 assert ( self.global_batch_size @@ -555,31 +290,12 @@ def __post_init__(self): acc_steps, ) - if self.batch_size_warmup_steps > 0: - assert self.global_batch_size > 0, self.global_batch_size - assert self.init_global_batch_size > 0, self.init_global_batch_size - self.max_gradient_accumulation_steps = self.gradient_accumulation_steps - ( - self.per_device_train_batch_size, - self.gradient_accumulation_steps, - ) = reset_per_device_batch_size( - self.init_global_batch_size, - self.per_device_train_batch_size, - self.dataset_world_size, - ) - logger.info( - f"using progressive batching, accumulate step will increese from {self.gradient_accumulation_steps}" - 
f"to {self.max_gradient_accumulation_steps} in {self.batch_size_warmup_steps} steps" - ) - else: - self.max_gradient_accumulation_steps = ( - self.gradient_accumulation_steps - ) # hack add new + self.max_gradient_accumulation_steps = self.gradient_accumulation_steps if self.pipeline_parallel_degree > 1: self.per_device_eval_batch_size = ( self.per_device_train_batch_size * self.gradient_accumulation_steps - ) # hack Eval for PP! + ) logger.warn( f"eval_batch_size set to {self.per_device_eval_batch_size} in Pipeline Parallel!" ) @@ -587,24 +303,8 @@ def __post_init__(self): user_defined_strategy.strategy.pipeline_configs.accumulate_steps = ( self.gradient_accumulation_steps ) - if self.pp_need_data and not self.pp_need_data_degree: - self.pp_need_data_degree = self.pipeline_parallel_degree - if self.pp_need_data_degree: - assert ( - self.gradient_accumulation_steps % self.pp_need_data_degree == 0 - ), ( - f"gradient_accumulation_steps[{self.gradient_accumulation_steps}] should be divisible by " - f"pp_need_data_degree[{self.pp_need_data_degree}]" - ) - self.gradient_accumulation_steps = ( - self.gradient_accumulation_steps // self.pp_need_data_degree - ) - logger.info( - f"pp-need-data hack args.gradient_accumulation_steps to - {self.gradient_accumulation_steps}" - ) - self.max_gradient_accumulation_steps = ( - self.gradient_accumulation_steps - ) # hack add new + + self.max_gradient_accumulation_steps = self.gradient_accumulation_steps logger.info(f"fixing pp configs: {user_defined_strategy.pipeline_configs}") else: self.per_device_eval_batch_size = self.per_device_train_batch_size @@ -654,8 +354,6 @@ def __post_init__(self): f"accumulate_steps[{sd_configs.accumulate_steps}] * " f"per_device_train_batch_size[{self.per_device_train_batch_size}]" ) - if self.vision_model_name_or_path is not None: - self.multimodal = True class AutoPretrainingTrainer(AutoTrainer): @@ -670,10 +368,6 @@ def __init__(self, _shit=None, args=None, model=None, callbacks=[], **kwargs): ), ] 
+ callbacks - if args.adaptive_norm_clip: - callbacks.append( - ClipGradByAdaptiveNormCallback(), - ) args.use_async_save = ( args.use_async_save and args.save_sharded_model and args.load_sharded_model ) @@ -693,27 +387,6 @@ def get_numel_item(p): self.model_numel = numel_tensor.item() // self.args.dataset_world_size self.pop_callback(PrinterCallback) - self.pp_data_buffer = [] # pp - self._tokens_per_sec_per_card_buffer = [] - self._start_save_time = time.time() - self._end_save_time = time.time() - self._first_end_save_time = time.time() - self.resume_global_step = -1 - self.first_skip_step = ( - 5 if self.args.save_steps > 5 else self.args.save_steps / 2 - ) - if args.same_data: - logger.warning( - "You have set same_data=True. \ - Carefully check whether the data, population proportion, " - "and DP count are completely consistent with those before." - ) - else: - logger.warning( - "You have set same_data=False. \ - which will regenerate the global shuffle domain." - ) - # self.return_value = paddle.zeros([]) #fake return value def autocast_smart_context_manager(self): @@ -753,126 +426,6 @@ def autocast_smart_context_manager(self): return ctx_manager - def _load_optimizer_state(self, checkpoint): - # def _load_moe_optimizer_state(checkpoint): - # opt_moe_suffix = re.sub(r"moe\d\d", "moe00", self.args.optimizer_name_suffix) - # return self._load_optimizer_state_of_one_shard(checkpoint, opt_moe_suffix) - - def _broadcast_moe_optimizer_state(state_dict): - # boardcast_keys - base_state_dict = {"master_weights": {}} - buf = [ - { - i: j.shape - for i, j in state_dict.items() - if i not in ["master_weights", "LR_Scheduler"] - }, - {i: j.shape for i, j in state_dict["master_weights"].items()}, - {"LR_Scheduler": state_dict.get("LR_Scheduler", {})}, - ] - - if self.args.use_hybrid_parallel: - hcg = fleet.get_hybrid_communicate_group() - src_rank = hcg.get_data_parallel_group_src_rank() - group = hcg.get_data_parallel_group() - else: - src_rank = 0 - group = None - - 
dist.broadcast_object_list(buf, src=src_rank, group=group) - for k, s in buf[0].items(): - v = state_dict.get(k, paddle.zeros(s, "float32")).cuda() - v.name = k - dist.broadcast(v, src=src_rank, group=group) - logger.info(f"broadcast moe optimizer {k} from {src_rank}") - base_state_dict[k] = v.cpu() - for k, s in buf[1].items(): - v = ( - state_dict["master_weights"] - .get(k, paddle.zeros(s, "float32")) - .cuda() - ) - v.name = k - dist.broadcast(v, src=src_rank, group=group) - logger.info( - f"broadcast moe optimizer-master_weights {k} from {src_rank}" - ) - base_state_dict["master_weights"][k] = v.cpu() - base_state_dict.update(buf[2]) - return base_state_dict - - state_dict = super()._load_optimizer_state(checkpoint) - - if self.args.use_moe: - base_state_dict = _broadcast_moe_optimizer_state(state_dict) - if self.args.data_parallel_rank > 0: - master_weight = state_dict.pop("master_weights", {}) - base_state_dict.update(state_dict) - if master_weight: - if "master_weights" in base_state_dict: - base_state_dict["master_weights"].update(master_weight) - else: - base_state_dict["master_weights"] = master_weight - state_dict = base_state_dict - del base_state_dict - return state_dict - - def _save_moe_weights(self, output_dir): - - optimizer_name = _add_variant( - PADDLE_OPTIMIZER_NAME, self.args.optimizer_name_suffix - ) - saved_signal_path = os.path.join(output_dir, f"saved_signal_{dist.get_rank()}") - - os.makedirs(output_dir, exist_ok=True) - state_dict = self.model.state_dict() - optimzier_state_dict = self.optimizer.state_dict() - - filtered_state_dict = OrderedDict() - filter_optimzier_state_dict = OrderedDict() - - param_names_in_master_weights = ( - list(optimzier_state_dict["master_weights"].keys()) - if self.args.bf16 - else [] - ) - filter_optimzier_state_dict["master_weights"] = OrderedDict() - - for k, v in state_dict.items(): - if getattr(v, "no_sync", False): - - if v.name in param_names_in_master_weights: - 
filter_optimzier_state_dict["master_weights"][v.name] = ( - optimzier_state_dict["master_weights"][v.name] - ) - if not ( - getattr(self.args, "should_save_sharding_stage1_model", False) - or getattr(self.args, "save_sharding_stage1_model", False) - ): - filtered_state_dict[k] = v - for op_k, op_v in optimzier_state_dict.items(): - if op_k.startswith(v.name): - filter_optimzier_state_dict[op_k] = op_v - - if getattr(self.args, "should_save_sharding_stage1_model", False) or getattr( - self.args, "save_sharding_stage1_model", False - ): - self._save(output_dir=output_dir) - else: - if self.args.sharding_parallel_rank == 0: - paddle.save( - filtered_state_dict, - os.path.join( - output_dir, - _add_variant(PADDLE_WEIGHTS_NAME, self.args.weight_name_suffix), - ), - ) - paddle.save( - filter_optimzier_state_dict, os.path.join(output_dir, optimizer_name) - ) - with open(saved_signal_path, mode="w+") as f: - f.write("1") - def evaluate( self, eval_dataset=None, ignore_keys=None, metric_key_prefix: str = "eval" ): @@ -937,14 +490,14 @@ def get_train_dataloader(self): _DataLoader = partial( DistDataLoaderAuto, need_data=self.args.need_data, - pp_broadcast=not self.args.pp_need_data, + pp_broadcast=True, ) train_dataset = self.train_dataset if self._is_iterable_dataset(train_dataset): return DataLoader( train_dataset, - batch_size=None, # we do data collation in Stream + batch_size=None, collate_fn=self.data_collator, num_workers=self.args.dataloader_num_workers, use_shared_memory=True, @@ -962,21 +515,6 @@ def get_train_dataloader(self): prefetch_factor=self.args.prefetch_factor, ) - def _broadcast_final_loss(self, tr_loss): - tr_loss = tr_loss._local_value() if tr_loss.is_dist() else tr_loss - - if self.args.pipeline_parallel_degree > 1: - hcg = fleet.get_hybrid_communicate_group() - num_stages = hcg.get_pipe_parallel_world_size() - - paddle.distributed.broadcast( - tr_loss, - src=hcg.get_rank_from_stage(num_stages - 1), - sync_op=True, - 
group=hcg.get_pipe_parallel_group(), - ) - return tr_loss - def _maybe_log_save_evaluate( self, tr_loss, model, epoch, ignore_keys_for_eval, **kwargs ): @@ -1007,15 +545,7 @@ def create_optimizer(self, lr_scheduler=None): We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the Trainer's init through `optimizers`, or subclass and override this method in a subclass. """ - optimizer_params = ( - [p for n, p in self.model.named_parameters() if "embeddings" in n] - if self.args.train_emb_only - else self.model.parameters() - ) - if self.args.train_emb_only: - logger.info( - f"using `train-emb-only`, #embedding params={len(optimizer_params)}" - ) + optimizer_params = self.model.parameters() if self.optimizer is None: def need_decay(name): @@ -1037,45 +567,7 @@ def apply_decay_param_fun(x): self.args ) - if self.args.adaptive_norm_clip: - if "split_param" in self.args.sharding_parallel_config: - from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( - DygraphShardingOptimizerV2, - ) - - v2_assign_slice_grad = DygraphShardingOptimizerV2._assign_slice_grad - - def _assign_slice_grad(self): - v2_assign_slice_grad(self) - assert isinstance( - self._grad_clip, ClipGradByAdaptiveNorm - ), "self._grad_clip must be ClipGradByAdaptiveNorm" - if not hasattr(self._grad_clip, "pname_to_paramindex"): - pname_to_paramindex = {} - assert not isinstance(self._parameter_list[0], dict) - for idx, param in enumerate(self._parameter_list): - param = self._slice_params[param.name] - if param._is_initialized(): - pname_to_paramindex[param.name] = idx - self._grad_clip.pname_to_paramindex = pname_to_paramindex - self._grad_clip.num_params = len(self._parameter_list) - self._grad_clip.sharding_stage1_v2 = True - - DygraphShardingOptimizerV2._assign_slice_grad = _assign_slice_grad - logger.info( - "Hack DygraphShardingOptimizerV2._assign_slice_grad for ClipGradByAdaptiveNorm" - ) - - grad_clip = 
ClipGradByAdaptiveNorm( - clip_ratio=self.args.adaptive_norm_clip_ratio, - start_clip_steps=self.args.adaptive_norm_start_clip_steps, - shard_clip=self.args.adaptive_norm_shard_clip, - enable_record=self.args.adaptive_norm_enable_record, - enable_record_clip_history=self.args.adaptive_norm_enable_record_clip_history, - verbose=self.args.adaptive_norm_verbose, - ) - logger.info("using ClipGradByAdaptiveNorm") - elif ( + if ( self.args.use_moe and not self.args.use_hybrid_parallel and not self.args.enable_auto_parallel @@ -1102,10 +594,6 @@ def expert_fn(p): p.name: n for n, p in self.model.state_dict().items() } gate_pattern = re.compile(r"ernie\.layers\.0\.mlp\.gate\.weight") - vit_pattern = re.compile( - r"vision_model\.(cls_token|pos_embed|patch_embed|blocks)" - ) - vit_blocks_pattern = re.compile(r"vision_model\.blocks\.(\d+)\.") def lr_ratio_fn(param): if param.name in self.static_name_to_dyg_name.keys(): @@ -1117,15 +605,7 @@ def lr_ratio_fn(param): f"apply moe_gate_lr_ratio to {name}, ratio={self.args.moe_gate_lr_ratio}" ) return float(self.args.moe_gate_lr_ratio) - elif self.args.vit_lr_ratio is not None and vit_pattern.match(name): - n_layers = self.model.config.vision_config.layers - if vit_blocks_pattern.match(name): - layer_id = int(vit_blocks_pattern.match(name).group(1)) - else: - layer_id = 0 - lr_ratio = self.args.vit_lr_ratio ** (n_layers - 1 - layer_id) - logger.info(f"apply vit lr_ratio to {name}, ratio={lr_ratio}") - return float(lr_ratio) + return 1.0 self.optimizer = optimizer_cls( @@ -1138,12 +618,7 @@ def lr_ratio_fn(param): grad_clip=grad_clip, multi_precision=True, lr_ratio=( - lr_ratio_fn - if ( - self.args.moe_gate_lr_ratio is not None - or self.args.vit_lr_ratio is not None - ) - else None + lr_ratio_fn if self.args.moe_gate_lr_ratio is not None else None ), **optimizer_kwargs, ) @@ -1163,9 +638,6 @@ def save_model(self, output_dir=None): ) as of: of.write(json.dumps(self.static_name_to_dyg_name)) - def _load_rng_state(self, checkpoint): 
- pass - def _get_meshes_for_loader(self): def _get_mesh(pp_idx=0): return self.global_mesh.get_mesh_with_dim("pp")[pp_idx] diff --git a/examples/pre-training/models/moe/moe_layer.py b/examples/pre-training/models/moe/moe_layer.py index 9aa8a2ce..498c577e 100644 --- a/examples/pre-training/models/moe/moe_layer.py +++ b/examples/pre-training/models/moe/moe_layer.py @@ -200,6 +200,7 @@ def forward(ctx, x, combine_weights, scatter_index): @staticmethod def backward(ctx, grad_y, *_): + # assert moe_combine is not None grad_x, grad_combine_weight_helper = paddle._C_ops.moe_combine_grad( ctx.x, ctx.combine_weights, ctx.scatter_index, grad_y ) diff --git a/examples/pre-training/scripts/train_96_auto.sh b/examples/pre-training/scripts/train_96_auto.sh index 4deaf98d..25f139c8 100644 --- a/examples/pre-training/scripts/train_96_auto.sh +++ b/examples/pre-training/scripts/train_96_auto.sh @@ -14,22 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-export NNODES=1 -export PADDLE_TRAINERS_NUM=1 - -mpi_rank=${OMPI_COMM_WORLD_RANK:-0} -node_rank=$((mpi_rank+offset)) -mpi_node=${OMPI_COMM_WORLD_SIZE:-1} -echo "MPI status:${mpi_rank}/${mpi_node}" -nnode_train=${nnode_set:-${mpi_node}} -master_train=${master:-localhost} -# -echo "Distributed Training ${node_rank}/${nnode_train} master=${master_train}" -set -x - export CUDA_MODULE_LOADING=LAZY export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_DEBUG=INFO export PYTHONUNBUFFERED=1 unset GLOG_vmodule GLOG_v export PADDLE_DISABLE_CUDNN_FA=1 @@ -41,24 +27,6 @@ export FLAGS_tcp_max_syn_backlog=16384 export FLAGS_call_stack_level=2 -# 屏蔽平台预设的环境变量,因为框架采用兼容升级,检测到这些配置会使用原方式启动 -unset PADDLE_ELASTIC_JOB_ID -unset PADDLE_TRAINER_ENDPOINTS -unset DISTRIBUTED_TRAINER_ENDPOINTS -unset FLAGS_START_PORT -unset PADDLE_ELASTIC_TIMEOUT -nnodes=$PADDLE_TRAINERS_NUM -rank=$PADDLE_TRAINER_ID - - -export FLAGS_shard_use_reduce=1 -export FLAGS_shard_norm_align_dp=0 - -#加速pin memory save ckpt时间 -export FLAGS_use_auto_growth_pinned_allocator=True - -# export FLAGS_flash_attn_version=v1 -# 开启FA3 SM=`nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1 | sed 's/\.//g'` if [ $SM -eq 90 ] then @@ -67,52 +35,17 @@ else export FLAGS_flash_attn_version=2 fi -# 保证集群稳定性的配置,跟性能无关 -export NCCL_IB_QPS_PER_CONNECTION=8 -export NCCL_IB_TIMEOUT=22 -export NCCL_IB_GID_INDEX=3 -# 开启AR功能 -export NCCL_IB_ADAPTIVE_ROUTING=1 -# 集群hang检测 -export PADDLE_PG_TIMEOUT=150000 # 通信组超时时间,单位是ms,默认2分钟 -export FLAGS_enable_async_trace=False # True开启通信debug功能,False或不设置关闭,默认开启 -# export CUDA_MODULE_LOADING=LAZY - -export FLAGS_pipeline_nccl_comm_init_option=1 - -# 启动方式 -cuda_version=`nvidia-smi |grep "CUDA Version" |awk '{print $9}' |awk -F'.' 
'{print $1}'` -if [ ${cuda_version} != "12" ];then - export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH -fi - -master=`cat /root/paddlejob/workspace/hostfile | head -n 1 | awk '{print $1}'` -port=36677 - - -#自动并行相关 export FLAGS_enable_fused_ffn_qkv_pass=1 export FLAGS_enable_pir_api=1 -#export FLAGS_enable_sharding_stage1_tensor_fusion=1 export FLAGS_enable_moe_utils=true - -#调试相关 export FLAGS_call_stack_level=2 -#export GLOG_v=6 -#export FLAGS_print_ir=1 -#export FLAGS_benchmark=1 -#export CUDA_VISIBLE_DEVICES=0,1 - -export PYTHONPATH=$PYTHONPATH:./ernie -LOG_DIR=output/paddle_distributed_logs -rm -rf output -rm -rf core.* +export PYTHONPATH=$PYTHONPATH:./ernie python -m paddle.distributed.launch \ - --log_dir $LOG_DIR \ + --log_dir output/paddle_distributed_logs \ --run_mode=collective \ ${script:-ernie/pretrain_auto.py} \ --config yamls/pretrain_96_auto.yaml diff --git a/examples/pre-training/yamls/pretrain_96_auto.yaml b/examples/pre-training/yamls/pretrain_96_auto.yaml index 730f0abf..ecc993e9 100644 --- a/examples/pre-training/yamls/pretrain_96_auto.yaml +++ b/examples/pre-training/yamls/pretrain_96_auto.yaml @@ -1,4 +1,3 @@ -# -----------环境变量----------------------# env: HOME: null @@ -7,7 +6,6 @@ model_args: model_name_or_path: model_configs_auto/ tokenizer_name: ./ernie/src/tokenizers/tokenizer_model output_dir: ./output/ - data_load_process_num: 40 max_seq_length: 4096 base_seq_length: 4096 num_consecutive: 32 @@ -27,20 +25,6 @@ model_args: trainer_args: input_dir: "0.4 ./demo_data/data-1-part0 0.6 ./demo_data/data-1-part0" split: "998,1,1" - loss_spike_settings: - enable_loss_spike_watcher: 1 - longjob_id: long-78f0ae68688b4659 - supervised_filename: output/paddle_distributed_logs/metrics_rank0.json - delimiter: "Loading configuration file" - watch_loss_spike_interval: 20 - loss_spike_restart_interval: 300 - params: - - data_type: null - data_type_human_read: "纯文" - max_loss_thr: 2.0 - max_tolerance_steps: 1 - allow_loss_fallback: 0 - 
start_check_step: 219700 use_sp_callback: true moe_gate_lr_ratio: 0.01 @@ -60,8 +44,8 @@ trainer_args: learning_rate: 2.2e-4 min_lr: 2.2e-5 - global_batch_size: 2 # 16660 - gradient_accumulation_steps: 1 # 8008: 14; + global_batch_size: 2 + gradient_accumulation_steps: 1 per_device_train_batch_size: 2 per_device_eval_batch_size: 1 @@ -69,22 +53,16 @@ trainer_args: decay_function: 1-sqrt max_grad_norm: 1.0 - adaptive_norm_clip: 0 # 4350 step后,关闭 adaptive-norm-clip - adaptive_norm_clip_ratio: 1.2 - adaptive_norm_force_clear_state: 0 # 在切换分布式策略时, 开启强制刷新统计状态 - adaptive_norm_enable_record: 1 # 开启更详细的裁剪日志 - use_async_save: True # enable asynchronize save to gain efficiency + use_async_save: True weight_decay: 0.1 warmup_steps: 200 save_total_limit: 5 bf16: True fp16_opt_level: "O2" - use_fp8: False scale_loss: 4096 seed: 666 - use_train_part_sharding: 1 pre_alloc_memory: 60 # # N7 @@ -104,24 +82,20 @@ trainer_args: # N1 dynamic auto tensor_parallel_degree: 4 # N7:8, N4:8, N1:4 - # pipeline_parallel_degree: 1 # N7:7, N4:4, N1:2 pipeline_parallel_degree: 2 # N7:7, N4:4, N1:2 n_microbatches: 2 pipeline_schedule_mode: "VPP" model_type: "ernie_pp" - virtual_pp_degree: 1 # N7:8, N4:8, N1:1 + virtual_pp_degree: 1 data_parallel_degree: 1 sharding: "stage1" - sharding_degree: 1 # 170 - # sharding_degree: 170 # + sharding_degree: 1 amp_master_grad: 1 - pipeline_parallel_config: enable_delay_scale_loss #enable_dp_comm_overlap - # pipeline_parallel_config: enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler #enable_dp_comm_overlap + pipeline_parallel_config: enable_delay_scale_loss sharding_parallel_config: split_param enable_fuse_optimizer_states sharding_comm_buffer_size_MB: 2048 tensor_parallel_config: replace_with_parallel_cross_entropy - # tensor_parallel_config: sync_param sync_grad sync_moment skip_profile_timer: False @@ -131,27 +105,13 @@ trainer_args: load_sharded_model: True save_sharded_model: True - save_sharding_stage1_model_include_freeze_params: 
True ignore_load_lr_and_optim: False metrics_output_path: ./output/paddle_distributed_logs/ - #TODO(@gexiao): move to longjob_args - pdc_download_ckpt: true - pdc_download_timeout: 300 - # # Flash checkpoint settings - # enable_zero_cost_checkpoint: true - # save_tokenizer: false - # save_rng_states: false - # zcc_workers_num: 1 - # zcc_pipeline_hooks_capacity_usage: 0.8 - # flash_device_save_steps: 4 - # zcc_save_ema_coef: 0.9993 #exp((4/10000)*ln(1-0.9999)) - # zcc_ema_interval: 4 use_moe: true - moe_with_send_router_loss: False moe_group: mp log_global_grad_norm: True enable_optimizer_timer: False From b0e5a53dcb6c1655a42968b6ce869427db530e23 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Mon, 18 Aug 2025 21:14:29 +0800 Subject: [PATCH 09/15] revert pretrain_auto --- examples/pre-training/ernie/pretrain_auto.py | 434 +++++++++---------- 1 file changed, 199 insertions(+), 235 deletions(-) diff --git a/examples/pre-training/ernie/pretrain_auto.py b/examples/pre-training/ernie/pretrain_auto.py index 04ad80b8..4ef4b6cb 100644 --- a/examples/pre-training/ernie/pretrain_auto.py +++ b/examples/pre-training/ernie/pretrain_auto.py @@ -12,20 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import json import os -import random import time -from typing import Dict, Any - +import json import numpy as np +import random import paddle -from omegaconf import ListConfig, DictConfig -from paddle.distributed.fleet import fleet, collective_perf - -from paddleformers.trainer import PdArgumentParser, get_last_checkpoint - -from config import get_config +import paddle.distributed.fleet as fleet +from src.utils import logger +from paddleformers.trainer import ( + PdArgumentParser, + get_last_checkpoint, +) +from src.tokenizers.tokenization_eb_v2 import ErnieBotTokenizer +from omegaconf.listconfig import ListConfig +from omegaconf.dictconfig import DictConfig +from src.callbacks import ( + GlobalRNGCallback, +) from models.ernie import ( ErnieForCausalLMAuto, ErnieForCausalLMAutoPP, @@ -34,129 +38,204 @@ ErnieConfig, ErnieMoEConfig, ) -from pretrain import create_pretrained_dataset -from src.callbacks import GlobalRNGCallback -from src.tokenizers.tokenization_eb_v2 import ErnieBotTokenizer from src.trainers import AutoPretrainingTrainer, AutoPreTrainingArguments -from src.utils import logger, setup_logger_output_file +from src.utils import ( + setup_logger_output_file, +) from src.utils.misc import global_training_logs +from pretrain import create_pretrained_dataset -def log_trainer_start(): - if "MAIN_PROCESS_STARTED" not in os.environ: - start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - logger.info( - f"Training Main Process Started. time: {start_time}, pid: {os.getpid()}" - ) - os.environ["MAIN_PROCESS_STARTED"] = "1" +from config import get_config + +try: + from paddleformers.trainer.trainer_utils import log_trainer_start +except ImportError: + + def log_trainer_start(): + """Print main process messgae""" + if "MAIN_PROCESS_STARTED" not in os.environ: + start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + logger.info( + f"The Training Main Process Started Successfully. 
time: {start_time}, pid: {os.getpid()}" + ) + os.environ["MAIN_PROCESS_STARTED"] = "1" -def format_config_value(v): - if isinstance(v, (ListConfig, DictConfig)): - return list(v) if isinstance(v, ListConfig) else dict(v) - return v +from paddle.distributed.fleet import collective_perf +log_trainer_start() -def update_model_config_from_args( - config: ErnieConfig, model_args: Dict[str, Any] -) -> ErnieConfig: + +def update_model_config_from_args(config: ErnieConfig, model_args: dict): for k, v in model_args.items(): if hasattr(config, k): - logger.info(f"Updating model config: {k} = {v}") + logger.info(f"update model config: {k} = {v}") setattr(config, k, v) else: - logger.warning(f"Model config key '{k}' does not exist") + logger.warning(f"model config key: {k} does not exist") return config -def init_parameters(model): +def init_parameter(model): + for param in model.parameters(): param.initialize() - model.apply(model.init_weights) -def setup_device_and_seed(args): +def main(): + """Main function""" + config = get_config(verbose=True) + os.makedirs(config.model_args.output_dir, exist_ok=True) + parser = PdArgumentParser(AutoPreTrainingArguments) + if not hasattr(config.trainer_args, "pipeline_parallel_config"): + config.trainer_args.pipeline_parallel_config = "" + + if "enable_dp_comm_overlap" in config.trainer_args.pipeline_parallel_config: + logger.warning( + "Pipeline dp_comm_overlap and FusedLinearWithGradAdd can not be used at " + "the same time." 
+ ) + + if "enable_timer" in config.trainer_args.pipeline_parallel_config: + from paddle.distributed.fleet.meta_parallel.pipeline_parallel import ( + PipelineParallel, + ) + + PipelineParallel.timer_printer = lambda _: None + + def formatv(v): + if isinstance(v, ListConfig): + return list(v) + elif isinstance(v, DictConfig): + return dict(v) + return v + + model_args = {k: formatv(v) for k, v in dict(config.model_args).items()} + trainer_args = {k: formatv(v) for k, v in dict(config.trainer_args).items()} + (args,) = parser.parse_dict(dict(**model_args, **trainer_args)) + + if args.strategy.pipeline.enable and args.virtual_pp_degree > 1: + pipeline = args.strategy.pipeline + pipeline.vpp_degree = args.virtual_pp_degree + pipeline.vpp_seg_method = args.virtual_pipeline_seg_method + + args.eval_iters = 10 + args.test_iters = args.eval_iters * 10 + + args.use_moe = dict(**dict(config.model_args), **dict(config.trainer_args)).get( + "use_moe", False + ) + model_config = dict(getattr(config.model_args, "model_config", {})) + model_config = {k: formatv(v) for k, v in model_config.items()} + logger.info(f"model_config_from_yaml: {json.dumps(model_config, indent=4)}") + setup_logger_output_file(config.model_args.output_dir, args.local_rank) paddle.set_device(args.device) np.random.seed(args.seed) random.seed(args.seed) paddle.seed(args.seed) - -def check_memory_preallocation(args): prop = paddle.device.cuda.get_device_properties() - if prop.total_memory < args.pre_alloc_memory * (1024**3): - logger.warning("Invalid value for `pre_alloc_memory`, pre-allocation failed.") + if prop.total_memory < args.pre_alloc_memory * 1024 * 1024 * 1024: + logger.warning( + "Invalid value for `pre_alloc_memory`, so pre-allocating just failed." + ) elif args.pre_alloc_memory > 0: logger.warning( - f"Pre-allocating a tensor {args.pre_alloc_memory}GB memory and then release it" + f"pre-allocating a tensor whose memory capacity is {args.pre_alloc_memory} GB " + "and then release it." 
) - memory_size = int(args.pre_alloc_memory * 1024**3) + memory_size = int(args.pre_alloc_memory * 1024 * 1024 * 1024) x = paddle.empty([memory_size], dtype=paddle.uint8) del x - -def run_fleet_tests(): try: - tests = [ - ("allgather", {67108864: 0.00625, 234881024: 0.02, 637534208: 0.057}), - ("allreduce", {67108864: 0.02, 134217728: 0.038, 268435456: 0.075}), - ] - for test_name, size_time_map in tests: - collective_perf(test_name, round=50, size_and_time=size_time_map) - logger.info(f"======monitor {test_name} done!=======\n") + collective_perf( + "allgather", + round=50, + size_and_time={67108864: 0.00625, 234881024: 0.02, 637534208: 0.057}, + ) + logger.info("======monitor allgather done!=======\n") + collective_perf( + "allreduce", + round=50, + size_and_time={67108864: 0.02, 134217728: 0.038, 268435456: 0.075}, + ) + logger.info("======monitor allreduce done!=======\n") except Exception as e: - logger.warning(f"Fleet test error: {e}, skipping...") - - -def compute_metrics(p, tokenizer): - preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions - output = paddle.to_tensor(preds) - labels = paddle.to_tensor(p.label_ids) - - output = [t.astype("float32").cuda() for t in output] - labels = [t[t != tokenizer.ignored_index].cuda() for t in labels] - - all_numel = ( - (paddle.concat(labels, 0) != tokenizer.ignored_index).astype("int64").sum() - ) - ignored = (paddle.concat(labels, 0) == -100).astype("int64").sum() - valid_tokens = all_numel - ignored - - total_output = sum(output) - nll_loss = total_output / (valid_tokens + 1e-6) - ppl = paddle.exp(nll_loss) - - logger.info(f"Output: {output[0].item()}, Valid tokens: {valid_tokens.item()}") - - return { - "nll_loss": nll_loss.item(), - "ppl": ppl.item(), - "num_token": valid_tokens.item(), - } + logger.warning(f"fleet test unexcepted error! skip exception[{e}]...") + + # Detecting last checkpoint. 
+ last_checkpoint = None + if ( + os.path.isdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(args.output_dir) + if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0: + raise ValueError( + f"Output directory ({args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Define the metrics of tasks. + def compute_metrics(p): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + + output = paddle.to_tensor(preds) + labels = paddle.to_tensor(p.label_ids) + output = [t.astype("float32").cuda() for t in output] + labels = [t[t != tokenizer.ignored_index] for t in labels] + labels = [t.cuda() for t in labels] + all_numel = ( + (paddle.concat(labels, 0) != tokenizer.ignored_index).astype("int64").sum() + ) + ignored = (paddle.concat(labels, 0) == -100).astype("int64").sum() + labels = all_numel - ignored + output = sum(output) + logger.info(f"output : {output.item()}, labels : {labels.item()}") + nll_loss = output / (labels + 1.0e-6) + ppl = paddle.exp(nll_loss) + + return { + "nll_loss": nll_loss.item(), + "ppl": ppl.item(), + "num_token": labels.item(), + } + + # model + dtype = "float32" + if args.fp16 and args.fp16_opt_level == "O2": + paddle.set_default_dtype("float16") + dtype = "float16" + elif args.bf16: + paddle.set_default_dtype("bfloat16") + dtype = "bfloat16" + if args.use_moe: + global ErnieConfig, ErnieForCausalLMAuto + ErnieConfig = ErnieMoEConfig -def setup_model_config(args, model_config): - config_cls = ErnieMoEConfig if args.use_moe else ErnieConfig if args.moe_group.lower() in {"mp", "tp", "model", "dummy"}: 
logger.info(f"disable moe flag when using moe-group={args.moe_group}") args.use_moe = False + args.multi_token_pred_depth = model_config.get("multi_token_pred_depth", 0) - cfg = config_cls.from_pretrained(args.model_name_or_path) - - update_params = { - "seqlen": args.max_seq_length, - "token_balance_seqlen": args.max_seq_length * args.per_device_train_batch_size, - "fp16_opt_level": args.fp16_opt_level, - "moe_group": args.moe_group, - "dtype": get_dtype(args), - "pipeline_parallel_degree": args.pipeline_parallel_degree, - "virtual_pp_degree": args.virtual_pp_degree, - "micro_batch_size": args.per_device_train_batch_size, - } - - for key, value in update_params.items(): - setattr(cfg, key, value) + cfg = ErnieConfig.from_pretrained(args.model_name_or_path) + cfg.seqlen = args.max_seq_length + cfg.token_balance_seqlen = args.max_seq_length * args.per_device_train_batch_size + cfg.fp16_opt_level = args.fp16_opt_level + cfg.moe_group = args.moe_group + cfg.dtype = dtype + cfg.pipeline_parallel_degree = args.pipeline_parallel_degree + cfg.virtual_pp_degree = args.virtual_pp_degree if args.tensor_parallel_degree > 1: cfg.sequence_parallel = args.sequence_parallel cfg.tensor_parallel_degree = max( @@ -170,154 +249,42 @@ def setup_model_config(args, model_config): cfg.tensor_parallel_degree = 1 cfg.tensor_parallel_rank = 0 - return update_model_config_from_args(cfg, model_config) - - -def get_dtype(args): - if args.fp16 and args.fp16_opt_level == "O2": - return "float16" - if args.bf16: - return "bfloat16" - return "float32" - - -def set_dtype(args): - if args.fp16 and args.fp16_opt_level == "O2": - paddle.set_default_dtype("float16") - if args.bf16: - paddle.set_default_dtype("bfloat16") - return - - -def get_model_class(args): - if args.model_type == "ernie": - return ErnieForCausalLMAuto - if args.model_type == "ernie_pp": - return ErnieForCausalLMAutoPP - raise ValueError(f"Unsupported model_type: {args.model_type}") - - -def setup_tokenizer(args, config): + 
cfg.micro_batch_size = args.per_device_train_batch_size tokenizer = ErnieBotTokenizer.from_pretrained(args.tokenizer_name) - tokenizer.ignored_index = config.ignored_index + tokenizer.ignored_index = cfg.ignored_index logger.info( - f"Using tokenizer={type(tokenizer)}, bos:{tokenizer.bos_token_id} " - f"eos:{tokenizer.eos_token_id} pad:{tokenizer.pad_token_id}" + f"using tokenizer={type(tokenizer)}, bos:{tokenizer.bos_token_id} " + f"eos:{tokenizer.eos_token_id} pad:{tokenizer.pad_token_id} " ) - return tokenizer - - -def get_checkpoint(args, output_dir): - if not os.path.isdir(output_dir) or not args.do_train or args.overwrite_output_dir: - return None - - last_checkpoint = get_last_checkpoint(output_dir) - if last_checkpoint is None and len(os.listdir(output_dir)) > 0: - raise ValueError( - f"Output directory ({output_dir}) exists and is not empty. " - "Use --overwrite_output_dir to train from scratch." - ) - if last_checkpoint is not None and args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. " - "To avoid this, change --output_dir or add --overwrite_output_dir." - ) - return args.resume_from_checkpoint or last_checkpoint + cfg = update_model_config_from_args(cfg, model_config) - -def setup_pipeline_config(args): - if "enable_dp_comm_overlap" in args.pipeline_parallel_config: - logger.warning( - "Pipeline dp_comm_overlap and FusedLinearWithGradAdd cannot be used together." - ) - if "enable_timer" in args.pipeline_parallel_config: - from paddle.distributed.fleet.meta_parallel.pipeline_parallel import ( - PipelineParallel, - ) - - PipelineParallel.timer_printer = lambda _: None - if args.strategy.pipeline.enable and args.virtual_pp_degree > 1: - pipeline = args.strategy.pipeline - pipeline.vpp_degree = args.virtual_pp_degree - pipeline.vpp_seg_method = args.virtual_pipeline_seg_method - return args - - -def main(): - # 1. 
init config and parse arg - config = get_config(verbose=True) - if not hasattr(config.trainer_args, "pipeline_parallel_config"): - config.trainer_args.pipeline_parallel_config = "" - os.makedirs(config.model_args.output_dir, exist_ok=True) - - model_args = {k: format_config_value(v) for k, v in dict(config.model_args).items()} - trainer_args = { - k: format_config_value(v) for k, v in dict(config.trainer_args).items() - } - parser = PdArgumentParser(AutoPreTrainingArguments) - (args,) = parser.parse_dict(dict(**model_args, **trainer_args)) - - # 2. check and update - if "enable_dp_comm_overlap" in config.trainer_args.pipeline_parallel_config: - logger.warning( - "Pipeline dp_comm_overlap and FusedLinearWithGradAdd cannot be used together." - ) - - if "enable_timer" in config.trainer_args.pipeline_parallel_config: - from paddle.distributed.fleet.meta_parallel.pipeline_parallel import ( - PipelineParallel, - ) - - PipelineParallel.timer_printer = lambda _: None - - if args.strategy.pipeline.enable and args.virtual_pp_degree > 1: - pipeline = args.strategy.pipeline - pipeline.vpp_degree = args.virtual_pp_degree - pipeline.vpp_seg_method = args.virtual_pipeline_seg_method - - args.use_moe = dict(**dict(config.model_args), **dict(config.trainer_args)).get( - "use_moe", False - ) - args.eval_iters = 10 - args.test_iters = args.eval_iters * 10 - args.enable_delay_scale_loss = ( - "enable_delay_scale_loss" in config.trainer_args.pipeline_parallel_config - ) - - # 3. set log and device - setup_logger_output_file(config.model_args.output_dir, args.local_rank) - setup_device_and_seed(args) - check_memory_preallocation(args) - run_fleet_tests() - set_dtype(args) - - # 4. 
init model - model_config = { - k: format_config_value(v) - for k, v in dict(getattr(config.model_args, "model_config", {})).items() - } - logger.info(f"Model config from YAML: {json.dumps(model_config, indent=4)}") - cfg = setup_model_config(args, model_config) - model_class = get_model_class(args) - tokenizer = setup_tokenizer(args, cfg) + if args.model_type == "ernie": + model_class = ErnieForCausalLMAuto + elif args.model_type == "ernie_pp": + model_class = ErnieForCausalLMAutoPP + else: + raise ValueError(f"not support model_type: {args.model_type}") with paddle.LazyGuard(): model = model_class(cfg) - logger.info(f"Using model: {type(model)}, config: {model.config}") + cfg = model.config + logger.info(f"using model type:{type(model)}") paddle.set_default_dtype("float32") - # 5. dataset - logger.info("Loading datasets...") + logger.info(f"using model={type(model)}, cfg={cfg}") + + # data + logger.info("loading data...") train_dataset, eval_dataset, test_dataset, data_collator = ( create_pretrained_dataset(args) ) - # 6. 
prepare for train/eval callbacks = [GlobalRNGCallback()] - init_parameters(model) + init_parameter(model) + model.apply(model.init_weights) trainer = AutoPretrainingTrainer( model=model, args=args, @@ -325,14 +292,17 @@ def main(): train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, - compute_metrics=lambda p: compute_metrics(p, tokenizer), + compute_metrics=compute_metrics, callbacks=callbacks, ) - global_training_logs.accumulate = args.gradient_accumulation_steps - checkpoint = get_checkpoint(args, args.output_dir) + checkpoint = None + if args.resume_from_checkpoint is not None: + checkpoint = args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint - # 7.1 train + # Training if args.do_train: train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics @@ -341,17 +311,11 @@ def main(): trainer.save_metrics("train", metrics) trainer.save_state() - # 7.2 eval + # Evaluate and tests model if args.do_eval: eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) if __name__ == "__main__": - log_trainer_start() - assert paddle.version.mkl() == "OFF", ( - "MKL is not supported in this version. " - "Please set -DWITH_MKL=OFF when compiling PaddlePaddle." - ) - main() From 54f8d90a3bbbba2a6eabadbc39133bc715cdcb2a Mon Sep 17 00:00:00 2001 From: xuexixi Date: Mon, 18 Aug 2025 21:27:41 +0800 Subject: [PATCH 10/15] remove --- .../ernie/src/datasets/dist_data_loader.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/examples/pre-training/ernie/src/datasets/dist_data_loader.py b/examples/pre-training/ernie/src/datasets/dist_data_loader.py index 0d79250c..5745330f 100644 --- a/examples/pre-training/ernie/src/datasets/dist_data_loader.py +++ b/examples/pre-training/ernie/src/datasets/dist_data_loader.py @@ -13,7 +13,6 @@ # limitations under the License. 
import logging -import hashlib from collections import deque from collections import OrderedDict from itertools import groupby @@ -21,7 +20,6 @@ from dataclasses import dataclass import numpy as np -import os import paddle from paddle.distributed import fleet import paddle.distributed as dist @@ -42,15 +40,6 @@ _MAX_DATA_DIM = 64 -VOCAB_SIZE = os.getenv("VOCAB_SIZE") -G_DEBUG_DATA_MD5 = os.getenv("G_DEBUG_DATA_MD5") - - -def md5(tensor): - numpy_array = tensor.numpy() - array_bytes = numpy_array.tobytes() - return hashlib.md5(array_bytes).hexdigest() - class DummyDataset(paddle.io.Dataset): def __len__(self): @@ -310,12 +299,6 @@ def __next__(self): global input_ids_for_mtp input_ids_for_mtp.append(input_ids) - if VOCAB_SIZE is not None: - if input_ids is not None: - input_ids %= int(VOCAB_SIZE) - if labels is not None: - labels %= int(VOCAB_SIZE) - to_return = OrderedDict( [ ("input_ids", input_ids), @@ -348,9 +331,6 @@ def __next__(self): ] for k in none_keys: to_return.pop(k) - if G_DEBUG_DATA_MD5 and int(G_DEBUG_DATA_MD5): - printable = map_structure(lambda i: md5(i), to_return) - logger.info(f"data-md5: {printable}") return to_return From 62c7da66419de9e8bd20c1011c9e62f2c15cc206 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Mon, 18 Aug 2025 22:43:52 +0800 Subject: [PATCH 11/15] organization --- examples/pre-training/ernie/pretrain_auto.py | 59 ++++- .../ernie/src/callbacks/__init__.py | 2 - .../ernie/src/callbacks_auto/__init__.py | 25 +++ .../src/callbacks_auto/logging_callback.py | 63 ++++++ .../callbacks_auto/moe_logging_callback.py | 26 +++ .../stopper_callback.py | 0 .../callbacks_auto/tensorboard_callback.py | 202 ++++++++++++++++++ .../src/trainers/pretraining_trainer_auto.py | 16 +- .../ernie/src/utils/training_utils_auto.py | 43 ++++ .../configuration_auto.py} | 1 - .../models/ernie/modeling_auto.py | 69 +++++- .../models/sequence_parallel_utils_auto.py | 44 +++- .../pre-training/yamls/pretrain_96_auto.yaml | 34 +-- 13 files changed, 526 insertions(+), 
58 deletions(-) create mode 100644 examples/pre-training/ernie/src/callbacks_auto/__init__.py create mode 100644 examples/pre-training/ernie/src/callbacks_auto/logging_callback.py create mode 100644 examples/pre-training/ernie/src/callbacks_auto/moe_logging_callback.py rename examples/pre-training/ernie/src/{callbacks => callbacks_auto}/stopper_callback.py (100%) create mode 100644 examples/pre-training/ernie/src/callbacks_auto/tensorboard_callback.py create mode 100644 examples/pre-training/ernie/src/utils/training_utils_auto.py rename examples/pre-training/models/{ernie_moe/configuration.py => ernie/configuration_auto.py} (99%) diff --git a/examples/pre-training/ernie/pretrain_auto.py b/examples/pre-training/ernie/pretrain_auto.py index 4ef4b6cb..43565f12 100644 --- a/examples/pre-training/ernie/pretrain_auto.py +++ b/examples/pre-training/ernie/pretrain_auto.py @@ -27,14 +27,14 @@ from src.tokenizers.tokenization_eb_v2 import ErnieBotTokenizer from omegaconf.listconfig import ListConfig from omegaconf.dictconfig import DictConfig -from src.callbacks import ( +from src.callbacks_auto import ( GlobalRNGCallback, ) from models.ernie import ( ErnieForCausalLMAuto, ErnieForCausalLMAutoPP, ) -from models.ernie_moe.configuration import ( +from models.ernie.configuration_auto import ( ErnieConfig, ErnieMoEConfig, ) @@ -43,7 +43,11 @@ setup_logger_output_file, ) from src.utils.misc import global_training_logs -from pretrain import create_pretrained_dataset + +from paddleformers.data.causal_dataset import ( + build_train_valid_test_datasets, + check_data_split, +) from config import get_config @@ -67,6 +71,55 @@ def log_trainer_start(): log_trainer_start() +def create_pretrained_dataset(args): + assert args.input_dir is not None and len(args.input_dir.split()) > 1 + + check_data_split( + args.split, + args.do_train, + args.do_eval, + args.do_predict, + ) + + train_val_test_num_samples = [ + args.per_device_train_batch_size + * args.dataset_world_size + * args.max_steps + 
* args.gradient_accumulation_steps, + args.per_device_eval_batch_size + * args.dataset_world_size + * args.eval_iters + * (args.max_steps // args.eval_steps + 1), + args.per_device_eval_batch_size * args.dataset_world_size * args.test_iters, + ] + + train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets( + data_prefix=args.input_dir.split(), + data_impl="mmap", + splits_string=args.split, + train_val_test_num_samples=train_val_test_num_samples, + seq_length=args.max_seq_length + args.multi_token_pred_depth, + seed=args.seed, + skip_warmup=True, + data_cache_path=None, + ) + + from paddleformers.data import Stack + + def _collate_data(data, stack_fn=Stack()): + tokens_ = stack_fn([x["text"] for x in data]) + + labels = tokens_[:, 1:] + tokens = tokens_[:, :-1] + + return { + "input_ids": tokens, + "labels": labels, + } + + return train_dataset, valid_dataset, test_dataset, _collate_data + + def update_model_config_from_args(config: ErnieConfig, model_args: dict): for k, v in model_args.items(): if hasattr(config, k): diff --git a/examples/pre-training/ernie/src/callbacks/__init__.py b/examples/pre-training/ernie/src/callbacks/__init__.py index 684af4a9..15b0cb9f 100644 --- a/examples/pre-training/ernie/src/callbacks/__init__.py +++ b/examples/pre-training/ernie/src/callbacks/__init__.py @@ -14,7 +14,6 @@ from .gc_callback import GCCallback from .logging_callback import LoggingCallback -from .stopper_callback import StopperCallback from .moe_correction_bias_adjust_callback import MoECorrectionBiasAdjustCallback from .moe_logging_callback import GlobalRNGCallback, MoeLoggingCallback from .sp_grad_sync_callback import SPGradSyncCallback @@ -32,5 +31,4 @@ "MoECorrectionBiasAdjustCallback", "FP8QuantWeightCallback", "OrthogonalCallback", - "StopperCallback", ] diff --git a/examples/pre-training/ernie/src/callbacks_auto/__init__.py b/examples/pre-training/ernie/src/callbacks_auto/__init__.py new file mode 100644 index 00000000..26eac2da --- /dev/null 
+++ b/examples/pre-training/ernie/src/callbacks_auto/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .logging_callback import LoggingCallback +from .stopper_callback import StopperCallback +from .moe_logging_callback import GlobalRNGCallback +from .tensorboard_callback import TensorBoardCallback + +__all__ = [ + "TensorBoardCallback", + "LoggingCallback", + "GlobalRNGCallback", + "StopperCallback", +] diff --git a/examples/pre-training/ernie/src/callbacks_auto/logging_callback.py b/examples/pre-training/ernie/src/callbacks_auto/logging_callback.py new file mode 100644 index 00000000..c2435a4b --- /dev/null +++ b/examples/pre-training/ernie/src/callbacks_auto/logging_callback.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging + +from paddleformers.trainer.trainer_callback import TrainerCallback + +logger = logging.getLogger(__name__) + + +class LoggingCallback(TrainerCallback): + def __init__( + self, + ) -> None: + super().__init__() + + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if "inputs" in kwargs: + data_id = kwargs["inputs"].get("data_id", None) + src_id = kwargs["inputs"].get("src_id", None) + data_type = kwargs["inputs"].get("data_type", None) + + if data_id is not None: + logs = dict( + logs, data_id="-".join(map(str, (data_id.numpy().tolist()))) + ) + if src_id is not None: + logs = dict(logs, src_id="-".join(map(str, (src_id.numpy().tolist())))) + if data_type is not None: + logs.update(data_type="-".join(map(str, (data_type.numpy().tolist())))) + + if type(logs) is dict: + logger.info( + ", ".join( + ( + ( + f"{k}: {v}" + if k == "loss" or "cur_dp" in k + else f"{k}: {v:e}" if v < 1e-3 else f"{k}: {v:f}" + ) + if isinstance(v, float) + else f"{k}: {v}" + ) + for k, v in logs.items() + ) + ) + metrics_dumper = kwargs.get("metrics_dumper", None) + if metrics_dumper is not None: + metrics_dumper.append(logs) + else: + logger.info(logs) diff --git a/examples/pre-training/ernie/src/callbacks_auto/moe_logging_callback.py b/examples/pre-training/ernie/src/callbacks_auto/moe_logging_callback.py new file mode 100644 index 00000000..e5bd2e86 --- /dev/null +++ b/examples/pre-training/ernie/src/callbacks_auto/moe_logging_callback.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +from paddleformers.trainer.trainer_callback import TrainerCallback +from models.ernie.modeling_moe import ErnieMoEForCausalLM + +__all__ = ["GlobalRNGCallback"] + + +class GlobalRNGCallback(TrainerCallback): + def on_step_end(self, args, state, control, model, **kwargs): + isinstance(model, ErnieMoEForCausalLM), type(model) + random.Random(state.global_step) diff --git a/examples/pre-training/ernie/src/callbacks/stopper_callback.py b/examples/pre-training/ernie/src/callbacks_auto/stopper_callback.py similarity index 100% rename from examples/pre-training/ernie/src/callbacks/stopper_callback.py rename to examples/pre-training/ernie/src/callbacks_auto/stopper_callback.py diff --git a/examples/pre-training/ernie/src/callbacks_auto/tensorboard_callback.py b/examples/pre-training/ernie/src/callbacks_auto/tensorboard_callback.py new file mode 100644 index 00000000..f420d02c --- /dev/null +++ b/examples/pre-training/ernie/src/callbacks_auto/tensorboard_callback.py @@ -0,0 +1,202 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.util +import json + +from paddleformers.peft.lora import LoRAModel +from paddleformers.trainer.trainer_callback import TrainerCallback +from paddleformers.transformers import PretrainedModel +from paddleformers.utils.log import logger + +try: + from paddleformers.trainer.trainer import clear_async_save_task_queue +except Exception: + clear_async_save_task_queue = None + + +def is_tensorboard_available(): + return ( + importlib.util.find_spec("tensorboard") is not None + or importlib.util.find_spec("tensorboardX") is not None + ) + + +def rewrite_logs(d): + new_d = {} + eval_prefix = "eval_" + eval_prefix_len = len(eval_prefix) + test_prefix = "test_" + test_prefix_len = len(test_prefix) + for k, v in d.items(): + if k.startswith(eval_prefix): + new_d["eval/" + k[eval_prefix_len:]] = v + elif k.startswith(test_prefix): + new_d["test/" + k[test_prefix_len:]] = v + else: + new_d["train/" + k] = v + return new_d + + +class TensorBoardCallback(TrainerCallback): + def __init__( + self, + args, + model, + tb_writer=None, + log_flops_per_step=False, + log_tokens_per_step=False, + ): + has_tensorboard = is_tensorboard_available() + if not has_tensorboard: + raise RuntimeError( + "TensorBoardCallback requires tensorboard to be installed. Either update or install tensorboardX." 
+ ) + if has_tensorboard: + try: + from torch.utils.tensorboard import SummaryWriter + + self._SummaryWriter = SummaryWriter + except ImportError: + try: + from tensorboardX import SummaryWriter + + self._SummaryWriter = SummaryWriter + except ImportError: + self._SummaryWriter = None + else: + self._SummaryWriter = None + self.tb_writer = tb_writer + + def get_numel_item(p): + item = p.numel().item() + return item if item else 0 + + self.model_numel = sum( + get_numel_item(p) + for n, p in model.named_parameters() + if not p.stop_gradient and "embeddings" not in n and "embed_tokens" not in n + ) + self.log_flops_per_step = log_flops_per_step + self.log_tokens_per_step = log_tokens_per_step + + def _init_summary_writer(self, args, log_dir=None): + log_dir = log_dir or args.logging_dir + if self._SummaryWriter is not None: + self.tb_writer = self._SummaryWriter(log_dir=log_dir) + + def on_train_begin(self, args, state, control, **kwargs): + if not state.is_world_process_zero: + return + + log_dir = None + + if self.tb_writer is None: + self._init_summary_writer(args, log_dir) + + if self.tb_writer is not None: + self.tb_writer.add_text("args", args.to_json_string()) + if "model" in kwargs: + model = kwargs["model"] + + if ( + isinstance(model, PretrainedModel) + and model.constructed_from_pretrained_config() + ) or isinstance(model, LoRAModel): + model.config.architectures = [model.__class__.__name__] + self.tb_writer.add_text("model_config", str(model.config)) + + elif hasattr(model, "init_config") and model.init_config is not None: + model_config_json = json.dumps( + model.get_model_config(), ensure_ascii=False, indent=2 + ) + self.tb_writer.add_text("model_config", model_config_json) + + def on_log(self, args, state, control, logs=None, **kwargs): + if not state.is_world_process_zero: + return + + timers = kwargs.get("timers") + paddle_pipeline_timers = kwargs.get("paddle_pipeline_timers") + + if self.tb_writer is None: + self._init_summary_writer(args) + + if 
self.tb_writer is not None: + logs = rewrite_logs(logs) + + total_tokens_per_step = ( + args.train_batch_size + * args.gradient_accumulation_steps + * args.reeao_dataset_world_size + * args.max_seq_length + ) + + if self.log_flops_per_step: + logger.warning("The FLOPs might be not accurate") + flops_per_step = self.model_numel * total_tokens_per_step * 6 + else: + flops_per_step = None + + if self.log_tokens_per_step: + tokens_per_step = total_tokens_per_step + else: + tokens_per_step = None + inputs = kwargs.get("inputs") + data_type = inputs and inputs.get("data_type") + if data_type is not None: + data_type = data_type.tolist()[-1] + logs.update(data_type=data_type) + + for k, v in logs.items(): + if isinstance(v, (int, float)): + self.tb_writer.add_scalar(k, v, state.global_step) + + if tokens_per_step is not None and k in ["train/loss"]: + self.tb_writer.add_scalar( + k + "_xaxis_tokens", v, state.global_step * tokens_per_step + ) + + if flops_per_step is not None and k in ["train/loss"]: + self.tb_writer.add_scalar( + k + "_xaxis_flops", v, state.global_step * flops_per_step + ) + + else: + logger.warning( + "Trainer is attempting to log a value of " + f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' + "This invocation of Tensorboard's writer.add_scalar() " + "is incorrect so we dropped this attribute." 
+ ) + if timers is not None: + timers.write( + timers.timers.keys(), self.tb_writer, state.global_step, reset=False + ) + + if paddle_pipeline_timers: + for name, timer in paddle_pipeline_timers.timers.items(): + elapsed_time = timer.elapsed(reset=False) + self.tb_writer.add_scalar( + f"timers/{name}", elapsed_time, state.global_step + ) + + self.tb_writer.flush() + + def on_train_end(self, args, state, control, **kwargs): + if clear_async_save_task_queue: + clear_async_save_task_queue() + if self.tb_writer: + self.tb_writer.close() + self.tb_writer = None diff --git a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py index ab95e149..6d47bc1b 100644 --- a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py +++ b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py @@ -44,16 +44,7 @@ from paddleformers.trainer.auto_trainer import AutoTrainer -try: - from paddleformers.utils.env import ( - PADDLE_OPTIMIZER_NAME, - ) -except ImportError: - from paddleformers.trainer.trainer import ( - OPTIMIZER_NAME, - ) - PADDLE_OPTIMIZER_NAME = OPTIMIZER_NAME from paddleformers.utils.batch_sampler import ( DistributedBatchSampler as PaddleNLPDistributedBatchSampler, ) @@ -66,10 +57,10 @@ from src.lr_schedulers import get_cosine_schedule_with_warmup -from src.utils.training_utils import ( +from src.utils.training_utils_auto import ( reset_per_device_batch_size, ) -from src.callbacks import ( +from src.callbacks_auto import ( TensorBoardCallback, LoggingCallback, StopperCallback, @@ -358,8 +349,7 @@ def __post_init__(self): class AutoPretrainingTrainer(AutoTrainer): - def __init__(self, _shit=None, args=None, model=None, callbacks=[], **kwargs): - assert _shit is None, "use key-ward argument" + def __init__(self, args=None, model=None, callbacks=[], **kwargs): callbacks = [ LoggingCallback(), StopperCallback(), diff --git 
a/examples/pre-training/ernie/src/utils/training_utils_auto.py b/examples/pre-training/ernie/src/utils/training_utils_auto.py new file mode 100644 index 00000000..027606fc --- /dev/null +++ b/examples/pre-training/ernie/src/utils/training_utils_auto.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +logger = logging.getLogger(__name__) + + +def reset_per_device_batch_size( + global_batch_size, per_device_train_batch_size, dataset_world_size +): + assert ( + global_batch_size % dataset_world_size == 0 + ), f"global_bsz={global_batch_size} not evenly divided by world_size={dataset_world_size}" + batch_per_device = global_batch_size // dataset_world_size + if batch_per_device < per_device_train_batch_size: + gradient_accumulation_steps = 1 + per_device_train_batch_size = batch_per_device + logger.info( + f"reset `per_device_train_batch_size` to {per_device_train_batch_size}, global_batch_size={global_batch_size }, " + f"dp_worldsize={ dataset_world_size}, accumulate_steps={gradient_accumulation_steps} " + ) + else: + assert ( + batch_per_device % per_device_train_batch_size == 0 + ), f"global_bsz={global_batch_size} not evenly divided by world_size={dataset_world_size}, batch_per_device={batch_per_device}" + gradient_accumulation_steps = batch_per_device // per_device_train_batch_size + logger.info( + f"per_device_train_batch_size={per_device_train_batch_size}, 
global_batch_size={global_batch_size }, " + f"dp_worldsize={dataset_world_size}, accumulate_steps={gradient_accumulation_steps} " + ) + return per_device_train_batch_size, gradient_accumulation_steps diff --git a/examples/pre-training/models/ernie_moe/configuration.py b/examples/pre-training/models/ernie/configuration_auto.py similarity index 99% rename from examples/pre-training/models/ernie_moe/configuration.py rename to examples/pre-training/models/ernie/configuration_auto.py index fe37204f..58070f60 100644 --- a/examples/pre-training/models/ernie_moe/configuration.py +++ b/examples/pre-training/models/ernie/configuration_auto.py @@ -537,7 +537,6 @@ def __init__( self.moe_group = moe_group self.moe_gate = moe_gate self.moe_num_attn_experts = moe_num_attn_experts - # implemtent size-all2all as https://arxiv.org/pdf/2303.06182.pdf self.moe_use_size_all2all = moe_use_size_all2all self.moe_logging = moe_logging self.num_experts_per_tok = num_experts_per_tok diff --git a/examples/pre-training/models/ernie/modeling_auto.py b/examples/pre-training/models/ernie/modeling_auto.py index 6246434a..b5a54de3 100644 --- a/examples/pre-training/models/ernie/modeling_auto.py +++ b/examples/pre-training/models/ernie/modeling_auto.py @@ -14,6 +14,7 @@ # limitations under the License. 
"""Paddle Ernie model""" import math +import functools from functools import partial import logging from typing import Optional, Tuple @@ -37,7 +38,6 @@ ) from paddle.distributed import in_auto_parallel_align_mode -from models.comm_utils import subbatch from models.moe.top2_gate_auto import Top2Gate, TopKGateFusedAuto @@ -54,14 +54,13 @@ from paddleformers.transformers.model_utils import PretrainedModel, register_base_model -from models.ernie.modeling import FusedDropoutImpl from models.sequence_parallel_utils_auto import ( sequence_parallel_sparse_mask_labels, ) from models.moe.moe_layer_auto import ( MOELayerAuto, ) -from .configuration import ErnieMoEConfig +from models.ernie.configuration_auto import ErnieMoEConfig from models.moe.moe_utils_auto import get_mesh @@ -157,6 +156,70 @@ class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): ) +def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_arg_idx={}): + @functools.wraps(f) + def wrapper(*args, **kwargs): + + assert len(arg_idx) == len( + axis + ), "Number of batching args and number of batching dims should match." + + inps = [args[i] for i in arg_idx] + axis_width = [inp.shape[d] for inp, d in zip(inps, axis)] + assert len(set(axis_width)) == 1, "Batch sizes should be kept equal." 
+ + inp_axis = {inp: d for inp, d in zip(inps, axis)} + + axis_width = axis_width[0] + if axis_width < bs: + return f(*args, **kwargs) + + outs = [] + for slice_at in np.arange(0, axis_width, bs): + _args = [] + for i, inp in enumerate(args): + if i in same_arg_idx: + assert ( + i > same_arg_idx[i] + ), f"expect i > same_arg_idx[i], but got i: {i} and same_arg_idx[i]: {same_arg_idx[i]}" + _args.append(_args[same_arg_idx[i]]) + elif i in arg_idx: + inp = inp.slice( + [inp_axis[inp]], + [slice_at], + [min(inp.shape[inp_axis[inp]], slice_at + bs)], + ) + _args.append(inp) + else: + _args.append(inp) + if use_recompute: + out = paddle.distributed.fleet.utils.recompute(f, *_args, **kwargs) + else: + out = f(*_args, **kwargs) + outs.append(out) + + return paddle.concat(outs, out_idx) + + return wrapper + + +class FusedDropoutImpl(nn.Layer): + + def __init__(self, prob, mode): + super().__init__() + self.prob = prob + self.mode = mode + + self.dropout = nn.Dropout(p=prob, mode=mode) + + def forward(self, x, y): + if self.prob > 0: + x = self.dropout(x) + output = x + y + + return output + + def is_pp_enable(): mesh = fleet.auto.get_mesh() diff --git a/examples/pre-training/models/sequence_parallel_utils_auto.py b/examples/pre-training/models/sequence_parallel_utils_auto.py index ea0e52b2..35f11077 100644 --- a/examples/pre-training/models/sequence_parallel_utils_auto.py +++ b/examples/pre-training/models/sequence_parallel_utils_auto.py @@ -22,10 +22,46 @@ from paddle.distributed import fleet -from models.comm_utils import ( - scatter, - all_gather, -) +def scatter(input, group=None, axis=0): + if group is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + parallelism = group.nranks + if parallelism == 1: + return input.clone() + rank = group.rank + seq_len = input.shape[axis] + assert seq_len % parallelism == 0, ( + f"Input sequence length {seq_len} can't be divided exactly" + f" by sequence parallelism {parallelism}" + ) + 
interval = seq_len // parallelism + input = paddle.slice( + input, axes=[axis], starts=[interval * rank], ends=[interval * (rank + 1)] + ) + input = paddle.assign(input) + return input + + +def all_gather(input, group=None, axis=0): + if group is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + parallelism = group.nranks + if parallelism == 1: + return input.clone() + output_shape = input.shape + if axis == 0: + output_shape[axis] = output_shape[axis] * parallelism + output = paddle.empty(shape=output_shape, dtype=input.dtype) + dist.stream.all_gather(output, input, group=group, use_calc_stream=True) + return output + outputs = [ + paddle.empty(output_shape, dtype=input.dtype) for _ in range(parallelism) + ] + dist.stream.all_gather(outputs, input, group=group, use_calc_stream=True) + output = paddle.concat(outputs, axis=axis) + return output class ScatterOp(PyLayer): diff --git a/examples/pre-training/yamls/pretrain_96_auto.yaml b/examples/pre-training/yamls/pretrain_96_auto.yaml index ecc993e9..39fbef49 100644 --- a/examples/pre-training/yamls/pretrain_96_auto.yaml +++ b/examples/pre-training/yamls/pretrain_96_auto.yaml @@ -10,7 +10,6 @@ model_args: base_seq_length: 4096 num_consecutive: 32 sequence_parallel: 1 - enable_global_training_logs: False moe_use_aux_free_update_coef: 0.001 global_logging_interval: 10 @@ -19,13 +18,10 @@ model_args: moe_use_aux_free: true multi_token_pred_depth: 0 - - # ---------------------------trainer args-------------------------------------------------# trainer_args: input_dir: "0.4 ./demo_data/data-1-part0 0.6 ./demo_data/data-1-part0" split: "998,1,1" - use_sp_callback: true moe_gate_lr_ratio: 0.01 do_train: True @@ -43,19 +39,14 @@ trainer_args: adam_epsilon: 1e-8 learning_rate: 2.2e-4 min_lr: 2.2e-5 - global_batch_size: 2 gradient_accumulation_steps: 1 per_device_train_batch_size: 2 per_device_eval_batch_size: 1 - lr_scheduler: wsd:231084 decay_function: 1-sqrt max_grad_norm: 1.0 - - 
use_async_save: True - weight_decay: 0.1 warmup_steps: 200 save_total_limit: 5 @@ -65,28 +56,13 @@ trainer_args: seed: 666 pre_alloc_memory: 60 - # # N7 - # tensor_parallel_degree: 8 # N7:8, N4:8, N1:4 - # pipeline_parallel_degree: 7 # N7:7, N4:4, N1:2 - # virtual_pp_degree: 8 # N7:8, N4:8, N1:1 - - # # N4 - # tensor_parallel_degree: 8 # N7:8, N4:8, N1:4 - # pipeline_parallel_degree: 4 # N7:7, N4:4, N1:2 - # virtual_pp_degree: 8 # N7:8, N4:8, N1:1 - - # # N1 - # tensor_parallel_degree: 4 # N7:8, N4:8, N1:4 - # pipeline_parallel_degree: 2 # N7:7, N4:4, N1:2 - # virtual_pp_degree: 1 # N7:8, N4:8, N1:1 - - # N1 dynamic auto tensor_parallel_degree: 4 # N7:8, N4:8, N1:4 pipeline_parallel_degree: 2 # N7:7, N4:4, N1:2 + virtual_pp_degree: 1 # N7:8, N4:8, N1:1 + n_microbatches: 2 pipeline_schedule_mode: "VPP" model_type: "ernie_pp" - virtual_pp_degree: 1 data_parallel_degree: 1 sharding: "stage1" @@ -97,20 +73,14 @@ trainer_args: sharding_comm_buffer_size_MB: 2048 tensor_parallel_config: replace_with_parallel_cross_entropy - skip_profile_timer: False - ignore_data_skip: 0 shuffle_consecutive: True - load_sharded_model: True save_sharded_model: True ignore_load_lr_and_optim: False metrics_output_path: ./output/paddle_distributed_logs/ - - - use_moe: true moe_group: mp log_global_grad_norm: True From 033cb15dd39b74f65bfa4edf3c34ced6a529af46 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Tue, 19 Aug 2025 12:29:43 +0800 Subject: [PATCH 12/15] update --- examples/pre-training/ernie/pretrain_auto.py | 6 +- .../pre-training/ernie/src/clip/__init__.py | 3 +- .../ernie/src/clip/moe_clip_auto.py | 175 ++++++++++++++ .../ernie/src/datasets/dist_data_loader.py | 2 - .../ernie/src/trainers/pretraining_trainer.py | 4 +- .../src/trainers/pretraining_trainer_auto.py | 6 +- .../ernie/src/utils_auto/__init__.py | 17 ++ .../ernie/src/utils_auto/logging.py | 54 +++++ .../pre-training/ernie/src/utils_auto/misc.py | 214 ++++++++++++++++++ .../training_utils.py} | 0 
.../models/ernie/modeling_auto.py | 1 - .../pre-training/models/moe/moe_layer_auto.py | 127 +---------- .../pre-training/models/moe/top2_gate_auto.py | 110 +-------- examples/pre-training/models/utils_auto.py | 93 ++++++++ 14 files changed, 566 insertions(+), 246 deletions(-) create mode 100644 examples/pre-training/ernie/src/clip/moe_clip_auto.py create mode 100644 examples/pre-training/ernie/src/utils_auto/__init__.py create mode 100644 examples/pre-training/ernie/src/utils_auto/logging.py create mode 100644 examples/pre-training/ernie/src/utils_auto/misc.py rename examples/pre-training/ernie/src/{utils/training_utils_auto.py => utils_auto/training_utils.py} (100%) create mode 100644 examples/pre-training/models/utils_auto.py diff --git a/examples/pre-training/ernie/pretrain_auto.py b/examples/pre-training/ernie/pretrain_auto.py index 43565f12..3e8078c6 100644 --- a/examples/pre-training/ernie/pretrain_auto.py +++ b/examples/pre-training/ernie/pretrain_auto.py @@ -19,7 +19,7 @@ import random import paddle import paddle.distributed.fleet as fleet -from src.utils import logger +from src.utils_auto import logger from paddleformers.trainer import ( PdArgumentParser, get_last_checkpoint, @@ -39,10 +39,10 @@ ErnieMoEConfig, ) from src.trainers import AutoPretrainingTrainer, AutoPreTrainingArguments -from src.utils import ( +from src.utils_auto import ( setup_logger_output_file, ) -from src.utils.misc import global_training_logs +from src.utils_auto.misc import global_training_logs from paddleformers.data.causal_dataset import ( build_train_valid_test_datasets, diff --git a/examples/pre-training/ernie/src/clip/__init__.py b/examples/pre-training/ernie/src/clip/__init__.py index 6484ef44..f4c56fec 100644 --- a/examples/pre-training/ernie/src/clip/__init__.py +++ b/examples/pre-training/ernie/src/clip/__init__.py @@ -13,5 +13,6 @@ # limitations under the License. 
from .moe_clip import ClipGradForMOEByGlobalNorm +from .moe_clip_auto import ClipGradForMOEByGlobalNormAuto -__all__ = ['ClipGradForMOEByGlobalNorm'] +__all__ = ["ClipGradForMOEByGlobalNorm", "ClipGradForMOEByGlobalNormAuto"] diff --git a/examples/pre-training/ernie/src/clip/moe_clip_auto.py b/examples/pre-training/ernie/src/clip/moe_clip_auto.py new file mode 100644 index 00000000..c82130ee --- /dev/null +++ b/examples/pre-training/ernie/src/clip/moe_clip_auto.py @@ -0,0 +1,175 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import math + +import paddle +import paddle.distributed as dist +from paddle.autograd import no_grad +from paddle.framework import core +from paddle.nn import clip +from paddle.nn.clip import ClipGradBase, _squared_l2_norm + +logger = logging.getLogger(__name__) + + +class ClipGradForMOEByGlobalNormAuto(ClipGradBase): + def __init__( + self, + clip_norm, + is_expert_param_func=None, + moe_group=None, + group_name="default_moe_group", + local_clip=False, + ): + super().__init__() + self.clip_norm = float(clip_norm) + self.group_name = group_name + self.moe_group = moe_group + if moe_group is not None and moe_group.nranks > 1: + assert ( + is_expert_param_func is not None + ), "When moe group size > 1, a function for selecting expert params must be specified." 
+ self.is_expert_param_func = is_expert_param_func + self.stat = {} + self.local_clip = local_clip + + def __str__(self): + return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) + + @staticmethod + def get_l2_norm_pow(params_grads, sum_dtype=None): + sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, "need_clip", True) is False: + continue + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = clip.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows(merge_grad) + sum_square = _squared_l2_norm(merge_grad) + if sum_square.dtype == core.VarDesc.VarType.FP16: + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square.cast("float64")) + + if ( + len(sum_square_list) + len(sum_square_list_fp16) + len(sum_square_list_fp32) + == 0 + ): + return None, None + assert sum_dtype in [ + "float64", + "float32", + None, + ], "sum's type must be float64/ float32 / None" + if sum_dtype != "float64": + sum_dtype = "float64" if len(sum_square_list) > 0 else "float32" + + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) + global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) + if sum_dtype == "float32": + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) + if len(sum_square_list) > 0: + global_norm_var_fp64 = paddle.add_n(sum_square_list) + global_norm_var.append(global_norm_var_fp64) + global_norm_var = paddle.add_n(global_norm_var) + return global_norm_var, sum_dtype + + @no_grad() + def _dygraph_clip(self, params_grads): + normal_params_grads = [] + 
moe_params_grads = [] + + if self.moe_group is not None and self.moe_group.nranks > 1: + for p, g in params_grads: + if self.is_expert_param_func(p): + moe_params_grads.append((p, g)) + else: + normal_params_grads.append((p, g)) + else: + normal_params_grads = params_grads + + global_norm_var_normal, sum_dtype = self.get_l2_norm_pow(normal_params_grads) + global_norm_var_moe = None + if len(moe_params_grads) > 0: + global_norm_var_moe, _ = self.get_l2_norm_pow(moe_params_grads, sum_dtype) + if global_norm_var_moe is not None: + dist.all_reduce( + global_norm_var_moe, + op=dist.ReduceOp.SUM, + group=self.moe_group, + ) + + if global_norm_var_normal is None and global_norm_var_moe is None: + return params_grads + elif global_norm_var_normal is None: + global_norm_var = global_norm_var_moe + elif global_norm_var_moe is None: + global_norm_var = global_norm_var_normal + else: + if global_norm_var_normal.dtype != global_norm_var_moe.dtype: + global_norm_var_normal = global_norm_var_normal.astype( + global_norm_var_moe.dtype + ) + if self.local_clip: + global_norm_var = global_norm_var_normal + else: + global_norm_var = global_norm_var_normal + global_norm_var_moe + self.stat["local_grad_norm"] = math.sqrt( + global_norm_var_normal.astype("float32").item() + ) + self.stat["moe_grad_norm"] = math.sqrt( + global_norm_var_moe.astype("float32").item() + ) + self.stat["global_grad_norm"] = math.sqrt( + global_norm_var.astype("float32").item() + ) + + params_and_grads = [] + global_norm_var = paddle.sqrt(global_norm_var) + max_global_norm = paddle.full( + shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm + ) + clip_var = paddle.divide( + x=max_global_norm, + y=paddle.maximum(x=global_norm_var, y=max_global_norm), + ) + for p, g in params_grads: + if g is None: + continue + if getattr(p, "need_clip", True) is False: + params_and_grads.append((p, g)) + continue + clip_input = ( + clip_var.astype("float16") + if g.dtype == core.VarDesc.VarType.FP16 + else clip_var 
+ ) + new_grad = paddle.multiply(x=g, y=clip_input.astype(g.dtype)) + params_and_grads.append((p, new_grad)) + return params_and_grads diff --git a/examples/pre-training/ernie/src/datasets/dist_data_loader.py b/examples/pre-training/ernie/src/datasets/dist_data_loader.py index 5745330f..54f030f9 100644 --- a/examples/pre-training/ernie/src/datasets/dist_data_loader.py +++ b/examples/pre-training/ernie/src/datasets/dist_data_loader.py @@ -31,8 +31,6 @@ from src.utils.misc import global_training_logs -logger = logging.getLogger(__name__) - input_ids_for_mtp = deque() diff --git a/examples/pre-training/ernie/src/trainers/pretraining_trainer.py b/examples/pre-training/ernie/src/trainers/pretraining_trainer.py index 9308c69a..65477b03 100644 --- a/examples/pre-training/ernie/src/trainers/pretraining_trainer.py +++ b/examples/pre-training/ernie/src/trainers/pretraining_trainer.py @@ -93,7 +93,7 @@ FP8QuantWeightCallback, ) from src.callbacks.moe_logging_callback import MoeLoggingCallback -from src.clip import ClipGradForMOEByGlobalNorm +from src.clip import ClipGradForMOEByGlobalNormAuto from src.lr_schedulers import get_wsd_schedule_with_warmup from src.trainers.data_parallel import sync_dp_moe_params_across_sharding from src.utils.misc import global_training_logs @@ -1540,7 +1540,7 @@ def apply_decay_param_fun(x): def expert_fn(p): return getattr(p, "no_sync", False) - grad_clip = ClipGradForMOEByGlobalNorm( + grad_clip = ClipGradForMOEByGlobalNormAuto( self.args.max_grad_norm, is_expert_param_func=expert_fn, moe_group=_get_global_group(), diff --git a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py index 6d47bc1b..68624cf6 100644 --- a/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py +++ b/examples/pre-training/ernie/src/trainers/pretraining_trainer_auto.py @@ -57,7 +57,7 @@ from src.lr_schedulers import get_cosine_schedule_with_warmup -from 
src.utils.training_utils_auto import ( +from src.utils_auto.training_utils import ( reset_per_device_batch_size, ) from src.callbacks_auto import ( @@ -68,7 +68,7 @@ from src.datasets.dist_data_loader import ( DistDataLoaderAuto, ) -from src.clip import ClipGradForMOEByGlobalNorm +from src.clip import ClipGradForMOEByGlobalNormAuto logger = logging.getLogger(__name__) @@ -567,7 +567,7 @@ def apply_decay_param_fun(x): def expert_fn(p): return getattr(p, "no_sync", False) - grad_clip = ClipGradForMOEByGlobalNorm( + grad_clip = ClipGradForMOEByGlobalNormAuto( self.args.max_grad_norm, is_expert_param_func=expert_fn, moe_group=_get_global_group(), diff --git a/examples/pre-training/ernie/src/utils_auto/__init__.py b/examples/pre-training/ernie/src/utils_auto/__init__.py new file mode 100644 index 00000000..0eb015e8 --- /dev/null +++ b/examples/pre-training/ernie/src/utils_auto/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .logging import logger, setup_logger_output_file + +__all__ = ["logger", "setup_logger_output_file"] diff --git a/examples/pre-training/ernie/src/utils_auto/logging.py b/examples/pre-training/ernie/src/utils_auto/logging.py new file mode 100644 index 00000000..e43daf69 --- /dev/null +++ b/examples/pre-training/ernie/src/utils_auto/logging.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import sys +from pathlib import Path + +from paddleformers.utils.log import logger as paddlenlp_logger + +hdl = logging.StreamHandler(sys.stderr) +formatter = logging.Formatter( + fmt="[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]: %(message)s" +) +hdl.setFormatter(formatter) +logger = logging.getLogger() +logger.handlers = [hdl] + +bce_log = logging.getLogger("baidubce") +bce_log.handlers = [] +bce_log.propagate = False +logger.setLevel(10) + +bce_bns_proxy_log = logging.getLogger("bce_bns_proxy.wrapper") +bce_bns_proxy_log.disabled = True +filelock_log = logging.getLogger("filelock") +filelock_log.disabled = True + +paddlenlp_logger.logger.handlers = [] +paddlenlp_logger.logger.propagate = True + + +def setup_logger_output_file(outputpath, local_rank): + logdir = Path(outputpath) / "log" + logdir.mkdir(exist_ok=True) + file_hdl = logging.FileHandler( + logdir / f"workerlog.{local_rank}", mode="a", encoding="utf-8" + ) + formatter = logging.Formatter( + fmt=f"[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d][rank-{local_rank}]: %(message)s" + ) + file_hdl.setFormatter(formatter) + hdl.setFormatter(formatter) + logger.handlers = [hdl, file_hdl] diff --git a/examples/pre-training/ernie/src/utils_auto/misc.py b/examples/pre-training/ernie/src/utils_auto/misc.py new file mode 100644 index 00000000..418bc180 --- /dev/null +++ b/examples/pre-training/ernie/src/utils_auto/misc.py @@ -0,0 
+1,214 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import logging +import re + +import numpy as np +import paddle +import paddle.distributed as dist + +logger = logging.getLogger(__name__) + +try: + from models.sequence_parallel_utils import get_async_loader + from paddle.incubate.tensor.manipulation import async_offload +except ImportError: + get_async_loader = async_offload = None + +__all__ = ("global_training_logs",) + +ZERO = paddle.zeros([], dtype="float32") + + +class SmoothedValue: + def __init__( + self, + skip_zero, + ): + self.total = 0.0 + self.count = 0 + self._skip_zero = skip_zero + + @paddle.no_grad() + def update(self, value): + if isinstance(value, paddle.Tensor): + value = value.astype("float32").detach() + if value.shape == [1]: + value = value.squeeze() + self.count += (value != ZERO).astype("int64") if self._skip_zero else 1 + else: + self.count += 1 + self.total += value + + @property + def global_avg(self): + return self.total / max(self.count, 1e-6) + + def reset(self): + self.total = 0.0 + self.count = 0 + + +class TrainingLogs: + _instance = None + + def __new__(cls, *args, **kw): + if cls._instance is None: + cls._instance = object.__new__(cls, *args, **kw) + return cls._instance + + def __init__(self): + self.meters = {} + self.snapshot = None + self._global_meters_keys = [] + self.trainer = None + self.logging_interval = None + self._skip_zero_keys = [] 
+ + def set_trainer_interval(self, trainer, logging_interval): + self.trainer = trainer + self.logging_interval = logging_interval + + @property + def global_meters_keys(self): + return self._global_meters_keys + + @global_meters_keys.setter + def global_meters_keys(self, lst): + self._global_meters_keys = lst + + def enable_skip_zero(self, keys=[]): + logger.info("global_training_logs: use skip zero") + self._skip_zero_keys = keys + for m in self.meters.keys(): + for k in keys: + if re.match(k, m): + m._skip_zero = True + + def update(self, **kwargs): + for k, v in kwargs.items(): + self[k] = v + + def is_enabled(self): + return ( + self.trainer is None + or (self.trainer.state.global_step + 1) % self.logging_interval == 0 + ) + + def __setitem__(self, k, v): + skip_zero = False + for skip_k in self._skip_zero_keys: + if re.match(skip_k, k): + skip_zero = True + metric = self.meters.setdefault(k, SmoothedValue(skip_zero=skip_zero)) + metric.update(v) + + def __getitem__(self, v): + return self.meters[v] + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{attr}'" + ) + + def dict(self, use_async=False): + avg_metric = { + k: v.global_avg + for k, v in self.meters.items() + if k not in self.global_meters_keys + } + + if self.global_meters_keys: + tensor_lst = [] + for k in self.global_meters_keys: + v = self.meters[k].global_avg if k in self.meters else -100 + tensor_lst.append(paddle.to_tensor(v, "float32")) + gathered_v = [] + dist.gather(paddle.stack(tensor_lst), gathered_v, 0) + if gathered_v: + for i, k in enumerate(self.global_meters_keys): + avg_metric[k] = np.mean( + [t[i] for t in gathered_v if t[i] != -100] + ).item() + + if not use_async: + ret = { + k: v.item() if isinstance(v, paddle.Tensor) else v + for k, v in avg_metric.items() + } + global_info = {k: v for k, v in ret.items() if k in 
self.global_meters_keys} + ret = { + k: v + for k, v in ret.items() + if (k not in self.global_meters_keys) + and ((not self.meters[k]._skip_zero) or v != 0.0) + } + return ret, global_info + assert get_async_loader is not None, "async logging requires latest paddle" + if not avg_metric: + return lambda: ({}, {}) + keys, values = zip(*avg_metric.items()) + tensor_list = [ + (i, t) for i, t in enumerate(values) if isinstance(t, paddle.Tensor) + ] + if tensor_list: + async_loader = get_async_loader() + tensor_id, tensor_list = zip(*tensor_list) + tensor_list = paddle.stack(tensor_list) + tensor_list_cpu, task = async_offload(tensor_list, async_loader) + else: + task = None + + def _ret(): + nonlocal task, tensor_list_cpu, values + values = list(values) + if task: + task.cpu_wait() + for i, val in zip(tensor_id, tensor_list_cpu.tolist()): + values[i] = val + ret = dict(zip(keys, values)) + global_info = {k: v for k, v in ret.items() if k in self.global_meters_keys} + ret = { + k: v + for k, v in ret.items() + if (k not in self.global_meters_keys) + and ((not self.meters[k]._skip_zero) or v != 0.0) + } + return ret, global_info + + return _ret + + def reset(self): + for k in list(self.meters.keys()): + self.meters[k].reset() + self.meters.pop(k) + + def take_snapshot(self): + self.snapshot = copy.deepcopy(self.meters) + + def restore_snapshot(self): + assert ( + self.snapshot is not None + ), "you should use take_snapshot before restore_snapshot" + self.meters = copy.deepcopy(self.snapshot) + self.snapshot = None + + +global_training_logs = TrainingLogs() diff --git a/examples/pre-training/ernie/src/utils/training_utils_auto.py b/examples/pre-training/ernie/src/utils_auto/training_utils.py similarity index 100% rename from examples/pre-training/ernie/src/utils/training_utils_auto.py rename to examples/pre-training/ernie/src/utils_auto/training_utils.py diff --git a/examples/pre-training/models/ernie/modeling_auto.py b/examples/pre-training/models/ernie/modeling_auto.py 
index b5a54de3..4387c02e 100644 --- a/examples/pre-training/models/ernie/modeling_auto.py +++ b/examples/pre-training/models/ernie/modeling_auto.py @@ -1623,7 +1623,6 @@ def create_moe_mlp_layer(self, layer_idx, ipp): shared_experts=shared_experts, group=self.config.moe_group, recompute=self.config.use_recompute_moe, - enable_logging=self.config.moe_logging, k=self.config.moe_k, enable_pbr=self.config.moe_use_bpr, all_to_all_dropout=self.config.moe_all_to_all_dropout, diff --git a/examples/pre-training/models/moe/moe_layer_auto.py b/examples/pre-training/models/moe/moe_layer_auto.py index 08d4456d..76d1294b 100644 --- a/examples/pre-training/models/moe/moe_layer_auto.py +++ b/examples/pre-training/models/moe/moe_layer_auto.py @@ -37,13 +37,9 @@ import paddle.distributed as dist from paddle import Tensor from paddleformers.trainer.plugins.timer import get_timers -from models.moe.top2_gate_auto import TopKGateFusedAuto -from models.moe.top2_gate_auto import ( - TopKGateFused, -) +from models.moe.top2_gate_auto import TopKGateFused, TopKGateFusedAuto from models.sequence_parallel_utils_auto import ScatterOp -from models.utils import ( - global_training_logs_enabled, +from models.utils_auto import ( manual_backward, ) @@ -53,10 +49,6 @@ moe_combine, ) -try: - from src.utils.misc import global_training_logs -except ModuleNotFoundError: - global_training_logs = {} try: import moe_router_loss_ops @@ -409,7 +401,6 @@ def __init__( shared_experts: Optional[List[nn.Layer]] = None, group: Group = None, recompute=False, - enable_logging: bool = False, k=2, enable_bpr: bool = False, all_to_all_dropout=0, @@ -434,7 +425,6 @@ def __init__( self.group = group self.k = k self.all_to_all_dropout = all_to_all_dropout - self.enable_logging = enable_logging self.use_correction_bias = moe_statics is not None self.moe_statics = moe_statics if self.use_correction_bias: @@ -950,7 +940,6 @@ def _calc_router_loss( dispatch_tokens_mask=None, prefix="", ): - log = {} router_loss, l_aux, 
orthogonal_loss, zloss = 0.0, None, None, None if self.gate.config.moe_aux_loss_lambda: l_aux = self.gate._cal_aux_loss( @@ -973,29 +962,6 @@ def _calc_router_loss( zloss = self.gate._cal_z_loss(gate_logits, tokens_type_mask) router_loss += self.gate.moe_z_loss_lambda[token_type or 0] * zloss - tracer = framework._dygraph_tracer() - if self.enable_logging and global_training_logs_enabled() and tracer._has_grad: - if l_aux is not None: - log[f"aux_loss_layer_{self.layer_idx}"] = l_aux - - if orthogonal_loss is not None: - log[f"orthogonal_loss_layer_{self.layer_idx}"] = orthogonal_loss - - if zloss is not None: - log[f"zloss_layer_{self.layer_idx}"] = zloss - - global_training_logs.update( - **log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v for k, v in log.items() - }, - ) - global_training_logs.update( - **{ - prefix + "_" + k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in log.items() - } - ) return router_loss def calc_router_loss_and_logging( @@ -1088,93 +1054,6 @@ def calc_router_loss_and_logging( self.layer_idx, ) - if self.enable_logging and global_training_logs_enabled(): - seqlen = gate_logits.shape[0] - num_active = paddle.count_nonzero(combine_weights) - gate_experts_per_token = num_active.item() / seqlen - - if token_type_ids is not None: - token_type_ids = token_type_ids.reshape([-1]) - combine_weights_type_0 = combine_weights[token_type_ids == 0] - if combine_weights_type_0.size: - gate_expert_per_token_type_0 = ( - paddle.count_nonzero(combine_weights_type_0).item() - / combine_weights_type_0.shape[0] - ) - global_training_logs.update( - experts_per_token_text=gate_expert_per_token_type_0, - ) - - combine_weights_type_1 = combine_weights[token_type_ids == 1] - if combine_weights_type_1.size: - gate_expert_per_token_type_1 = ( - paddle.count_nonzero(combine_weights_type_1).item() - / combine_weights_type_1.shape[0] - ) - global_training_logs.update( - experts_per_token_image=gate_expert_per_token_type_1, - ) - - ce = ( - 
(-F.softmax(gate_logits, -1) * F.log_softmax(gate_logits, -1)) - .sum(-1) - .mean(0) - ) - _log = { - f"gate_prob_ce_layer_{self.layer_idx}": ce.item(), - f"experts_per_token_layer_{self.layer_idx}": gate_experts_per_token, - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) - else: - seqlen = dispatch_mask.shape[0] - dispatch_mask = dispatch_mask.unbind(-1) - top1_gate_experts_per_token = ( - paddle.cast(dispatch_mask[0], dtype="float32").sum() / seqlen - ) - if ( - self.enable_logging - and global_training_logs_enabled() - and len(dispatch_mask) == 2 - ): - top2_gate_experts_per_token = ( - paddle.cast(dispatch_mask[1], dtype="float32").sum() / seqlen - ) - leakage_experts_per_token = ( - paddle.cast( - (~dispatch_mask[0]) & (~dispatch_mask[1]), dtype="float32" - ).sum() - / seqlen - ) - experts_per_token = ( - top1_gate_experts_per_token + top2_gate_experts_per_token - ) - global_training_logs.update( - experts_per_token=experts_per_token.detach(), - top1_experts_per_token=top1_gate_experts_per_token.detach(), - top2_experts_per_token=top2_gate_experts_per_token.detach(), - leakage_experts_per_token=leakage_experts_per_token.detach(), - ) - elif ( - self.enable_logging - and global_training_logs_enabled() - and len(dispatch_mask) == 1 - ): - experts_per_token = top1_gate_experts_per_token - leakage_experts_per_token = ( - paddle.cast(~dispatch_mask[0], dtype="float32").sum() / seqlen - ) - global_training_logs.update( - experts_per_token=experts_per_token.detach(), - top1_experts_per_token=top1_gate_experts_per_token.detach(), - leakage_experts_per_token=leakage_experts_per_token.detach(), - ) - return router_loss def combine_expert_output(self, expert_output, combine_weights, scatter_index): @@ -1435,7 +1314,6 @@ def __init__( shared_experts: Optional[List[nn.Layer]] = None, group: Group = None, recompute=False, - enable_logging: bool = False, k=2, enable_pbr: bool = False, 
all_to_all_dropout=0, @@ -1462,7 +1340,6 @@ def __init__( self.group = group self.k = k self.all_to_all_dropout = all_to_all_dropout - self.enable_logging = enable_logging is_mp_moe = ( hasattr(fleet.fleet, "_hcg") and group is fleet.get_hybrid_communicate_group().get_model_parallel_group() diff --git a/examples/pre-training/models/moe/top2_gate_auto.py b/examples/pre-training/models/moe/top2_gate_auto.py index 7c50aa5a..93414a04 100644 --- a/examples/pre-training/models/moe/top2_gate_auto.py +++ b/examples/pre-training/models/moe/top2_gate_auto.py @@ -26,13 +26,9 @@ from paddle.utils import unique_name from paddle.nn.clip import _squared_l2_norm from paddle.distributed import fleet -from models.utils import global_training_logs_enabled from models.moe.moe_utils_auto import get_mesh, get_flatten_mesh -try: - from src.utils.misc import global_training_logs -except ModuleNotFoundError: - global_training_logs = {} + try: import moe_router_loss_ops except ImportError: @@ -368,7 +364,6 @@ def __init__(self, config, layer_idx: int, group, gate_weight=None) -> None: self.expert_drop = False self.eye_matrix = None self.eye_matrix_size = None - self.enable_logging = config.moe_logging self.norm_gate_logits = config.moe_norm_gate_logits self.one = paddle.ones([], dtype="float32") @@ -581,25 +576,6 @@ def forward( + orthogonal_loss * self.moe_orthogonal_loss_lambda ) router_loss.stop_gradient = False - if self.enable_logging and global_training_logs_enabled(): - _log = { - f"aux_loss_layer_{self.layer_idx}": l_aux.item(), - f"orthogonal_loss_layer_{self.layer_idx}": orthogonal_loss.item(), - f"zloss_layer_{self.layer_idx}": l_zloss.item(), - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) - if self.use_token_type_bias: - _bias_log = { - f"token_type_bias_layer_{self.layer_idx}_expert{i}_gap": v - for i, v in enumerate((self.bias[0] - self.bias[1]).numpy()) - } - 
global_training_logs.update(**_bias_log) combine_weights = combine_weights.cast(orig_dtype) return ( @@ -690,22 +666,6 @@ def top2_gating(self, logits, cap=None, correction_bias=None): else: indices2_s = indices2_s_original - if self.enable_logging and global_training_logs_enabled(): - global_training_logs.update( - **{ - "redispatch_acc": (indices2_s_original == indices2_s) - .cast(paddle.float32) - .mean() - .item(), - f"redispatch_acc_layer_{self.layer_idx}": ( - indices2_s_original == indices2_s - ) - .cast(paddle.float32) - .mean() - .item(), - } - ) - mask2 = F.one_hot(indices2_s, num_classes=self.num_experts).cast(paddle.int64) locations1 = paddle.cumsum(mask1, axis=0) - 1 @@ -749,58 +709,6 @@ def top2_gating(self, logits, cap=None, correction_bias=None): scatter2_index = expert2_index * capacity + locations2_s scatter2_index = scatter2_index.cast("int64") dispatch2_mask = combine2_weight.cast(paddle.bool).detach() - if self.enable_logging and global_training_logs_enabled(): - global_training_logs.update( - **{ - "top1_gate": ( - combine1_weight.sum() - / (dispatch1_mask.cast("float32").sum() + 1e-9) - ).item(), - "top2_gate": ( - combine2_weight.sum() - / (dispatch2_mask.cast("float32").sum() + 1e-9) - ).item(), - f"top1_gate_layer_{self.layer_idx}": ( - combine1_weight.sum() - / (dispatch1_mask.cast("float32").sum() + 1e-9) - ).item(), - f"top2_gate_layer_{self.layer_idx}": ( - combine2_weight.sum() - / (dispatch2_mask.cast("float32").sum() + 1e-9) - ).item(), - } - ) - - seqlen = logits.shape[0] - top1_gate_experts_per_token = ( - paddle.cast(dispatch1_mask, dtype="float32").sum() / seqlen - ) - top2_gate_experts_per_token = ( - paddle.cast(dispatch2_mask, dtype="float32").sum() / seqlen - ) - leakage_experts_per_token = ( - paddle.cast( - (~dispatch1_mask) & (~dispatch2_mask), dtype="float32" - ).sum() - / seqlen - ) - - experts_per_token = ( - top1_gate_experts_per_token + top2_gate_experts_per_token - ) - _log = { - 
f"experts_per_token_layer_{self.layer_idx}": experts_per_token.item(), - f"top1_experts_per_token_layer_{self.layer_idx}": top1_gate_experts_per_token.item(), - f"top2_experts_per_token_layer_{self.layer_idx}": top2_gate_experts_per_token.item(), - f"leakage_experts_per_token_layer_{self.layer_idx}": leakage_experts_per_token.item(), - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) return ( capacity, @@ -956,24 +864,8 @@ def forward( ), f"token_type_ids {token_type_ids.max()} >= bias shape {self.bias.shape[0]}" bias = self.bias[token_type_ids] logits = logits + bias - orthogonal_loss = None router_loss = paddle.zeros([1], dtype="float32") router_loss.stop_gradient = False - if ( - self.enable_logging - and global_training_logs_enabled() - and orthogonal_loss is not None - ): - _log = { - f"orthogonal_loss_layer_{self.layer_idx}": orthogonal_loss.item(), - } - global_training_logs.update( - **_log, - **{ - k.replace(f"_layer_{self.layer_idx}", ""): v - for k, v in _log.items() - }, - ) return logits, capacity, router_loss diff --git a/examples/pre-training/models/utils_auto.py b/examples/pre-training/models/utils_auto.py new file mode 100644 index 00000000..364323e7 --- /dev/null +++ b/examples/pre-training/models/utils_auto.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from typing import Any, Callable, List + +import paddle +from paddle import framework + +logger = logging.getLogger(__name__) + +try: + import moe_permutation + +except ImportError: + moe_permutation = None + logger.warning("moe_permutation is not installed.") + + +def detach_and_requires_grad_(*args): + ret = [a.detach() if a is not None else None for a in args] + for r, a in zip(ret, args): + if a is not None: + r.stop_gradient = a.stop_gradient + return ret + + +class FakeClone(paddle.autograd.PyLayer): + @staticmethod + def forward(ctx, input): + if input.is_contiguous(): + fake_output = paddle.empty_like(input) + input._share_buffer_to(fake_output) + else: + fake_output = input.clone() + return fake_output + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +def manual_backward(f: Callable, is_first_fwd: bool, *args: List[Any]): + tracer = framework._dygraph_tracer() + orig = tracer._has_grad + if not is_first_fwd: + tracer._has_grad = True + + detached_args = detach_and_requires_grad_(*args) + detached_args_clone = [ + FakeClone.apply(a) if a is not None else None for a in detached_args + ] + out = f(*detached_args_clone) + if isinstance(out, list): + out = tuple(out) + elif not isinstance(out, tuple): + out = (out,) + + if is_first_fwd: + tracer._has_grad = orig + return None, out + + out_cached = [FakeClone.apply(o) for o in out if o is not None] + + for o in out_cached: + o._clear_dataptr() + tracer._has_grad = orig + + def bwd_f(*grad): + nonlocal out_cached, detached_args, f + grad = list(grad) + grad = [g for g in grad if g is not None] + assert grad and out_cached, (len(grad), len(out_cached)) + grad, out_cached = zip( + *[(g, o) for g, o in zip(grad, out_cached) if not o.stop_gradient] + ) + + assert len(grad) == len(out_cached), (len(grad), len(out_cached), f) + paddle.autograd.backward(out_cached, grad) + return tuple([t.grad for t in detached_args if t is not None]) + + return bwd_f, out From 
4ee547115a02fc89e03b7353b68e95726ab916b6 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Tue, 19 Aug 2025 22:17:09 +0800 Subject: [PATCH 13/15] remove custom fused_ln --- .../models/ernie/configuration_auto.py | 2 - .../models/ernie/modeling_auto.py | 50 ++----------------- .../models/ernie/modeling_auto_pp.py | 9 +--- 3 files changed, 5 insertions(+), 56 deletions(-) diff --git a/examples/pre-training/models/ernie/configuration_auto.py b/examples/pre-training/models/ernie/configuration_auto.py index 58070f60..fc858468 100644 --- a/examples/pre-training/models/ernie/configuration_auto.py +++ b/examples/pre-training/models/ernie/configuration_auto.py @@ -127,7 +127,6 @@ def __init__( use_rmsnorm=True, z_loss_lambda=None, fuse_rms_norm=False, - fuse_ln=False, pad_token_id=0, bos_token_id=1, eos_token_id=2, @@ -230,7 +229,6 @@ def __init__( self.fuse_attn_ffn = fuse_attn_ffn self.fuse_swiglu = fuse_swiglu self.fuse_rms_norm = fuse_rms_norm - self.fuse_ln = fuse_ln self.use_rmsnorm = use_rmsnorm self.z_loss_lambda = z_loss_lambda self.using_dynamic_sequence_length = using_dynamic_sequence_length diff --git a/examples/pre-training/models/ernie/modeling_auto.py b/examples/pre-training/models/ernie/modeling_auto.py index 4387c02e..ef145aab 100644 --- a/examples/pre-training/models/ernie/modeling_auto.py +++ b/examples/pre-training/models/ernie/modeling_auto.py @@ -119,13 +119,6 @@ class CausalLMOutputWithCrossAttentionsAuto(CausalLMOutputWithCrossAttentions): except ImportError: fast_ln = None -try: - import fused_ln as fused -except ImportError: - logger.warning( - "fused-ln not found, run `python src/ops/fused_ln_setup.py install` to build fused ln" - ) - fused = None try: from paddle.incubate.nn.functional import ( @@ -765,7 +758,7 @@ def __init__(self, config, ipp=0): def forward(self, hidden_states): if self.config.fuse_rms_norm: - return fused.fused_rms_norm( + return paddle.incubate.nn.functional.fused_rms_norm_ext( hidden_states, self.weight, self.variance_epsilon 
)[0] if paddle.in_dynamic_mode(): @@ -809,37 +802,6 @@ def forward(self, hidden_states): return super().forward(hidden_states) -class FusedLayerNorm(nn.Layer): - - def __init__(self, config, ipp=0): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.weight = paddle.create_parameter( - shape=[self.hidden_size], - dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(1.0), - ) - self.bias = paddle.create_parameter( - shape=[self.hidden_size], dtype=paddle.get_default_dtype(), is_bias=True - ) - self.variance_epsilon = config.rms_norm_eps - self.ipp = ipp - if config.pipeline_parallel_degree > 1: - self.weight = dist.shard_tensor( - self.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] - ) - self.bias = dist.shard_tensor( - self.bias, get_mesh(self.ipp), [dist.Replicate(), dist.Replicate()] - ) - - def forward(self, hidden_states): - - return fused.fused_ln( - hidden_states, self.weight, self.bias, self.variance_epsilon - )[0] - - class RotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=4096, base=10000): @@ -1528,8 +1490,6 @@ def __init__(self, config, layer_idx=0, ipp=0): else: self.mlp = ErnieMLP(config, ipp) Norm = RMSNorm if config.use_rmsnorm else LayerNorm - if not config.use_rmsnorm and config.fuse_ln: - Norm = FusedLayerNorm self.input_layernorm = Norm(config, ipp) self.post_attention_layernorm = Norm(config, ipp) self.residual_add1 = FusedDropoutImpl( @@ -2030,8 +1990,7 @@ def get_layer_pp_info(ipp): self.next_pp_stage_indexes.append(layer_idx) self.layers = nn.LayerList(layers_list) Norm = RMSNorm if config.use_rmsnorm else LayerNorm - if not config.use_rmsnorm and config.fuse_ln: - Norm = FusedLayerNorm + self.norm = Norm(config, -1) self.gradient_checkpointing = False @@ -2668,10 +2627,7 @@ def __init__(self, config): else: logger.info("Use normal RMSNorm") else: - if self.config.fuse_ln: - logger.info("Use fusedLN") - else: - logger.info("Use normal 
LayerNorm") + logger.info("Use normal LayerNorm") def _post_init(self, original_init, *args, **kwargs): """ diff --git a/examples/pre-training/models/ernie/modeling_auto_pp.py b/examples/pre-training/models/ernie/modeling_auto_pp.py index 5f74b76f..bf192e40 100644 --- a/examples/pre-training/models/ernie/modeling_auto_pp.py +++ b/examples/pre-training/models/ernie/modeling_auto_pp.py @@ -39,7 +39,6 @@ ErniePretrainedModelAuto, LayerNorm, RMSNorm, - FusedLayerNorm, ErniePretrainingCriterion, ErnieLMHead, ) @@ -275,8 +274,7 @@ def __init__(self, config, layer_idx=0, ipp=0): self.layer = ErnieDecoderLayerAuto(config, layer_idx, ipp) Norm = RMSNorm if config.use_rmsnorm else LayerNorm - if not config.use_rmsnorm and config.fuse_ln: - Norm = FusedLayerNorm + if self.layer_idx == self.config.num_hidden_layers - 1: self.norm = Norm(config, -1) self.lm_head = ErnieLMHead(config) @@ -556,10 +554,7 @@ def __init__(self, config): else: logger.info("Use normal RMSNorm") else: - if self.config.fuse_ln: - logger.info("Use fusedLN") - else: - logger.info("Use normal LayerNorm") + logger.info("Use normal LayerNorm") decoder_layers = [] From 96d762d2b36b5b7fd5a0f75b855f64cb63ee2708 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Wed, 20 Aug 2025 09:57:35 +0800 Subject: [PATCH 14/15] base --- .../pre-training/models/moe/moe_layer_auto.py | 520 +----------------- .../pre-training/models/moe/top2_gate_auto.py | 286 +--------- 2 files changed, 15 insertions(+), 791 deletions(-) diff --git a/examples/pre-training/models/moe/moe_layer_auto.py b/examples/pre-training/models/moe/moe_layer_auto.py index 76d1294b..85d0c79c 100644 --- a/examples/pre-training/models/moe/moe_layer_auto.py +++ b/examples/pre-training/models/moe/moe_layer_auto.py @@ -50,22 +50,11 @@ ) -try: - import moe_router_loss_ops -except ImportError: - moe_router_loss_ops = None logger = logging.getLogger(__name__) -try: - import moe_ops -except ImportError: - moe_ops = None - logger.warning( - "`moe-ops` not found, run " - 
"`python3 src/ernie_core/ops/moe/setup.py install` to install" - ) + try: import moe_ops_auto @@ -85,19 +74,7 @@ "`python3 src/ernie_core/ops/moe/setup_auto.py install` to install" ) -try: - from moe_combine import moe_combine_no_weight -except ImportError: - moe_combine_no_weight = None -try: - import moe_ops_fp8 -except ImportError: - moe_ops_fp8 = None - logger.warning( - "`moe-ops` not found, run " - "`python3 src/ernie_core/ops/moe/setup_fp8.py install` to install" - ) @contextmanager @@ -192,204 +169,6 @@ def combining(x, combine_weights, scatter_index): return paddle.matmul(combine_weights, x).squeeze(1) -class Fp8MoeGateDispatchAndQuant(paddle.autograd.PyLayer): - - @staticmethod - def forward( - ctx, x, gate_logtis, corr_bias, k, capacity, use_pad, use_pow2_scale=True - ): - ( - out_fp8, - scale, - combine_weights, - scatter_index, - expert_offset, - expert_id, - ) = moe_ops_fp8.moe_gate_dispatch_and_quant( - x, - gate_logtis, - corr_bias=corr_bias, - k=k, - capacity=capacity, - use_pad=use_pad, - use_pow2_scale=use_pow2_scale, - ) - assert out_fp8.shape[0] == scale.shape[0] - - out_fp8.stop_gradient = False - combine_weights.stop_gradient = False - scatter_index.stop_gradient = True - expert_offset.stop_gradient = True - expert_id.stop_gradient = True - scale.stop_gradient = True - - ctx.k = k - ctx.capacity = capacity - ctx.use_pad = use_pad - ctx.combine_weights = combine_weights - ctx.scatter_index = scatter_index - ctx.expert_id = expert_id - ctx.has_corr_bias = corr_bias is not None - - return ( - out_fp8, - combine_weights, - scatter_index, - expert_offset, - expert_id, - { - "scale": scale, - }, - ) - - @staticmethod - def backward(ctx, *grads): - out_grad, combine_weights_grad = grads[0], grads[1] - x_grad, gate_logits_grad = moe_ops.moe_gate_dispatch_bwd( - ctx.combine_weights, - ctx.scatter_index, - ctx.expert_id, - out_grad, - combine_weights_grad, - k=ctx.k, - capacity=ctx.capacity, - use_pad=ctx.use_pad, - ) - if ctx.has_corr_bias: - return 
x_grad, gate_logits_grad, None - else: - return x_grad, gate_logits_grad - - -class AlltoAll(PyLayer): - - @staticmethod - def forward(ctx, x, group, sync_op=True): - - ctx.group = group - if dist.get_world_size(group) <= 1: - return x - output = paddle.empty_like(x) - output.stop_gradient = False - task = stream.alltoall_single( - output, x, None, None, group, sync_op=sync_op, use_calc_stream=sync_op - ) - if not sync_op: - return output, task - else: - return output - - @staticmethod - def backward(ctx, *dx): - return AlltoAll.apply(*dx, group=ctx.group) - - -class AlltoAllExpertOverlap(PyLayer): - - @staticmethod - def forward( - ctx, input, group, num_local_experts, forward_func_dict, is_first_fwd=False - ): - assert ( - dist.get_world_size(group) > 1 - ), "AlltoAllExpertOverlap is not supported for a world size less than or equal to 1." - - ctx.bw_funcs = {} - ctx.group = group - ctx.num_local_experts = num_local_experts - - assert isinstance(forward_func_dict, nn.LayerList) - all2all_tasks = [] - all2all_ins = paddle.unbind(input, axis=0) - for stage_id in range(1): - stage_input = all2all_ins[stage_id] - x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) - all2all_tasks.append((task, x_out)) - - expert_outputs = [] - for stage_id in range(num_local_experts): - if stage_id + 1 != num_local_experts: - stage_input = all2all_ins[stage_id + 1] - x_out, task = AlltoAll.apply(stage_input, group=group, sync_op=False) - all2all_tasks.append((task, x_out)) - - task, dispatched_input = all2all_tasks[stage_id] - task.wait() - bwf, (expert_outputs_cur_stage,) = manual_backward( - forward_func_dict[stage_id], is_first_fwd, dispatched_input - ) - ctx.bw_funcs[stage_id] = bwf - expert_outputs.append(expert_outputs_cur_stage) - - expert_output = paddle.stack(expert_outputs, axis=1) - return expert_output - - @staticmethod - def backward(ctx, out_grad): - all2all_tasks = [] - expert_outputs = [] - - out_grad_list = paddle.split( - out_grad, 
num_or_sections=out_grad.shape[1], axis=1 - ) - for stage_id in range(ctx.num_local_experts): - (grad_cur_stage,) = ctx.bw_funcs[stage_id](out_grad_list[stage_id]) - - x_out, task = AlltoAll.apply(grad_cur_stage, group=ctx.group, sync_op=False) - all2all_tasks.append(task) - expert_outputs.append(x_out) - - for task in all2all_tasks: - task.wait() - - expert_output = paddle.stack(expert_outputs, axis=0) - return expert_output - - -class AlltoAllAsync(PyLayer): - - @staticmethod - def forward(ctx, x, *fn_args, group=None, fn=None, is_first_fwd=False): - - assert fn is not None, "use AlltoAll no async" - ctx.group = group - if dist.get_world_size(group) <= 1: - ctx.bwf, fn_out = manual_backward(fn, is_first_fwd, *fn_args) - return (x,) + fn_out - x_out = paddle.empty_like(x) - x_out.stop_gradient = False - task = stream.alltoall_single( - x_out, - x, - None, - None, - group, - sync_op=False, - ) - ctx.bwf, fn_out = manual_backward(fn, is_first_fwd, *fn_args) - task.wait() - return (x_out,) + fn_out - - @staticmethod - def backward(ctx, dx_out, *fn_out_grads): - if dist.get_world_size(ctx.group) <= 1: - fn_args_grads = ctx.bwf(*fn_out_grads) - return (dx_out,) + fn_args_grads - - dx = paddle.empty_like(dx_out) - dx.stop_gradient = False - task = stream.alltoall_single( - dx, - dx_out, - None, - None, - ctx.group, - sync_op=False, - ) - fn_args_grads = ctx.bwf(*fn_out_grads) - task.wait() - return (dx,) + fn_args_grads - class MOELayer(nn.Layer): @@ -614,115 +393,6 @@ def fused_gate_logits_process( prob = self.gate.act(gate_logits) return prob, max_prob - def gate_distpach_and_quant(self, input, token_type_ids): - - assert isinstance(self.gate, (TopKGateFused)), "Only fused gate is supported." 
- assert not self.config.use_ep_comm_overlap, "ep_comm_overlap is not supported" - assert ( - self._rr_moe_gate_dispatch is None - ), "rr_moe_gate_dispatch is not supported" - assert moe_ops_fp8 is not None - - args = () - if token_type_ids is not None: - token_type_ids = token_type_ids.reshape([-1]) - args = (token_type_ids,) - - ( - gate_logits, - capacity, - router_loss, - ) = self.gate(input, *args) - - if self.config.moe_multimodal_paired_experts: - assert token_type_ids is not None - input = paddle.concat( - [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 - ) - if self.input_preprocess is not None: - input, gate_logits = self.input_preprocess(input, gate_logits, capacity) - - k = self.k - prob, max_prob = self.fused_gate_logits_process(gate_logits, token_type_ids) - - with profile("dispatch_op"): - corr_bias = ( - self.moe_statics.e_score_correction_bias[0].detach() - if self.use_correction_bias - else None - ) - - ( - dispatched_input, - combine_weights_unnorm, - scatter_index, - dispatch_mask, - _, - fp8_dispatched_handle, - ) = Fp8MoeGateDispatchAndQuant.apply( - input, prob, corr_bias, k=k, capacity=capacity, use_pad=True - ) - - dispatch_mask = paddle.diff(F.pad(dispatch_mask, (1, 0))) - if self.use_correction_bias: - if self.gate.config.multimodel_experts: - for i in range(len(self.moe_statics.expert_usage)): - self.moe_statics.expert_usage[i] += dispatch_mask[ - self.gate.experts_type_mask[i] - ].detach() - else: - self.moe_statics.expert_usage[0] += dispatch_mask.detach() - dispatched_input.stop_gradient = False - combine_weights_unnorm.stop_gradient = False - scatter_index.stop_gradient = True - dispatch_mask.stop_gradient = True - - scatter_index = scatter_index.transpose([1, 0]) - if self.group_experts: - if max_prob is not None: - if token_type_ids is not None: - p = paddle.ones_like(combine_weights_unnorm.unsqueeze(-1)) - p = paddle.scatter_nd_add( - p, paddle.nonzero(token_type_ids == 0), -1 + max_prob - ) - else: - p = 
max_prob - combine_weights_unnorm = ( - combine_weights_unnorm.unsqueeze(-1) * p - ).squeeze(-1) - prob = (prob.reshape([p.shape[0], k, -1]) * p).reshape([p.shape[0], -1]) - if self.gate.norm_gate_logits: - combine_weights = combine_weights_unnorm / paddle.clip( - combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12 - ) - else: - combine_weights = combine_weights_unnorm - combine_weights = combine_weights.cast("bfloat16") - - def reshape_for_a2a(tensor): - return tensor.reshape( - [ - self.world_size * self.num_local_experts, - capacity, - -1, - ] - ) - - dispatched_input = reshape_for_a2a(dispatched_input) - fp8_dispatched_handle["scale"] = reshape_for_a2a(fp8_dispatched_handle["scale"]) - dispatch_mask.stop_gradient = True - scatter_index.stop_gradient = True - return ( - dispatched_input, - combine_weights, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - prob, - fp8_dispatched_handle, - ) - def gate_and_distpach(self, input, token_type_ids): seqlen, d_model = input.shape @@ -958,9 +628,7 @@ def _calc_router_loss( router_loss += ( self.gate.moe_orthogonal_loss_lambda[token_type or 0] * orthogonal_loss ) - if self.gate.config.moe_z_loss_lambda and not in_auto_parallel_align_mode(): - zloss = self.gate._cal_z_loss(gate_logits, tokens_type_mask) - router_loss += self.gate.moe_z_loss_lambda[token_type or 0] * zloss + return router_loss @@ -1075,188 +743,8 @@ def forward( self, input: Tensor, token_type_ids=None, - ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: - - if input.ndim == 3: - orig_shape = input.shape - input = input.reshape([-1, input.shape[-1]]) - else: - orig_shape = None - assert ( - len(input.shape) == 2 - ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}" - hidden_size = input.shape[1] - if token_type_ids is not None: - token_type_ids = token_type_ids.clone()[:, :-1] - if self.config.sequence_parallel: - token_type_ids = token_type_ids.reshape([-1]) - token_type_ids = 
ScatterOp.apply(token_type_ids) - token_type_ids.stop_gradient = True - - assert self.gate is not None - if hasattr(self, "rng") and self.rng.random() < self.all_to_all_dropout: - orig_shape_2 = input.shape - if self.config.moe_multimodal_paired_experts: - assert token_type_ids is not None - input = paddle.concat( - [input, token_type_ids.unsqueeze(-1).astype(input.dtype)], axis=-1 - ) - output = self.forward_experts(input) - output += self.gate.weight.sum() * 0.0 - output = output.reshape(orig_shape or orig_shape_2) - return output, None, 0 - - is_first_fwd = not framework._dygraph_tracer()._has_grad - use_async = self.shared_experts is not None - if in_auto_parallel_align_mode(): - gate_input = paddle.assign(input) - else: - gate_input = input - - use_fp8_fuse_node = ( - self.config.use_combine_before_a2a and self.config.use_fp8_fuse_node - ) - use_fp8_dispatch_a2a = self.config.use_fp8_dispatch_a2a and use_fp8_fuse_node - - with profile("fused_gate_and_dispatch"): - fp8_dispatched_handle = None - if use_fp8_dispatch_a2a: - ( - dispatched_input, - combine_weights, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - gate_prob, - fp8_dispatched_handle, - ) = self.gate_distpach_and_quant(gate_input, token_type_ids) - else: - ( - dispatched_input, - combine_weights, - dispatch_mask, - scatter_index, - router_loss, - gate_logits, - gate_prob, - ) = self.gate_and_distpach(gate_input, token_type_ids) - - if self.config.use_combine_before_a2a: - assert ( - not self.config.use_ep_comm_overlap - ), "Dont support use_ep_comm_overlap" - assert ( - moe_combine_no_weight is not None - ), "use_combine_before_a2a can only use with moe_combine_no_weight op, please install it first." 
- cw_shape = combine_weights.shape - si_shape = scatter_index.shape - scatter_index = scatter_index.reshape([-1]) - - token_combine_weights = paddle.zeros( - [cw_shape[0] * cw_shape[1]], dtype=combine_weights.dtype - ) - token_combine_weights = paddle.scatter( - token_combine_weights, - scatter_index, - combine_weights.reshape([-1]), - overwrite=False, - ) - - token_combine_weights = token_combine_weights.reshape( - [cw_shape[0], cw_shape[1], 1] - ) - token_combine_weights = AlltoAll.apply(token_combine_weights, self.group) - - if not self.config.use_ep_comm_overlap: - if use_fp8_dispatch_a2a: - shared_out = ( - self.shared_experts(input) - if self.shared_experts is not None - else None - ) - else: - with profile("moe_comm_and_shared_expert"): - if use_async: - dispatched_input, shared_out = AlltoAllAsync.apply( - dispatched_input, - input, - group=self.group, - fn=self.shared_experts, - is_first_fwd=is_first_fwd, - ) - else: - dispatched_input = AlltoAll.apply(dispatched_input, self.group) - - expert_out = ( - recompute(self.forward_experts, dispatched_input) - if self.recompute and self.training - else self.forward_experts(dispatched_input) - ) - - if self.config.use_combine_before_a2a: - token_combine_weights = token_combine_weights.clone().reshape( - expert_out.shape[:-1] + [1] - ) - expert_out = expert_out * token_combine_weights - else: - assert ( - len(dispatched_input.shape) == 4 - and dispatched_input.shape[1] == self.world_size - and dispatched_input.shape[0] == self.num_local_experts - ), ( - f"When using ep_comm_overlap, moe_gate_dispatch_permute is needed. 
" - f"Expected dispatched_input to have shape[1] == {self.world_size} " - f"and shape[0] == {self.num_local_experts}, " - f"but got shape {dispatched_input.shape}" - ) - with profile("moe_comm_and_forward_expert"): - expert_out = AlltoAllExpertOverlap.apply( - dispatched_input, - self.group, - self.num_local_experts, - self.experts, - is_first_fwd=is_first_fwd, - ) - if self.shared_experts is not None: - shared_out = self.shared_experts(input) - - with profile("moe_comm_and_calc_routerloss"): - expert_out, router_loss2 = AlltoAllAsync.apply( - expert_out, - router_loss, - combine_weights, - dispatch_mask, - gate_logits, - gate_prob, - token_type_ids, - group=self.group, - fn=self.calc_router_loss_and_logging, - is_first_fwd=is_first_fwd, - ) - - with profile("combine"): - if self.config.use_combine_before_a2a: - expert_out = expert_out.reshape([-1, hidden_size]) - - scatter_index = scatter_index.reshape(si_shape) - combined_output = moe_combine_no_weight( - expert_out, combine_weights, scatter_index, epsilon=1e-15 - ) - else: - combined_output = self.combine_expert_output( - expert_out, combine_weights, scatter_index - ) - - if self.shared_experts is not None: - combined_output += shared_out - - if orig_shape: - combined_output = combined_output.clone().reshape( - orig_shape[:-1] + [combined_output.shape[-1]] - ) - return combined_output, combine_weights, router_loss2, gate_logits - + ) : + pass def combining_fused_auto(x, combine_weights, scatter_index, hard_gate=False): """ diff --git a/examples/pre-training/models/moe/top2_gate_auto.py b/examples/pre-training/models/moe/top2_gate_auto.py index 93414a04..989329b7 100644 --- a/examples/pre-training/models/moe/top2_gate_auto.py +++ b/examples/pre-training/models/moe/top2_gate_auto.py @@ -29,177 +29,15 @@ from models.moe.moe_utils_auto import get_mesh, get_flatten_mesh -try: - import moe_router_loss_ops -except ImportError: - moe_router_loss_ops = None - try: from custom_setup_ops import matmul_bwd except 
ImportError: matmul_bwd = None -try: - from bincount_ops import int_bincount -except ImportError: - int_bincount = None logger = logging.getLogger(__name__) -class CalOrthogonalLossOptEachWeightFunctor(paddle.autograd.PyLayer): - - @staticmethod - def forward(ctx, gate_weight, moe_k, use_group, eps=1e-12): - if gate_weight.dtype != paddle.float32: - gate_weight = gate_weight.astype(paddle.float32) - ( - orthogonal_loss, - wnorm, - weight_scale, - normed_weight, - weight_matmul, - ) = moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight( - gate_weight, moe_k, use_group, eps - ) - ctx.save_for_backward( - gate_weight, wnorm, weight_scale, normed_weight, weight_matmul - ) - ctx.moe_k = moe_k - ctx.use_group = use_group - ctx.eps = eps - return orthogonal_loss - - @staticmethod - def backward(ctx, out_grad): - gate_weight, wnorm, weight_scale, normed_weight, weight_matmul = ( - ctx.saved_tensor() - ) - if gate_weight.stop_gradient: - return None - moe_k = ctx.moe_k - use_group = ctx.use_group - eps = ctx.eps - return moe_router_loss_ops.cal_orthogonal_loss_opt_each_weight_grad( - out_grad, - wnorm, - weight_scale, - normed_weight, - weight_matmul, - moe_k, - use_group, - eps, - ) - - -class CalZLossFunctor(paddle.autograd.PyLayer): - - @staticmethod - def forward(ctx, logits, loss_mask=None, clip_min=1e-6): - if loss_mask is not None: - assert loss_mask.stop_gradient - loss, max_logits, safe_sumexp, logsumexp_per_token = ( - moe_router_loss_ops.cal_z_loss(logits, loss_mask, clip_min) - ) - ctx.save_for_backward( - logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token - ) - ctx.clip_min = clip_min - return loss - - @staticmethod - def backward(ctx, out_grad): - logits, loss_mask, max_logits, safe_sumexp, logsumexp_per_token = ( - ctx.saved_tensor() - ) - if logits.stop_gradient: - return None - clip_min = ctx.clip_min - return moe_router_loss_ops.cal_z_loss_grad( - out_grad, - logits, - loss_mask, - max_logits, - safe_sumexp, - logsumexp_per_token, - 
clip_min, - ) - - -class CalAuxLossFunctor(paddle.autograd.PyLayer): - - @staticmethod - def forward( - ctx, - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - moe_k, - clip_min=1e-6, - ): - if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: - tokens_mask = tokens_mask.astype(gate_prob.dtype) - loss, seqlen_float, ce = paddle.incubate.nn.functional.cal_aux_loss( - gate_prob, - dispatch_mask, - tokens_mask, - dispatch_tokens_mask, - num_experts, - use_group, - moe_k, - clip_min, - ) - ctx.save_for_backward(gate_prob, seqlen_float, ce) - ctx.num_experts = num_experts - ctx.use_group = use_group - ctx.moe_k = moe_k - return loss - - @staticmethod - def backward(ctx, out_grad): - gate_prob, seqlen_float, ce = ctx.saved_tensor() - num_experts = ctx.num_experts - use_group = ctx.use_group - moe_k = ctx.moe_k - return paddle.incubate.nn.functional.cal_aux_loss_grad( - out_grad, gate_prob, seqlen_float, ce, num_experts, use_group, moe_k - ) - - -def cal_orthogonal_loss_opt_each_weight_func( - weight, moe_k, use_group, eps, training=True -): - weight = weight.transpose([1, 0]).contiguous() - wnorm = weight.norm(axis=1) - weight = weight / paddle.maximum(wnorm, eps).unsqueeze(1) - - if use_group: - weight = weight.reshape([moe_k, -1, weight.shape[1]]) - eye_matrix = paddle.eye(weight.shape[1], dtype=weight.dtype).unsqueeze(0) - else: - eye_matrix = paddle.eye(weight.shape[0], dtype=weight.dtype) - - weight_matmul = paddle.matmul(weight, weight, transpose_y=True) - - orthogonal_loss = weight_matmul - eye_matrix - orthogonal_loss = _squared_l2_norm(orthogonal_loss) / orthogonal_loss.size - return orthogonal_loss - - -def cal_z_loss_func(logits, loss_mask): - if loss_mask is not None: - loss_mask = loss_mask.astype(logits.dtype) - l_zloss = (logits.logsumexp(1).square() * loss_mask).sum() / paddle.clip( - loss_mask.sum(), min=1e-6 - ) - else: - l_zloss = logits.logsumexp(1).square().mean() - return l_zloss - - 
def cal_aux_loss_func( gate_prob, dispatch_mask, @@ -252,7 +90,6 @@ def cal_aux_loss_func( return l_aux - def masked_fill(x, mask, value): y = paddle.full(x.shape, value, x.dtype) @@ -547,45 +384,8 @@ def forward( token_type_ids: Tensor = None, transform_weight: bool = True, correction_bias: Tensor = None, - ) -> Tuple[Tensor, Tensor, Tensor]: - - orig_dtype = input.dtype - weight = self.get_gate_weight(transform_weight) - with paddle.amp.auto_cast(False): - - logits = gate_detach_matmul( - input, weight, self.fuse_gate_detach_matmul, self.use_fake_gate - ) - - if self.use_token_type_bias: - assert token_type_ids is not None - bias = self.bias[token_type_ids] - logits = logits + bias - ( - capacity, - dispatch_mask, - combine_weights, - scatter_index, - l_aux, - l_zloss, - ) = self.top2_gating(logits, correction_bias=correction_bias) - orthogonal_loss = self._cal_orthogonal_loss() - router_loss = ( - l_aux * self.moe_aux_loss_lambda - + l_zloss * self.moe_z_loss_lambda - + orthogonal_loss * self.moe_orthogonal_loss_lambda - ) - router_loss.stop_gradient = False - - combine_weights = combine_weights.cast(orig_dtype) - return ( - capacity, - dispatch_mask, - combine_weights, - scatter_index, - router_loss, - logits, - ) + ): + pass def get_capacity(self, num_tokens, cap_factor=None): @@ -739,15 +539,10 @@ def _cal_aux_loss( _, top_idx = gate_prob_this_modality.topk( k=self.config.moe_k, axis=-1 ) - if int_bincount is not None: - dispatch_mask = int_bincount( - top_idx, 0, gate_prob.shape[-1], paddle.int64 - ) - else: - mask = paddle.zeros_like( - gate_prob_this_modality - ).put_along_axis(top_idx, paddle.to_tensor(1.0), axis=1) - dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) + mask = paddle.zeros_like( + gate_prob_this_modality + ).put_along_axis(top_idx, paddle.to_tensor(1.0), axis=1) + dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) else: dispatch_mask = paddle.zeros(gate_prob.shape[-1], dtype="int64") dist.stream.all_reduce( @@ -757,15 
+552,11 @@ def _cal_aux_loss( ) else: _, top_idx = gate_prob.topk(k=self.config.moe_k, axis=-1) - if int_bincount is not None: - dispatch_mask = int_bincount( - top_idx, 0, gate_prob.shape[-1], paddle.int64 - ) - else: - mask = paddle.zeros_like(gate_prob).put_along_axis( - top_idx, paddle.to_tensor(1.0), axis=1 - ) - dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) + + mask = paddle.zeros_like(gate_prob).put_along_axis( + top_idx, paddle.to_tensor(1.0), axis=1 + ) + dispatch_mask = paddle.sum(mask.cast(paddle.int64), axis=0) if num_experts is None: num_experts = self.num_experts_tensor @@ -785,61 +576,6 @@ def _cal_aux_loss( self.group if self.global_aux_loss else None, ) - def _cal_z_loss(self, logits, loss_mask=None): - - if ( - (moe_router_loss_ops is not None) - and (loss_mask is None or len(loss_mask.shape) == 1) - and (logits.dtype == paddle.float32) - ): - return CalZLossFunctor.apply(logits, loss_mask) - else: - return cal_z_loss_func(logits, loss_mask) - - def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group): - - if weight.dtype != paddle.float32: - weight = weight.astype(paddle.float32) - - if (moe_router_loss_ops is not None) and (weight.dtype == paddle.float32): - return CalOrthogonalLossOptEachWeightFunctor.apply( - weight, self.config.moe_k, use_group - ) - else: - return cal_orthogonal_loss_opt_each_weight_func( - weight, - self.config.moe_k, - use_group, - self.eps, - self.training, - ) - - def _cal_orthogonal_loss(self, weight_id=None, use_group=None): - - if use_group is None: - use_group = ( - self.config.moe_group_experts and self.config.moe_group_orthogonal_loss - ) - - if weight_id is not None: - if weight_id == 0: - w_ = self.weight - else: - assert self.config.multimodel_experts - w_ = getattr(self, f"weight_{weight_id}") - return self._cal_orthogonal_loss_opt_each_weight(w_, use_group) - - orthogonal_loss = self._cal_orthogonal_loss_opt_each_weight( - self.weight, use_group - ) - if self.config.multimodel_experts: - 
for i in range(1, len(self.config.moe_num_experts)): - w_ = getattr(self, f"weight_{i}") - orthogonal_loss += self._cal_orthogonal_loss_opt_each_weight( - w_, use_group=False - ) - return orthogonal_loss - class TopKGateFused(Top2Gate): From 1358ad59cf5de621a4272a1e3b366e293eabb8e5 Mon Sep 17 00:00:00 2001 From: xuexixi Date: Wed, 20 Aug 2025 09:57:53 +0800 Subject: [PATCH 15/15] switch custom op --- .../pre-training/models/moe/moe_layer_auto.py | 32 ++++--------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/examples/pre-training/models/moe/moe_layer_auto.py b/examples/pre-training/models/moe/moe_layer_auto.py index 85d0c79c..1e7e8136 100644 --- a/examples/pre-training/models/moe/moe_layer_auto.py +++ b/examples/pre-training/models/moe/moe_layer_auto.py @@ -56,24 +56,6 @@ -try: - import moe_ops_auto -except ImportError: - moe_ops_auto = None - logger.warning( - "`moe_ops_auto` not found, run " - "`python3 src/ernie_core/ops/moe/setup_auto.py install` to install" - ) - -try: - import moe_combine_auto -except ImportError: - moe_combine_auto = None - logger.warning( - "`moe_combine_auto` not found, run " - "`python3 src/ernie_core/ops/moe/setup_auto.py install` to install" - ) - @@ -105,7 +87,7 @@ def forward(ctx, x, combine_weights, scatter_index): ctx.combine_weights = combine_weights ctx.scatter_index = scatter_index assert moe_combine is not None - ret = moe_combine.moe_combine(x, combine_weights, scatter_index) + ret = paddle.incubate.nn.functional.moe_combine(x, combine_weights, scatter_index) return ret @staticmethod @@ -518,7 +500,7 @@ def gate_and_distpach(self, input, token_type_ids): self.moe_statics.expert_usage[0] += dispatch_mask.detach() dispatched_input.stop_gradient = False combine_weights_unnorm.stop_gradient = False - scatter_index.stop_gradient = True + scatter_index.stop_gradient = False dispatch_mask.stop_gradient = True scatter_index = scatter_index.transpose([1, 0]) @@ -586,7 +568,7 @@ def gate_and_distpach(self, input, 
token_type_ids): ) dispatched_input = dispatched_input dispatch_mask.stop_gradient = True - scatter_index.stop_gradient = True + scatter_index.stop_gradient = False return ( dispatched_input, combine_weights, @@ -759,7 +741,7 @@ def combining_fused_auto(x, combine_weights, scatter_index, hard_gate=False): if hard_gate: x_gatherd = F.embedding(scatter_index, x) return x_gatherd.squeeze(-2) - ret = moe_combine_auto.moe_combine_auto(x, combine_weights, scatter_index) + ret = paddle.incubate.nn.functional.moe_combine(x, combine_weights, scatter_index) ret.stop_gradient = False return ret @@ -985,9 +967,7 @@ def gate_and_distpach(self, input, token_type_ids): scatter_index, dispatch_mask, _, - ) = moe_ops_auto.moe_gate_dispatch_auto( - input, prob, k, local_capacity, True - ) + ) = paddle.incubate.nn.functional.moe_gate_dispatch(input, prob, None, k, local_capacity, True) dispatched_input.stop_gradient = False combine_weights_unnorm.stop_gradient = False dispatch_mask.stop_gradient = True @@ -1022,7 +1002,7 @@ def gate_and_distpach(self, input, token_type_ids): capacity=capacity, ) dispatch_mask.stop_gradient = True - scatter_index.stop_gradient = True + scatter_index.stop_gradient = False return ( dispatched_input, combine_weights,