
Commit f9afe2d

Support ring attention for llm sft/dpo/grpo (packing/padding_free only). (#4814)

1 parent 4f93387 commit f9afe2d
16 files changed: +1126 −621 lines changed
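The commit title is the only description, so a minimal single-process sketch of the ring-attention idea may help: the sequence is split into blocks, each rank keeps its own query block, and key/value blocks travel around a ring so every rank eventually attends over the whole sequence without ever holding it locally. This is illustrative only, not ms-swift's code: it is non-causal, and it concatenates scores where real implementations accumulate with an online softmax and overlap the ring communication with compute.

import torch

def ring_attention_sketch(q, k, v, world_size):
    # q, k, v: (seq_len, head_dim), seq_len divisible by world_size
    q_blocks, k_blocks, v_blocks = q.chunk(world_size), k.chunk(world_size), v.chunk(world_size)
    scale = q.shape[-1] ** -0.5
    outputs = []
    for rank in range(world_size):                 # each "rank" owns one query block
        ring = [(rank + step) % world_size for step in range(world_size)]  # kv arrival order
        scores = torch.cat([q_blocks[rank] @ k_blocks[s].T * scale for s in ring], dim=-1)
        probs = torch.softmax(scores, dim=-1)      # softmax over the full sequence
        values = torch.cat([v_blocks[s] for s in ring], dim=0)
        outputs.append(probs @ values)
    return torch.cat(outputs, dim=0)

# Sanity check against ordinary full attention.
q, k, v = torch.randn(8, 16), torch.randn(8, 16), torch.randn(8, 16)
full = torch.softmax(q @ k.T * q.shape[-1] ** -0.5, dim=-1) @ v
assert torch.allclose(ring_attention_sketch(q, k, v, world_size=4), full, atol=1e-5)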

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 1 deletion

@@ -94,7 +94,7 @@
 - 'all': Compute the loss over all tokens.
 - 'ignore_empty_think': On top of `'default'`, skip loss computation for empty `'<think>\n\n</think>\n\n'` spans; see [this issue](https://github.com/modelscope/ms-swift/issues/4030) for details.
 - 'react', 'hermes', 'qwen': On top of `'default'`, raise the loss weight of the `tool_call` part to 2.
-- sequence_parallel_size: Sequence parallelism size, default is 1. Currently supports CPT/SFT/DPO/GRPO. For a training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/sequence_parallel.sh)
+- sequence_parallel_size: Sequence parallelism size, default is 1. Currently supports CPT/SFT/DPO/GRPO. For a training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/ulysses/sequence_parallel.sh)
 - response_prefix: Prefix string for the response, e.g. QwQ-32B sets response_prefix to `'<think>\n'`. Defaults to None and is set automatically according to the model.
   - Note: If you train the deepseek-r1/qwq models on a dataset that does not include `<think>...</think>`, additionally pass `--response_prefix ''` when running inference on the trained model.
 - template_backend: Template backend to use, either 'swift' or 'jinja'; 'swift' is the default. With jinja, transformers' `apply_chat_template` is used.

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 1 deletion

@@ -95,7 +95,7 @@ Hints:
 - 'all': Calculate the loss for all tokens.
 - 'ignore_empty_think': On top of 'default', ignore the loss calculation for empty `'<think>\n\n</think>\n\n'`. See [this issue](https://github.com/modelscope/ms-swift/issues/4030) for more details.
 - `'react'`, `'hermes'`, `'qwen'`: On top of `'default'`, set the loss weight of the `tool_call` part to 2.
-- sequence_parallel_size: Sequence parallelism size, default is 1. Currently supported in CPT/SFT/DPO/GRPO. For a training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/sequence_parallel.sh).
+- sequence_parallel_size: Sequence parallelism size, default is 1. Currently supported in CPT/SFT/DPO/GRPO. For a training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/ulysses/sequence_parallel.sh).
 - response_prefix: The prefix string for the response; for example, QwQ-32B sets response_prefix to `'<think>\n'`. The default is None, and it is set automatically according to the model.
   - Note: If you train the deepseek-r1/qwq model on a dataset that does not include `<think>...</think>`, additionally pass `--response_prefix ''` when running inference on the trained model.
 - template_backend: Selection of the template backend. Options are 'swift' and 'jinja', with 'swift' as the default. If using jinja, transformers' `apply_chat_template` is applied.
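To make sequence_parallel_size concrete, below is a minimal sketch (not ms-swift internals) of the simplest sharding scheme: each of the N sequence-parallel ranks processes a contiguous 1/N slice of every sequence, which is why max_length can grow roughly N-fold at a similar per-GPU activation footprint. Real implementations may instead use interleaved or zigzag layouts to balance causal-attention work across ranks.

import torch

def shard_sequence(input_ids: torch.Tensor, rank: int, sp_size: int) -> torch.Tensor:
    # input_ids: (batch, seq_len); seq_len must be divisible by sp_size
    assert input_ids.shape[1] % sp_size == 0, "pad or pack to a multiple of sp_size"
    return input_ids.chunk(sp_size, dim=1)[rank]   # this rank's contiguous slice

ids = torch.arange(16).reshape(1, 16)
print(shard_sequence(ids, rank=1, sp_size=4))      # tensor([[4, 5, 6, 7]])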
Lines changed: 31 additions & 0 deletions (new file)

# Env: 4 * A100
# Max Length: 65536
# GPU Memory: 4 * 38GiB, Training Speed 30s/it
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
SEQUENCE_PARALLEL_IMPL=ring_attention \
RING_HEAD_STRIDE=2 \
swift sft \
    --model Qwen/Qwen2.5-3B-Instruct \
    --train_type full \
    --dataset 'AI-ModelScope/LongAlpaca-12k' \
    --torch_dtype bfloat16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-5 \
    --gradient_accumulation_steps 8 \
    --packing true \
    --rope_scaling yarn \
    --max_length 65536 \
    --eval_steps 50 \
    --save_steps 50 \
    --logging_steps 5 \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 8 \
    --dataset_num_proc 8 \
    --save_total_limit 2 \
    --save_only_model true \
    --output_dir output/Qwen2.5-3B-Instruct \
    --deepspeed zero3 \
    --attn_impl flash_attn \
    --sequence_parallel_size 4
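Per the commit title, ring attention is supported for packing/padding_free only, and this script enables --packing true. A hedged sketch of what packing does conceptually, with greedy first-fit shown for illustration (ms-swift's actual packer may differ): short samples are concatenated into fixed-size bins so every training row reaches max_length and shards evenly across the ring.

def pack_samples(lengths, max_length):
    """Greedy first-fit packing of sample lengths into max_length bins."""
    bins, free = [], []                      # sample indices per bin, room left per bin
    for idx, n in enumerate(lengths):
        for b in range(len(bins)):
            if n <= free[b]:                 # first bin with enough room
                bins[b].append(idx)
                free[b] -= n
                break
        else:                                # no bin fits: open a new one
            bins.append([idx])
            free.append(max_length - n)
    return bins

print(pack_samples([30000, 20000, 40000, 10000], max_length=65536))
# [[0, 1, 3], [2]] -- three short samples share one 65536-token row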
Lines changed: 30 additions & 0 deletions (new file)

# Env: 4 * A100
# Max Length: 256000
# GPU Memory: 4 * 42GiB, Training Speed 43s/it
NPROC_PER_NODE=4 \
CELOSS_PARALLEL_SIZE=2048 \
SEQUENCE_PARALLEL_IMPL=ring_attention \
swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --train_type lora \
    --dataset 'AI-ModelScope/LongAlpaca-12k' \
    --torch_dtype bfloat16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-5 \
    --gradient_accumulation_steps 2 \
    --packing true \
    --rope_scaling yarn \
    --max_length 256000 \
    --eval_steps 200 \
    --save_steps 200 \
    --logging_steps 5 \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 8 \
    --dataset_num_proc 8 \
    --save_total_limit 2 \
    --use_liger_kernel true \
    --save_only_model true \
    --deepspeed zero3_offload \
    --attn_impl flash_attn \
    --sequence_parallel_size 4
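CELOSS_PARALLEL_SIZE=2048 plausibly sets a chunk size for the cross-entropy computation; that is an assumption, since the variable's exact semantics are not documented in this diff. The sketch below shows the general chunked-loss technique such a knob would control: computing logits 2048 tokens at a time avoids materializing a (256000, vocab_size) logit matrix all at once.

import torch
import torch.nn.functional as F

def chunked_ce_loss(hidden, lm_head_weight, labels, chunk=2048):
    # hidden: (num_tokens, hidden_dim); labels: (num_tokens,), -100 = ignore
    total, count = hidden.new_zeros(()), 0
    for i in range(0, hidden.shape[0], chunk):
        logits = hidden[i:i + chunk] @ lm_head_weight.T   # only a small logit slice lives at once
        tgt = labels[i:i + chunk]
        mask = tgt != -100
        if mask.any():
            total = total + F.cross_entropy(logits[mask], tgt[mask], reduction="sum")
            count += int(mask.sum())
    return total / max(count, 1)

# Tiny demo: 10 tokens, hidden size 8, vocab size 32, chunks of 4 tokens.
print(chunked_ce_loss(torch.randn(10, 8), torch.randn(32, 8), torch.randint(0, 32, (10,)), chunk=4))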
Lines changed: 29 additions & 0 deletions (new file)

# Env: 4 * A100
# GPU Memory: 4 * 52GiB, Training Speed 4s/it
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift rlhf \
    --rlhf_type dpo \
    --model Qwen/Qwen2.5-7B-Instruct \
    --train_type full \
    --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
    --torch_dtype bfloat16 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-5 \
    --gradient_accumulation_steps 4 \
    --eval_steps 100 \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps 5 \
    --max_length 8192 \
    --output_dir output \
    --warmup_ratio 0.05 \
    --save_only_model true \
    --dataloader_num_workers 4 \
    --dataset_num_proc 4 \
    --deepspeed zero3 \
    --attn_impl flash_attn \
    --padding_free true \
    --sequence_parallel_size 2
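This DPO script uses --padding_free true rather than --packing. A minimal sketch of the padding-free idea (illustrative only; in practice flash-attention's varlen kernels consume the boundary offsets): the samples in a batch are concatenated into a single row, and cumulative sequence lengths replace the attention mask, so no pad tokens are stored or computed.

import torch

def to_padding_free(samples):
    # samples: list of 1-D token tensors with different lengths
    flat = torch.cat(samples).unsqueeze(0)   # one (1, total_tokens) row, zero padding
    lens = torch.tensor([0] + [len(s) for s in samples])
    cu_seqlens = torch.cumsum(lens, dim=0, dtype=torch.int32)  # sample boundaries
    return flat, cu_seqlens

ids, cu = to_padding_free([torch.arange(3), torch.arange(5), torch.arange(2)])
print(ids.shape, cu.tolist())  # torch.Size([1, 10]) [0, 3, 8, 10]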
Lines changed: 45 additions & 0 deletions (new file)

NPROC_PER_NODE=4 \
PYTORCH_CUDA_ALLOC_CONF='' \
SEQUENCE_PARALLEL_IMPL=ring_attention \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-7B \
    --train_type full \
    --use_vllm true \
    --vllm_mode colocate \
    --vllm_gpu_memory_utilization 0.5 \
    --vllm_max_model_len 2048 \
    --vllm_tensor_parallel_size 4 \
    --dataset AI-MO/NuminaMath-TIR#5000 \
    --torch_dtype bfloat16 \
    --num_train_epochs 1 \
    --max_length 2048 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --eval_steps 1000 \
    --save_steps 1000 \
    --learning_rate 1e-6 \
    --save_total_limit 2 \
    --logging_steps 5 \
    --output_dir output \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 4 \
    --max_completion_length 1024 \
    --reward_funcs accuracy format \
    --num_generations 4 \
    --system examples/train/grpo/prompt.txt \
    --deepspeed zero3 \
    --temperature 1.0 \
    --top_p 1.0 \
    --top_k 80 \
    --attn_impl flash_attn \
    --log_completions true \
    --async_generate false \
    --offload_optimizer true \
    --offload_model true \
    --padding_free true \
    --sequence_parallel_size 4 \
    --gc_collect_after_offload true \
    --dataloader_drop_last true \
    --sleep_level 1
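For context on the GRPO flags above (--num_generations 4, --reward_funcs accuracy format), below is a hedged sketch of the group-relative advantage that defines GRPO, in its generic textbook form rather than ms-swift's exact code: the rewards for the num_generations completions of each prompt are normalized within that group.

import torch

def grpo_advantages(rewards: torch.Tensor, num_generations: int = 4, eps: float = 1e-4):
    # rewards: flat tensor of shape (num_prompts * num_generations,)
    groups = rewards.view(-1, num_generations)        # one row per prompt
    mean = groups.mean(dim=1, keepdim=True)
    std = groups.std(dim=1, keepdim=True)
    return ((groups - mean) / (std + eps)).flatten()  # group-normalized advantages

# Two prompts, four completions each: rewards above the group average get
# positive advantages, rewards below it get negative ones.
print(grpo_advantages(torch.tensor([1.0, 0.0, 1.0, 0.0, 0.5, 0.5, 1.0, 0.0])))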
