
Commit d74071a

[fix] remove useless param in doc; fix gpt2 qkv test;
1 parent: feca06e

File tree

3 files changed: +3 additions, −5 deletions


docs/source/en/features/zerobubble_pipeline_parallelism.md

Lines changed: 1 addition & 2 deletions
@@ -36,11 +36,10 @@ Build our model and Optimizer. We created a Llama with 8 Decoder-Layer. Then, in
 ```python
 # Global Param
 NUM_BATCH = 8
-NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 4
+NUM_TOK_PER_BATCH = 4
 NUM_LAYERS = 8
 HIDDEN_SIZE_PER_HEAD = 4
 NUM_HEADS = 4
-TOP_K = 1
 # Init Llama from huggingface
 configuration = LlamaConfig(
     hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
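
The removed `NUM_EXPERTS` and `TOP_K` constants appear to be mixture-of-experts leftovers that the dense Llama example never uses. For reference, the cleaned-up snippet reads roughly as follows; only the `hidden_size` keyword appears in the diff, so the remaining `LlamaConfig` arguments here are assumptions based on the standard transformers API:

```python
# A minimal sketch of the cleaned-up doc example. The LlamaConfig keywords
# beyond hidden_size are assumptions (the diff does not show them).
from transformers import LlamaConfig

# Global Param
NUM_BATCH = 8
NUM_TOK_PER_BATCH = 4
NUM_LAYERS = 8
HIDDEN_SIZE_PER_HEAD = 4
NUM_HEADS = 4

# Init Llama from huggingface
configuration = LlamaConfig(
    hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
    num_hidden_layers=NUM_LAYERS,      # assumed keyword, matching NUM_LAYERS = 8
    num_attention_heads=NUM_HEADS,     # assumed keyword, matching NUM_HEADS = 4
)
```

The zh-Hans page below receives the identical change.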

docs/source/zh-Hans/features/zerobubble_pipeline_parallelism.md

Lines changed: 1 addition & 2 deletions
@@ -37,11 +37,10 @@ colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port,
 ```python
 # Global Param
 NUM_BATCH = 8
-NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 4
+NUM_TOK_PER_BATCH = 4
 NUM_LAYERS = 8
 HIDDEN_SIZE_PER_HEAD = 4
 NUM_HEADS = 4
-TOP_K = 1
 # Init Llama from huggingface
 configuration = LlamaConfig(
     hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,

tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py

Lines changed: 1 addition & 1 deletion
@@ -188,7 +188,7 @@ def check_linear_conv_1d_with_weight_grad_store(lazy_init: bool, seq_parallel_mo
     assert_close(linear.weight.grad, linear_base.weight.grad)


-@parameterize("lazy_init", [False])
+@parameterize("lazy_init", [False, True])
 @parameterize("seq_parallel_mode", ["split_gather", None])
 def check_gpt2_qkv_fused_linear_1d(lazy_init: bool, seq_parallel_mode: bool):
     check_linear_conv_1d_col(lazy_init, seq_parallel_mode)
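
This widens the fused-QKV test to also cover lazy initialization. Stacked `parameterize` decorators (in ColossalAI, `colossalai.testing.parameterize`) re-run the test body over the cross-product of the argument lists, so the matrix here grows from 2 to 4 cases. A toy stand-in that sketches this behavior, not the library's actual implementation:

```python
from functools import wraps

def parameterize(name, values):
    """Toy stand-in: re-invoke the wrapped function once per value of `name`."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(**kwargs):
            for value in values:
                fn(**kwargs, **{name: value})
        return wrapper
    return decorator

@parameterize("lazy_init", [False, True])
@parameterize("seq_parallel_mode", ["split_gather", None])
def demo(lazy_init, seq_parallel_mode):
    print(lazy_init, seq_parallel_mode)

demo()  # runs 4 combinations: (False|True) x ("split_gather"|None)
```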
