|
554 | 554 | BASE_LLAMA3_70B_CONFIG, |
555 | 555 | num_gpus=8, |
556 | 556 | peft="lora", |
557 | | - # pipeline_model_parallel_size=4, |
558 | | - # virtual_pipeline_model_parallel_size=20, |
| 557 | + tensor_model_parallel_size=1, |
| 558 | + pipeline_model_parallel_size=1, |
| 559 | + context_parallel_size=1, |
559 | 560 | micro_batch_size=1, |
560 | | - global_batch_size=64, |
| 561 | + global_batch_size=32, |
561 | 562 | cuda_graph_impl="transformer_engine", |
562 | 563 | cuda_graph_scope="mlp", |
563 | 564 | ) |
564 | 565 |
|
565 | 566 | LLAMA3_70B_LORA_CONFIG_GB300_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_GB300 |
566 | 567 | LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_GB300 |
567 | 568 | LLAMA3_70B_LORA_CONFIG_GB300_FP8_MX_V1 = replace( |
568 | | - LLAMA3_70B_LORA_CONFIG_GB300_FP8_CS_V1, |
569 | | - pipeline_model_parallel_size=2, # PP=1 is OOM |
| 569 | + _LLAMA3_70B_LORA_CONFIG_GB300, |
| 570 | + pipeline_model_parallel_size=2, |
570 | 571 | ) |
571 | 572 |
|
572 | 573 |
|
573 | 574 | _LLAMA3_70B_LORA_CONFIG_GB200 = replace( |
574 | 575 | BASE_LLAMA3_70B_CONFIG, |
575 | 576 | num_gpus=8, |
576 | 577 | peft="lora", |
577 | | - pipeline_model_parallel_size=4, |
578 | | - virtual_pipeline_model_parallel_size=20, |
| 578 | + tensor_model_parallel_size=1, |
| 579 | + pipeline_model_parallel_size=1, |
| 580 | + context_parallel_size=1, |
579 | 581 | micro_batch_size=1, |
580 | 582 | global_batch_size=64, |
581 | 583 | cuda_graph_impl="transformer_engine", |
582 | 584 | cuda_graph_scope="mlp", |
583 | 585 | ) |
584 | 586 |
|
585 | 587 | LLAMA3_70B_LORA_CONFIG_GB200_BF16_V1 = _LLAMA3_70B_LORA_CONFIG_GB200 |
586 | | -LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_GB200 |
| 588 | +LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1 = replace( |
| 589 | + BASE_LLAMA3_70B_CONFIG, |
| 590 | + num_gpus=8, |
| 591 | + peft="lora", |
| 592 | + tensor_model_parallel_size=1, |
| 593 | + pipeline_model_parallel_size=2, |
| 594 | + context_parallel_size=1, |
| 595 | + micro_batch_size=1, |
| 596 | + global_batch_size=32, |
| 597 | + cuda_graph_impl="transformer_engine", |
| 598 | + cuda_graph_scope="mlp", |
| 599 | +) |
587 | 600 | LLAMA3_70B_LORA_CONFIG_GB200_FP8_MX_V1 = LLAMA3_70B_LORA_CONFIG_GB200_FP8_CS_V1 |
588 | 601 |
|
589 | 602 |
|
590 | 603 | _LLAMA3_70B_LORA_CONFIG_H100 = replace( |
591 | 604 | BASE_LLAMA3_70B_CONFIG, |
592 | 605 | num_gpus=8, |
593 | 606 | peft="lora", |
594 | | - tensor_model_parallel_size=2, |
| 607 | + tensor_model_parallel_size=1, |
595 | 608 | pipeline_model_parallel_size=4, |
| 609 | + context_parallel_size=1, |
596 | 610 | virtual_pipeline_model_parallel_size=20, |
597 | 611 | micro_batch_size=1, |
598 | 612 | global_batch_size=32, |
599 | 613 | ) |
600 | 614 |
|
601 | 615 | LLAMA3_70B_LORA_CONFIG_H100_BF16_V1 = replace( |
602 | 616 | _LLAMA3_70B_LORA_CONFIG_H100, |
603 | | - recompute_num_layers=2, |
| 617 | + recompute_num_layers=1, |
| 618 | +) |
| 619 | +LLAMA3_70B_LORA_CONFIG_H100_FP8_CS_V1 = replace( |
| 620 | + _LLAMA3_70B_LORA_CONFIG_H100, |
| 621 | + tensor_model_parallel_size=2, |
604 | 622 | ) |
605 | | -LLAMA3_70B_LORA_CONFIG_H100_FP8_CS_V1 = _LLAMA3_70B_LORA_CONFIG_H100 |
606 | 623 |
|
607 | 624 |
|
608 | 625 | __all__ = [ |
|
0 commit comments