|
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This file applies the PT-D parallelisms (except pipeline parallelism) and various
# training techniques (e.g. activation checkpointing and compile) to the Qwen3 model.

import torch
import torch.nn as nn

from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor import Replicate, Shard
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    parallelize_module,
    PrepareModuleInput,
    RowwiseParallel,
    SequenceParallel,
)

from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
from torchtitan.distributed import ParallelDims
from torchtitan.distributed.expert_parallel import NoParallel
from torchtitan.models.llama3.infra.parallelize import (
    apply_ac,
    apply_compile,
    apply_ddp,
    apply_fsdp,
)
from torchtitan.tools.logging import logger


def parallelize_qwen3(
    model: nn.Module,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
):
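    """
    Apply tensor parallelism, activation checkpointing, torch.compile, and data
    parallelism (FSDP/HSDP or DDP) to the Qwen3 model, in that order.
    """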

    world_mesh = parallel_dims.world_mesh
    assert (
        job_config.training.seq_len % parallel_dims.seq_len_divisor == 0
    ), f"""
    Sequence length {job_config.training.seq_len} must be divisible by the product of TP degree
    ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
    """
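    # Illustrative check with hypothetical degrees: tp=4 and cp=2 give a divisor of
    # 4 * (2 * 2) = 16, so seq_len=8192 passes while seq_len=8200 would fail.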
    if parallel_dims.tp_enabled:
        if (
            job_config.parallelism.enable_async_tensor_parallel
            and not job_config.training.compile
        ):
            raise RuntimeError("Async TP requires --training.compile")

        enable_float8_linear = "float8" in job_config.model.converters
        float8_is_rowwise = job_config.float8.recipe_name in (
            "rowwise",
            "rowwise_with_gw_hp",
        )

        # For now, float8 all-gather with TP is only supported for tensorwise
        # float8 scaling recipes. For rowwise recipes, we use regular TP and
        # all-gather happens in high precision.
        enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
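        # For example, converters containing "float8" together with a tensorwise recipe
        # enables float8 all-gather TP below; "rowwise" / "rowwise_with_gw_hp" recipes
        # keep regular TP with high-precision all-gathers.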

        apply_tp(
            model,
            world_mesh["tp"],
            loss_parallel=not job_config.parallelism.disable_loss_parallel,
            enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
            enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
        )

    if job_config.activation_checkpoint.mode != "none":
        apply_ac(model, job_config.activation_checkpoint)

    # turn on per-TransformerBlock compile after AC wrapping and before FSDP
    if job_config.training.compile:
        apply_compile(model)

    if parallel_dims.fsdp_enabled:
        # apply FSDP or HSDP, potentially with Context Parallel
        if parallel_dims.dp_replicate_enabled:
            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
        else:
            dp_mesh_dim_names = ("dp_shard_cp",)

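        # Hypothetical example: dp_replicate=2 with dp_shard=4 yields a 2D
        # ("dp_replicate", "dp_shard_cp") mesh, i.e. HSDP with 2-way replication over
        # 4-way sharding; dp_replicate=1 yields a 1D mesh and plain FSDP.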
        apply_fsdp(
            model,
            world_mesh[tuple(dp_mesh_dim_names)],
            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
            pp_enabled=parallel_dims.pp_enabled,
            cpu_offload=job_config.training.enable_cpu_offload,
            reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
        )

        if parallel_dims.dp_replicate_enabled:
            logger.info("Applied HSDP to the model")
        else:
            logger.info("Applied FSDP to the model")

        if parallel_dims.cp_enabled:
            logger.info("Applied Context Parallel to the model")

        if job_config.training.enable_cpu_offload:
            logger.info("Applied CPU Offloading to the model")
    elif parallel_dims.dp_replicate_enabled:
        if world_mesh.ndim > 1:
            raise RuntimeError("DDP has not supported > 1D parallelism")
        apply_ddp(
            model,
            world_mesh,
            enable_compile=job_config.training.compile,
            enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
        )

    return model


def apply_tp(
    model: nn.Module,
    tp_mesh: DeviceMesh,
    loss_parallel: bool,
    enable_float8_tensorwise_tp: bool,
    enable_async_tp: bool,
):
    """Apply tensor parallelism."""
    # 1. Parallelize the embedding and shard its outputs (which are the first
    # transformer block's inputs)
    # 2. Parallelize the root norm layer over the sequence dim
    # 3. Parallelize the final linear output layer
    parallelize_module(
        model,
        tp_mesh,
        {
            "tok_embeddings": RowwiseParallel(
                input_layouts=Replicate(),
                output_layouts=Shard(1),
            ),
            "norm": SequenceParallel(),
            "output": ColwiseParallel(
                input_layouts=Shard(1),
                output_layouts=Shard(-1) if loss_parallel else Replicate(),
                use_local_output=not loss_parallel,
            ),
        },
    )

    # Parallel styles used for transformer block linear weights and their
    # inputs may be different for float8 linears with tensorwise scaling.
    if enable_float8_tensorwise_tp:
        # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
        from torchao.float8.float8_tensor_parallel import (
            Float8ColwiseParallel,
            Float8RowwiseParallel,
            PrepareFloat8ModuleInput,
        )

        rowwise_parallel, colwise_parallel, prepare_module_input = (
            Float8RowwiseParallel,
            Float8ColwiseParallel,
            PrepareFloat8ModuleInput,
        )
    else:
        rowwise_parallel, colwise_parallel, prepare_module_input = (
            RowwiseParallel,
            ColwiseParallel,
            PrepareModuleInput,
        )

    # Apply tensor + sequence parallelism to every transformer block
    # NOTE: At the cost of model code change, we can accelerate Sequence Parallel
    # by folding (and unfolding) the batch dimension and the sequence dimension.
    # Examples can be found at https://github.com/pytorch/torchtitan/pull/437
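    # Per-block dataflow under TP + SP: the norms keep activations sequence-sharded
    # (Shard(1)); PrepareModuleInput all-gathers them to Replicate before attention and
    # feed_forward; wq/wk/wv and w1/w3 are column-sharded, while wo and w2 are row-sharded
    # with Shard(1) outputs, so the next SequenceParallel norm again sees sequence-sharded
    # activations.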
    for transformer_block in model.layers.values():
        layer_plan = {
            "attention_norm": SequenceParallel(),
            "attention": prepare_module_input(
                input_layouts=(Shard(1), Replicate()),
                desired_input_layouts=(Replicate(), Replicate()),
            ),
            "attention.wq": colwise_parallel(use_local_output=False),
            "attention.wk": colwise_parallel(use_local_output=False),
            "attention.wv": colwise_parallel(use_local_output=False),
            "attention.q_norm": NoParallel(use_local_output=False),
            "attention.k_norm": NoParallel(use_local_output=False),
            "attention.wo": rowwise_parallel(output_layouts=Shard(1)),
            "ffn_norm": SequenceParallel(),
            "feed_forward": prepare_module_input(
                input_layouts=(Shard(1),),
                desired_input_layouts=(Replicate(),),
            ),
            "feed_forward.w1": colwise_parallel(),
            "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)),
            "feed_forward.w3": colwise_parallel(),
        }

        parallelize_module(
            module=transformer_block,
            device_mesh=tp_mesh,
            parallelize_plan=layer_plan,
        )

    if enable_async_tp:
        from torch.distributed._symmetric_memory import enable_symm_mem_for_group

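        # Async TP (assumed behavior of these knobs): inductor micro-pipelines TP
        # collectives with the neighboring matmuls, backed by symmetric memory on the
        # TP process group, which is why compile is required above.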
        torch._inductor.config._micro_pipeline_tp = True
        enable_symm_mem_for_group(tp_mesh.get_group().group_name)

    logger.info(
        f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}{'Async ' if enable_async_tp else ''}"
        "Tensor Parallelism to the model"
    )
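

# Minimal usage sketch (assumption: `model`, `parallel_dims`, and `job_config` are built
# by the torchtitan trainer from the TOML/CLI config; their construction is omitted here):
#
#   model = parallelize_qwen3(model, parallel_dims, job_config)
#
# TP (if enabled) is applied first, then activation checkpointing and compile, and
# finally FSDP/HSDP (or DDP) wraps the result.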