Skip to content

Commit 5b0fb7e

Browse files
committed
Merge branch 'main' into qwenvl2_5_multi_spec
2 parents bba6252 + 652351b commit 5b0fb7e

File tree

120 files changed

+7801
-2005
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

120 files changed

+7801
-2005
lines changed

Dockerfile

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ FROM docker-registry.qualcomm.com/library/ubuntu:20.04
77
RUN apt-get update && apt-get install -y \
88
git \
99
tmux \
10-
python3.10 \
11-
python3.10-venv \
10+
python3.12 \
11+
python3.12-venv \
1212
python3-pip
1313

1414
# pip recognizes this variable
@@ -24,7 +24,7 @@ RUN mkdir -p /app/qefficient-library
2424
COPY . /app/qefficient-library
2525

2626
# Create Virtual Env for the docker image
27-
RUN python3.10 -m venv /app/llm_env
27+
RUN python3.12 -m venv /app/llm_env
2828
RUN . /app/llm_env/bin/activate
2929
WORKDIR /app/qefficient-library
3030

@@ -33,7 +33,7 @@ WORKDIR /app/qefficient-library
3333
RUN pip install torch==2.0.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu --no-deps
3434
RUN pip install datasets==2.17.0 fsspec==2023.10.0 multidict==6.0.5 sentencepiece --no-deps
3535

36-
RUN python3.10 -m pip install .
36+
RUN python3.12 -m pip install .
3737
WORKDIR /app/qefficient-library
3838

3939
# Set the environment variable for the model card name and token ID
@@ -45,7 +45,7 @@ ENV TOKEN_ID = ""
4545
# Print a success message
4646
CMD ["echo", "qefficient-transformers repository cloned and setup installed inside Docker image."]
4747
CMD ["echo", "Starting the Model Download and Export to Onnx Stage for QEff."]
48-
CMD python3.10 -m QEfficient.cloud.export --model-name "$MODEL_NAME"
48+
CMD python3.12 -m QEfficient.cloud.export --model-name "$MODEL_NAME"
4949

5050
# Example usage:
5151
# docker build -t qefficient-library .
@@ -55,4 +55,4 @@ CMD python3.10 -m QEfficient.cloud.export --model-name "$MODEL_NAME"
5555
# 2. For smaller models, 32GiB RAM is sufficient, but for larger LLMs we require good CPU/RAM (a 7B-context model would require at least 64GiB).
5656
# 3. The exact minimum system configurations are tough to decide, since it's all a function of model parameters.
5757

58-
# docker run -e MODEL_NAME=gpt2 -e TOKEN_ID=<your-token-id> qefficient-library
58+
# docker run -e MODEL_NAME=gpt2 -e TOKEN_ID=<your-token-id> qefficient-library

QEfficient/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
QEFFAutoModelForCausalLM,
2525
QEFFAutoModelForCTC,
2626
QEFFAutoModelForImageTextToText,
27+
QEFFAutoModelForSequenceClassification,
2728
QEFFAutoModelForSpeechSeq2Seq,
2829
QEFFCommonLoader,
2930
)
@@ -53,6 +54,7 @@
5354
"QEFFAutoModelForCTC",
5455
"QEffAutoPeftModelForCausalLM",
5556
"QEFFAutoModelForImageTextToText",
57+
"QEFFAutoModelForSequenceClassification",
5658
"QEFFAutoModelForSpeechSeq2Seq",
5759
"QEFFCommonLoader",
5860
"QEffFluxPipeline",
@@ -61,7 +63,7 @@
6163

6264

6365
# Conditionally import QAIC-related modules if the SDK is installed
64-
__version__ = "0.0.1.dev0"
66+
__version__ = "1.22.0.dev0"
6567

6668

6769
def check_qaic_sdk():

QEfficient/base/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,6 @@
1111
QEFFAutoModelForCausalLM,
1212
QEFFAutoModelForCTC,
1313
QEFFAutoModelForImageTextToText,
14+
QEFFAutoModelForSequenceClassification,
1415
QEFFAutoModelForSpeechSeq2Seq,
1516
)

QEfficient/base/modeling_qeff.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def compile(self, *args, **kwargs) -> Path:
180180
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
181181
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
182182
183-
for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
183+
for QAIC compilation path, any flag that is supported by ``qaic-compile`` can be passed. Params are converted to flags as below:
184184
185185
- aic_num_cores=16 -> -aic-num-cores=16
186186
- convert_to_fp16=True -> -convert-to-fp16
@@ -369,7 +369,7 @@ def _compile(
369369
**compiler_options,
370370
) -> str:
371371
"""
372-
Interface for qaic-exec compiler
372+
Interface for qaic-compile compiler
373373
374374
Args:
375375
:onnx_path (str): Onnx file to compile
@@ -382,7 +382,7 @@ def _compile(
382382
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
383383
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
384384
:compiler_options: Pass any compiler option as input.
385-
Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
385+
Any flag that is supported by `qaic-compile` can be passed. Params are converted to flags as below:
386386
387387
- aic_num_cores=16 -> -aic-num-cores=16
388388
- convert_to_fp16=True -> -convert-to-fp16
@@ -438,23 +438,10 @@ def _compile(
438438
+ [f"-m={onnx_path}"]
439439
)
440440

441-
for key, value in compiler_options.items():
442-
option = "-" + key.replace("_", "-")
443-
if isinstance(value, bool):
444-
if value:
445-
command.append(option)
446-
continue
447-
command.append(f"{option}={value}")
448-
449-
if use_onnx_subfunctions:
450-
logger.info("Using ONNX subfunctions for compilation.")
451-
command.append("-sub-functions")
452-
453441
# MDP partition config: prioritize dump over load
454442
mdp_dump_json_path = compiler_options.pop("mdp_dump_partition_config", None)
455443
mdp_ts_json_path = compiler_options.pop("mdp_load_partition_config", None)
456444
mdp_ts_json = None
457-
user_provided_load_config = False
458445

459446
if mdp_dump_json_path:
460447
if mdp_ts_json_path:
@@ -465,12 +452,26 @@ def _compile(
465452
elif mdp_ts_json_path:
466453
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
467454
mdp_ts_json = load_json(str(mdp_ts_json_path))
468-
user_provided_load_config = True
469455
elif mdp_ts_num_devices > 1:
470456
# Generate mdp config only if neither dump nor load is provided and num_devices > 1
471457
mdp_ts_json = generate_mdp_partition_config(
472458
mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES)
473459
)
460+
mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
461+
create_json(str(mdp_ts_json_path), mdp_ts_json)
462+
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
463+
464+
for key, value in compiler_options.items():
465+
option = "-" + key.replace("_", "-")
466+
if isinstance(value, bool):
467+
if value:
468+
command.append(option)
469+
continue
470+
command.append(f"{option}={value}")
471+
472+
if use_onnx_subfunctions:
473+
logger.info("Using ONNX subfunctions for compilation.")
474+
command.append("-sub-functions")
474475

475476
compile_hash_params = {
476477
"command": command,
@@ -495,10 +496,6 @@ def _compile(
495496
shutil.rmtree(qpc_path)
496497

497498
# Write the generated MDP partition config file (not if user provided it)
498-
if mdp_ts_json is not None and not user_provided_load_config:
499-
mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
500-
create_json(str(mdp_ts_json_path), mdp_ts_json)
501-
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
502499

503500
# Write specializations.json file
504501
if specializations is not None:

QEfficient/base/pytorch_transforms.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,33 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
3232
raise NotImplementedError("Use subclasses for Pytorch transform")
3333

3434

35+
class ProxyModuleMappingTransform(PytorchTransform):
36+
"""
37+
Replaces the PyTorch modules based on the _module_mapping class variable.
38+
"""
39+
40+
_module_mapping: Dict[Type[nn.Module], Type[nn.Module]]
41+
42+
@classmethod
43+
def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
44+
transformed = False
45+
for name, module in model.named_modules():
46+
for base_type, repl_type in cls._module_mapping.items():
47+
if isinstance(module, base_type):
48+
if base_type is nn.Linear:
49+
short_name = name.split(".")[-1] if name else ""
50+
if short_name != "lm_head":
51+
continue
52+
# Perform in-place class replacement (preserve parameters/state)
53+
try:
54+
module.__class__ = repl_type
55+
transformed = True
56+
except Exception as e:
57+
logger.warning(f"Failed to replace module {name} ({base_type}) -> {repl_type}: {e}")
58+
59+
return model, transformed
60+
61+
3562
class ModuleMappingTransform(PytorchTransform):
3663
"""
3764
Replaces the PyTorch modules based on the _module_mapping class variable.
@@ -152,10 +179,16 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
152179
# ---- build the textual prefix once per layer ----------
153180
if is_gpt_oss:
154181
prefix = f"model.layers.{layer_idx}.mlp.experts."
155-
experts = model_tmp.model.layers[layer_idx].mlp.experts
182+
# experts = model_tmp.model.layers[layer_idx].mlp.experts
183+
ff = model_tmp.model.layers[layer_idx].mlp
156184
else:
157185
prefix = f"model.layers.{layer_idx}.feed_forward.experts."
158-
experts = model_tmp.model.layers[layer_idx].feed_forward.experts
186+
# experts = model_tmp.model.layers[layer_idx].feed_forward.experts
187+
ff = model_tmp.model.layers[layer_idx].feed_forward
188+
189+
if not hasattr(ff, "experts"):
190+
continue
191+
experts = ff.experts
159192

160193
fused_key = prefix + "gate_up_proj"
161194
gate_key = prefix + "gate_proj"

QEfficient/cloud/finetune.py

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
)
2929
from QEfficient.finetune.utils.dataset_utils import get_dataloader, get_longest_seq_length
3030
from QEfficient.finetune.utils.device_map import get_device_map
31-
from QEfficient.finetune.utils.helper import Task_Mode, get_world_size
31+
from QEfficient.finetune.utils.helper import Task_Mode, get_local_rank, get_local_world_size, get_rank, get_world_size
3232
from QEfficient.finetune.utils.logging_utils import logger
3333
from QEfficient.finetune.utils.parser import get_finetune_parser
3434
from QEfficient.finetune.utils.train_utils import print_model_size, print_trainable_parameters, train
@@ -52,10 +52,8 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
5252
"""
5353
Initialize the distributed training environment if Distributed Data Parallel (DDP) is enabled.
5454
55-
This function configures the PyTorch distributed backend based on the device type
56-
and initializes the process group. It also validates device availability and
57-
pipeline parallelism settings.
58-
55+
Supports single-node and multi-node training launched via torchrun
56+
(uses WORLD_SIZE, RANK, LOCAL_RANK, LOCAL_WORLD_SIZE environment variables).
5957
Parameters
6058
----------
6159
train_config : TrainConfig
@@ -67,32 +65,57 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
6765
If the number of required devices exceeds the total available devices.
6866
If pipeline parallelism (`num_pp_stages`) is enabled but set to 1.
6967
If DDP is enabled with a CPU device or with a specific device index (DDP requires device type only).
70-
7168
Notes
7269
-----
7370
- If `train_config.enable_ddp` is False, this function performs no action.
7471
- Sets the appropriate device for each process in a distributed setup.
7572
"""
7673

7774
torch_device = torch.device(train_config.device)
78-
num_available_devices = getattr(torch, torch_device.type).device_count()
79-
assert get_world_size() * train_config.num_pp_stages <= num_available_devices, (
80-
"Number of devices required should be less than or equal to total available devices."
81-
)
75+
76+
# Validate pipeline parallelism settings
8277
if train_config.enable_pp:
8378
assert train_config.num_pp_stages > 1, (
8479
f"For pipeline parallelism, num_pp_stages should be greater than 1. Got {train_config.num_pp_stages}"
8580
)
8681

82+
# If DDP is disabled, nothing to initialize here
8783
if not train_config.enable_ddp:
84+
# Non-DDP path: allow explicit device index, just set it if present
85+
if torch_device.type != "cpu" and torch_device.index is not None:
86+
getattr(torch, torch_device.type).set_device(torch_device.index)
8887
return
8988

89+
# ---- DDP path (single- or multi-node) ----
9090
assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
91-
assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
91+
assert torch_device.index is None, f"DDP requires only device type (qaic/cuda), got: {torch_device}"
92+
93+
# Torchrun-provided env vars
94+
world_size = get_world_size()
95+
rank = get_rank()
96+
local_rank = get_local_rank()
97+
local_world_size = get_local_world_size()
98+
99+
# Per-node device validation
100+
num_available_devices = getattr(torch, torch_device.type).device_count()
101+
assert local_world_size * train_config.num_pp_stages <= num_available_devices, (
102+
"Number of devices required per node (LOCAL_WORLD_SIZE * num_pp_stages) should be <= locally available devices."
103+
)
104+
92105
dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
93-
dist.init_process_group(backend=dist_backend_map[torch_device.type])
106+
dist.init_process_group(dist_backend_map[torch_device.type], rank=rank, world_size=world_size)
107+
108+
# Set the base device index for this process on this node
109+
# For PP: each process controls num_pp_stages devices starting from base_device_index
110+
base_device_index = local_rank * train_config.num_pp_stages
94111
# from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
95-
getattr(torch, torch_device.type).set_device(dist.get_rank() * train_config.num_pp_stages)
112+
getattr(torch, torch_device.type).set_device(base_device_index)
113+
114+
# persist rank info in the config
115+
train_config.rank = rank
116+
train_config.local_rank = local_rank
117+
train_config.world_size = world_size
118+
train_config.local_world_size = local_world_size
96119

97120

98121
def setup_seeds(seed: int) -> None:
@@ -362,14 +385,26 @@ def main(**kwargs) -> None:
362385
f"passed context length is {train_config.context_length} and overall model's context length is "
363386
f"{model.config.max_position_embeddings}"
364387
)
388+
389+
# Figure out the concrete device for this process
390+
torch_device = torch.device(train_config.device)
391+
if train_config.enable_ddp and torch_device.type != "cpu":
392+
# setup_distributed_training has already set the current device based on LOCAL_RANK
393+
current_idx = getattr(torch, torch_device.type).current_device()
394+
device = torch.device(torch_device.type, current_idx)
395+
else:
396+
device = torch_device
397+
365398
if not train_config.enable_pp:
366-
model.to(train_config.device)
399+
model.to(device)
400+
367401
optimizer = optim.AdamW(
368402
model.parameters(),
369403
lr=train_config.lr,
370404
weight_decay=train_config.weight_decay,
371405
)
372406
scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)
407+
373408
if train_config.enable_ddp:
374409
ignore_names = set()
375410
for name, param in model.named_parameters():
@@ -378,6 +413,7 @@ def main(**kwargs) -> None:
378413
# Adding params in ignore list will enforce DDP to ignore them during synchronization,
379414
# which will further reduce the tensor exchange across devices.
380415
torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(model, ignore_names)
416+
381417
model = nn.parallel.DistributedDataParallel(model)
382418

383419
results = train(

0 commit comments

Comments
 (0)