Skip to content

Commit 5b0fb7e

Browse files
committed
Merge branch 'main' into qwenvl2_5_multi_spec
2 parents bba6252 + 652351b commit 5b0fb7e

File tree

120 files changed

+7801
-2005
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

120 files changed

+7801
-2005
lines changed

Dockerfile

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ FROM docker-registry.qualcomm.com/library/ubuntu:20.04
77
RUN apt-get update && apt-get install -y \
88
git \
99
tmux \
10-
python3.10 \
11-
python3.10-venv \
10+
python3.12 \
11+
python3.12-venv \
1212
python3-pip
1313

1414
# pip recognizes this variable
@@ -24,7 +24,7 @@ RUN mkdir -p /app/qefficient-library
2424
COPY . /app/qefficient-library
2525

2626
# Create Virtual Env for the docker image
27-
RUN python3.10 -m venv /app/llm_env
27+
RUN python3.12 -m venv /app/llm_env
2828
RUN . /app/llm_env/bin/activate
2929
WORKDIR /app/qefficient-library
3030

@@ -33,7 +33,7 @@ WORKDIR /app/qefficient-library
3333
RUN pip install torch==2.0.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu --no-deps
3434
RUN pip install datasets==2.17.0 fsspec==2023.10.0 multidict==6.0.5 sentencepiece --no-deps
3535

36-
RUN python3.10 -m pip install .
36+
RUN python3.12 -m pip install .
3737
WORKDIR /app/qefficient-library
3838

3939
# Set the environment variable for the model card name and token ID
@@ -45,7 +45,7 @@ ENV TOKEN_ID = ""
4545
# Print a success message
4646
CMD ["echo", "qefficient-transformers repository cloned and setup installed inside Docker image."]
4747
CMD ["echo", "Starting the Model Download and Export to Onnx Stage for QEff."]
48-
CMD python3.10 -m QEfficient.cloud.export --model-name "$MODEL_NAME"
48+
CMD python3.12 -m QEfficient.cloud.export --model-name "$MODEL_NAME"
4949

5050
# Example usage:
5151
# docker build -t qefficient-library .
@@ -55,4 +55,4 @@ CMD python3.10 -m QEfficient.cloud.export --model-name "$MODEL_NAME"
5555
# 2. For smaller models, 32GiB RAM is sufficient, but for larger LLMs we require good CPU/RAM (a 7B-context model would require at least 64GiB).
5656
# 3. The exact minimum system configurations are tough to decide, since it's all a function of model parameters.
5757

58-
# docker run -e MODEL_NAME=gpt2 -e TOKEN_ID=<your-token-id> qefficient-library
58+
# docker run -e MODEL_NAME=gpt2 -e TOKEN_ID=<your-token-id> qefficient-library

QEfficient/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
QEFFAutoModelForCausalLM,
2525
QEFFAutoModelForCTC,
2626
QEFFAutoModelForImageTextToText,
27+
QEFFAutoModelForSequenceClassification,
2728
QEFFAutoModelForSpeechSeq2Seq,
2829
QEFFCommonLoader,
2930
)
@@ -53,6 +54,7 @@
5354
"QEFFAutoModelForCTC",
5455
"QEffAutoPeftModelForCausalLM",
5556
"QEFFAutoModelForImageTextToText",
57+
"QEFFAutoModelForSequenceClassification",
5658
"QEFFAutoModelForSpeechSeq2Seq",
5759
"QEFFCommonLoader",
5860
"QEffFluxPipeline",
@@ -61,7 +63,7 @@
6163

6264

6365
# Conditionally import QAIC-related modules if the SDK is installed
64-
__version__ = "0.0.1.dev0"
66+
__version__ = "1.22.0.dev0"
6567

6668

6769
def check_qaic_sdk():

QEfficient/base/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,6 @@
1111
QEFFAutoModelForCausalLM,
1212
QEFFAutoModelForCTC,
1313
QEFFAutoModelForImageTextToText,
14+
QEFFAutoModelForSequenceClassification,
1415
QEFFAutoModelForSpeechSeq2Seq,
1516
)

QEfficient/base/modeling_qeff.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def compile(self, *args, **kwargs) -> Path:
180180
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
181181
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
182182
183-
for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
183+
for QAIC compilation path, any flag that is supported by ``qaic-compile`` can be passed. Params are converted to flags as below:
184184
185185
- aic_num_cores=16 -> -aic-num-cores=16
186186
- convert_to_fp16=True -> -convert-to-fp16
@@ -369,7 +369,7 @@ def _compile(
369369
**compiler_options,
370370
) -> str:
371371
"""
372-
Interface for qaic-exec compiler
372+
Interface for qaic-compile compiler
373373
374374
Args:
375375
:onnx_path (str): Onnx file to compile
@@ -382,7 +382,7 @@ def _compile(
382382
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
383383
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
384384
:compiler_options: Pass any compiler option as input.
385-
Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
385+
Any flag that is supported by `qaic-compile` can be passed. Params are converted to flags as below:
386386
387387
- aic_num_cores=16 -> -aic-num-cores=16
388388
- convert_to_fp16=True -> -convert-to-fp16
@@ -438,23 +438,10 @@ def _compile(
438438
+ [f"-m={onnx_path}"]
439439
)
440440

441-
for key, value in compiler_options.items():
442-
option = "-" + key.replace("_", "-")
443-
if isinstance(value, bool):
444-
if value:
445-
command.append(option)
446-
continue
447-
command.append(f"{option}={value}")
448-
449-
if use_onnx_subfunctions:
450-
logger.info("Using ONNX subfunctions for compilation.")
451-
command.append("-sub-functions")
452-
453441
# MDP partition config: prioritize dump over load
454442
mdp_dump_json_path = compiler_options.pop("mdp_dump_partition_config", None)
455443
mdp_ts_json_path = compiler_options.pop("mdp_load_partition_config", None)
456444
mdp_ts_json = None
457-
user_provided_load_config = False
458445

459446
if mdp_dump_json_path:
460447
if mdp_ts_json_path:
@@ -465,12 +452,26 @@ def _compile(
465452
elif mdp_ts_json_path:
466453
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
467454
mdp_ts_json = load_json(str(mdp_ts_json_path))
468-
user_provided_load_config = True
469455
elif mdp_ts_num_devices > 1:
470456
# Generate mdp config only if neither dump nor load is provided and num_devices > 1
471457
mdp_ts_json = generate_mdp_partition_config(
472458
mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES)
473459
)
460+
mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
461+
create_json(str(mdp_ts_json_path), mdp_ts_json)
462+
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
463+
464+
for key, value in compiler_options.items():
465+
option = "-" + key.replace("_", "-")
466+
if isinstance(value, bool):
467+
if value:
468+
command.append(option)
469+
continue
470+
command.append(f"{option}={value}")
471+
472+
if use_onnx_subfunctions:
473+
logger.info("Using ONNX subfunctions for compilation.")
474+
command.append("-sub-functions")
474475

475476
compile_hash_params = {
476477
"command": command,
@@ -495,10 +496,6 @@ def _compile(
495496
shutil.rmtree(qpc_path)
496497

497498
# Write the generated MDP partition config file (not if user provided it)
498-
if mdp_ts_json is not None and not user_provided_load_config:
499-
mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
500-
create_json(str(mdp_ts_json_path), mdp_ts_json)
501-
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
502499

503500
# Write specializations.json file
504501
if specializations is not None:

QEfficient/base/pytorch_transforms.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,33 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
3232
raise NotImplementedError("Use subclasses for Pytorch transform")
3333

3434

35+
class ProxyModuleMappingTransform(PytorchTransform):
36+
"""
37+
Replaces the PyTorch modules based on the _module_mapping class variable.
38+
"""
39+
40+
_module_mapping: Dict[Type[nn.Module], Type[nn.Module]]
41+
42+
@classmethod
43+
def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
44+
transformed = False
45+
for name, module in model.named_modules():
46+
for base_type, repl_type in cls._module_mapping.items():
47+
if isinstance(module, base_type):
48+
if base_type is nn.Linear:
49+
short_name = name.split(".")[-1] if name else ""
50+
if short_name != "lm_head":
51+
continue
52+
# Perform in-place class replacement (preserve parameters/state)
53+
try:
54+
module.__class__ = repl_type
55+
transformed = True
56+
except Exception as e:
57+
logger.warning(f"Failed to replace module {name} ({base_type}) -> {repl_type}: {e}")
58+
59+
return model, transformed
60+
61+
3562
class ModuleMappingTransform(PytorchTransform):
3663
"""
3764
Replaces the PyTorch modules based on the _module_mapping class variable.
@@ -152,10 +179,16 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
152179
# ---- build the textual prefix once per layer ----------
153180
if is_gpt_oss:
154181
prefix = f"model.layers.{layer_idx}.mlp.experts."
155-
experts = model_tmp.model.layers[layer_idx].mlp.experts
182+
# experts = model_tmp.model.layers[layer_idx].mlp.experts
183+
ff = model_tmp.model.layers[layer_idx].mlp
156184
else:
157185
prefix = f"model.layers.{layer_idx}.feed_forward.experts."
158-
experts = model_tmp.model.layers[layer_idx].feed_forward.experts
186+
# experts = model_tmp.model.layers[layer_idx].feed_forward.experts
187+
ff = model_tmp.model.layers[layer_idx].feed_forward
188+
189+
if not hasattr(ff, "experts"):
190+
continue
191+
experts = ff.experts
159192

160193
fused_key = prefix + "gate_up_proj"
161194
gate_key = prefix + "gate_proj"

QEfficient/cloud/finetune.py

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
)
2929
from QEfficient.finetune.utils.dataset_utils import get_dataloader, get_longest_seq_length
3030
from QEfficient.finetune.utils.device_map import get_device_map
31-
from QEfficient.finetune.utils.helper import Task_Mode, get_world_size
31+
from QEfficient.finetune.utils.helper import Task_Mode, get_local_rank, get_local_world_size, get_rank, get_world_size
3232
from QEfficient.finetune.utils.logging_utils import logger
3333
from QEfficient.finetune.utils.parser import get_finetune_parser
3434
from QEfficient.finetune.utils.train_utils import print_model_size, print_trainable_parameters, train
@@ -52,10 +52,8 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
5252
"""
5353
Initialize the distributed training environment if Distributed Data Parallel (DDP) is enabled.
5454
55-
This function configures the PyTorch distributed backend based on the device type
56-
and initializes the process group. It also validates device availability and
57-
pipeline parallelism settings.
58-
55+
Supports single-node and multi-node training launched via torchrun
56+
(uses WORLD_SIZE, RANK, LOCAL_RANK, LOCAL_WORLD_SIZE environment variables).
5957
Parameters
6058
----------
6159
train_config : TrainConfig
@@ -67,32 +65,57 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
6765
If the number of required devices exceeds the total available devices.
6866
If pipeline parallelism (`num_pp_stages`) is enabled but set to 1.
6967
If DDP is enabled with a CPU device or with a specific device index (DDP requires device type only).
70-
7168
Notes
7269
-----
7370
- If `train_config.enable_ddp` is False, this function performs no action.
7471
- Sets the appropriate device for each process in a distributed setup.
7572
"""
7673

7774
torch_device = torch.device(train_config.device)
78-
num_available_devices = getattr(torch, torch_device.type).device_count()
79-
assert get_world_size() * train_config.num_pp_stages <= num_available_devices, (
80-
"Number of devices required should be less than or equal to total available devices."
81-
)
75+
76+
# Validate pipeline parallelism settings
8277
if train_config.enable_pp:
8378
assert train_config.num_pp_stages > 1, (
8479
f"For pipeline parallelism, num_pp_stages should be greater than 1. Got {train_config.num_pp_stages}"
8580
)
8681

82+
# If DDP is disabled, nothing to initialize here
8783
if not train_config.enable_ddp:
84+
# Non-DDP path: allow explicit device index, just set it if present
85+
if torch_device.type != "cpu" and torch_device.index is not None:
86+
getattr(torch, torch_device.type).set_device(torch_device.index)
8887
return
8988

89+
# ---- DDP path (single- or multi-node) ----
9090
assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
91-
assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
91+
assert torch_device.index is None, f"DDP requires only device type (qaic/cuda), got: {torch_device}"
92+
93+
# Torchrun-provided env vars
94+
world_size = get_world_size()
95+
rank = get_rank()
96+
local_rank = get_local_rank()
97+
local_world_size = get_local_world_size()
98+
99+
# Per-node device validation
100+
num_available_devices = getattr(torch, torch_device.type).device_count()
101+
assert local_world_size * train_config.num_pp_stages <= num_available_devices, (
102+
"Number of devices required per node (LOCAL_WORLD_SIZE * num_pp_stages) should be <= locally available devices."
103+
)
104+
92105
dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
93-
dist.init_process_group(backend=dist_backend_map[torch_device.type])
106+
dist.init_process_group(dist_backend_map[torch_device.type], rank=rank, world_size=world_size)
107+
108+
# Set the base device index for this process on this node
109+
# For PP: each process controls num_pp_stages devices starting from base_device_index
110+
base_device_index = local_rank * train_config.num_pp_stages
94111
# from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
95-
getattr(torch, torch_device.type).set_device(dist.get_rank() * train_config.num_pp_stages)
112+
getattr(torch, torch_device.type).set_device(base_device_index)
113+
114+
# persist rank info in the config
115+
train_config.rank = rank
116+
train_config.local_rank = local_rank
117+
train_config.world_size = world_size
118+
train_config.local_world_size = local_world_size
96119

97120

98121
def setup_seeds(seed: int) -> None:
@@ -362,14 +385,26 @@ def main(**kwargs) -> None:
362385
f"passed context length is {train_config.context_length} and overall model's context length is "
363386
f"{model.config.max_position_embeddings}"
364387
)
388+
389+
# Figure out the concrete device for this process
390+
torch_device = torch.device(train_config.device)
391+
if train_config.enable_ddp and torch_device.type != "cpu":
392+
# setup_distributed_training has already set the current device based on LOCAL_RANK
393+
current_idx = getattr(torch, torch_device.type).current_device()
394+
device = torch.device(torch_device.type, current_idx)
395+
else:
396+
device = torch_device
397+
365398
if not train_config.enable_pp:
366-
model.to(train_config.device)
399+
model.to(device)
400+
367401
optimizer = optim.AdamW(
368402
model.parameters(),
369403
lr=train_config.lr,
370404
weight_decay=train_config.weight_decay,
371405
)
372406
scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)
407+
373408
if train_config.enable_ddp:
374409
ignore_names = set()
375410
for name, param in model.named_parameters():
@@ -378,6 +413,7 @@ def main(**kwargs) -> None:
378413
# Adding params in ignore list will enforce DDP to ignore them during synchronization,
379414
# which will further reduce the tensor exchange across devices.
380415
torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(model, ignore_names)
416+
381417
model = nn.parallel.DistributedDataParallel(model)
382418

383419
results = train(

0 commit comments

Comments
 (0)