Merged
Changes from 2 commits
README.md: 2 changes (1 addition & 1 deletion)
@@ -115,7 +115,7 @@ def main():
     model = NeuronModelForCausalLM.from_pretrained(
         model_id,
         training_args.trn_config,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         attn_implementation="flash_attention_2", # Enable flash attention
     )

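Not part of the diff: the rename mirrors upstream transformers, which accepts `dtype=` in place of the deprecated `torch_dtype=` keyword in recent releases (4.56.0+, as noted later in this PR). A minimal runnable sketch with an illustrative tiny model id:

```python
import torch
from transformers import AutoModelForCausalLM

# `dtype=` is the new name of the loading keyword; older releases used `torch_dtype=`.
model = AutoModelForCausalLM.from_pretrained(
    "sshleifer/tiny-gpt2",  # illustrative model id, not taken from the PR
    dtype=torch.bfloat16,
)
print(next(model.parameters()).dtype)  # expected: torch.bfloat16
```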
benchmark/text-generation/performance/llama3.3-70b.py: 2 changes (1 addition & 1 deletion)
@@ -32,7 +32,7 @@ def main():
         assert neuron_config.sequence_length == seq_length, (
             f"Model {model_name} is not configured for sequence length {seq_length}."
         )
-        assert neuron_config.torch_dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16."
+        assert neuron_config.dtype == torch.bfloat16, f"Model {model_name} is not configured for bf16."
         model = NeuronModelForCausalLM.from_pretrained(model_id)
     except Exception:
         model = NeuronModelForCausalLM.from_pretrained(
docs/source/contribute/contribute_for_training.mdx: 24 changes (12 additions & 12 deletions)
@@ -80,7 +80,7 @@ class YourModelEmbeddings(nn.Module):
         self.embed_tokens = ParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
         )
 ```
@@ -105,7 +105,7 @@ class YourModelMLP(nn.Module, CustomModule):
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         self.down_proj = RowParallelLinear(
@@ -114,7 +114,7 @@ class YourModelMLP(nn.Module, CustomModule):
             bias=False,
             input_is_parallel=True,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # Define transformation specs
@@ -151,23 +151,23 @@ class YourModelAttention(nn.Module, CustomModule):
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
         self.k_proj = ColumnParallelLinear(
             config.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
         self.v_proj = ColumnParallelLinear(
             config.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         self.o_proj = RowParallelLinear(
@@ -176,7 +176,7 @@ class YourModelAttention(nn.Module, CustomModule):
             bias=False,
             input_is_parallel=True,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # No transformation specs needed - regular parallel layers
@@ -201,7 +201,7 @@ class YourModelAttention(nn.Module, CustomModule):
             bias=False,
             gather_output=False,
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # Define transformation specs for fused QKV
@@ -246,7 +246,7 @@ class YourModelAttention(nn.Module, CustomModule):
             sequence_parallel_enabled=trn_config.sequence_parallel_enabled,
             kv_size_multiplier=self.kv_size_multiplier,
             fuse_qkv=trn_config.fuse_qkv,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         # Define transformation specs for GQA QKV
@@ -336,7 +336,7 @@ class YourModelForCausalLM(NeuronModelMixin, YourPreTrainedModel):
             config.vocab_size,
             bias=False,
             gather_output=False,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )

         self.post_init()
@@ -473,7 +473,7 @@ Update `tests/training/test_modeling_auto.py`:
 @is_trainium_test
 def test_auto_model_with_supported_architecture(from_pretrained):
     trn_config = TrainingNeuronConfig()
-    kwargs = {"torch_dtype": torch.bfloat16}
+    kwargs = {"dtype": torch.bfloat16}
     for model_name_or_path in [
         "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random",
         "michaelbenayoun/granite-tiny-4kv-heads-4layers-random",
@@ -487,7 +487,7 @@ def test_auto_model_with_supported_architecture(from_pretrained):
 @is_trainium_test
 def test_auto_model_for_causal_lm_with_supported_architecture(from_pretrained):
     trn_config = TrainingNeuronConfig()
-    kwargs = {"torch_dtype": torch.bfloat16}
+    kwargs = {"dtype": torch.bfloat16}
     for model_name_or_path in [
         "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random",
         "michaelbenayoun/granite-tiny-4kv-heads-4layers-random",
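Not part of the diff: the snippets above simply thread the config's float dtype into layer constructors. The same idea with plain `torch.nn`, as a tiny self-contained illustration (the parallel layers used above come from neuronx-distributed and are not shown here):

```python
import torch
import torch.nn as nn

class TinyConfig:
    # stands in for `PretrainedConfig.dtype` (formerly `torch_dtype`)
    dtype = torch.bfloat16
    hidden_size = 16

config = TinyConfig()
# The layer's parameters are created directly in the configured dtype.
proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False, dtype=config.dtype)
print(proj.weight.dtype)  # torch.bfloat16
```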
docs/source/model_doc/diffusers/flux.mdx: 2 changes (1 addition & 1 deletion)
@@ -40,7 +40,7 @@ if __name__ == "__main__":

     pipe = NeuronFluxPipeline.from_pretrained(
         "black-forest-labs/FLUX.1-dev",
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         export=True,
         tensor_parallel_size=8,
         **compiler_args,
docs/source/model_doc/diffusers/pixart_alpha.mdx: 2 changes (1 addition & 1 deletion)
@@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtAlphaPipeline
 compiler_args = {"auto_cast": "none"}
 input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120}

-neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)
+neuron_model = NeuronPixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-512x512", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)

 # Save locally
 neuron_model.save_pretrained("pixart_alpha_neuron_512/")
docs/source/model_doc/diffusers/pixart_sigma.mdx: 2 changes (1 addition & 1 deletion)
@@ -61,7 +61,7 @@ from optimum.neuron import NeuronPixArtSigmaPipeline
 compiler_args = {"auto_cast": "none"}
 input_shapes = {"batch_size": 1, "height": 512, "width": 512, "sequence_length": 120}

-neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", torch_dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)
+neuron_model = NeuronPixArtSigmaPipeline.from_pretrained("Jingya/pixart_sigma_pipe_xl_2_512_ms", dtype=torch.bfloat16, export=True, disable_neuron_cache=True, **compiler_args, **input_shapes)

 # Save locally
 neuron_model.save_pretrained("pixart_sigma_neuron_512/")
docs/source/quickstart.mdx: 2 changes (1 addition & 1 deletion)
@@ -79,7 +79,7 @@ def main():
     model = NeuronModelForCausalLM.from_pretrained(
         model_id,
         training_args.trn_config,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         attn_implementation="flash_attention_2", # Enable flash attention
     )

docs/source/training_tutorials/finetune_llama.mdx: 2 changes (1 addition & 1 deletion)
@@ -138,7 +138,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32
 model = NeuronModelForCausalLM.from_pretrained(
     model_id,
     trn_config,
-    torch_dtype=dtype,
+    dtype=dtype,
     # Use FlashAttention2 for better performance and to be able to use larger sequence lengths.
     attn_implementation="flash_attention_2",
 )
docs/source/training_tutorials/finetune_qwen3.mdx: 2 changes (1 addition & 1 deletion)
@@ -137,7 +137,7 @@ dtype = torch.bfloat16 if training_args.bf16 else torch.float32
 model = NeuronModelForCausalLM.from_pretrained(
     model_id,
     trn_config,
-    torch_dtype=dtype,
+    dtype=dtype,
     # Use FlashAttention2 for better performance and to be able to use larger sequence lengths.
     attn_implementation="flash_attention_2",
 )
examples/training/qwen3/finetune_qwen3.sh: 3 changes (2 additions & 1 deletion)
Member Author comment on this change: "To restore back"
@@ -13,7 +13,8 @@ TP_DEGREE=8
 BS=1
 GRADIENT_ACCUMULATION_STEPS=8
 LOGGING_STEPS=2
-MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name
+# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name
+MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name
 OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned"
 DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE"
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
optimum/commands/neuron/cache.py: 2 changes (1 addition & 1 deletion)
@@ -147,7 +147,7 @@ def _list_entries(self):
                     str(entry["batch_size"]),
                     str(entry["sequence_length"]),
                     str(entry.get("tp_degree", entry.get("tensor_parallel_size"))),
-                    str(entry["torch_dtype"]),
+                    str(entry["dtype"]),
                     str(entry["target"]),
                 )
             )
optimum/commands/neuron/serve.py: 4 changes (2 additions & 2 deletions)
@@ -101,7 +101,7 @@ def run(self):
         sequence_length = self.args.sequence_length
         tensor_parallel_size = self.args.tensor_parallel_size
         config = AutoConfig.from_pretrained(model_name_or_path)
-        torch_dtype = DTYPE_MAPPER.pt(config.torch_dtype)
+        torch_dtype = DTYPE_MAPPER.pt(config.dtype)
         try:
             # Look for a NeuronConfig in the model directory
             neuron_config = NeuronConfig.from_pretrained(model_name_or_path)
@@ -202,7 +202,7 @@ def run(self):
             batch_size = selected_entry["batch_size"]
             sequence_length = selected_entry["sequence_length"]
             tensor_parallel_size = selected_entry["tp_degree"]
-            torch_dtype = DTYPE_MAPPER.pt(selected_entry["torch_dtype"])
+            torch_dtype = DTYPE_MAPPER.pt(selected_entry["dtype"])
             warning_msg = f"{model_id} is not a neuron model, but a cached configuration is available using"
             warning_msg += f" instance type {instance_type},"
             warning_msg += f" batch size = {batch_size},"
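Not part of the diff: depending on the installed transformers version, a loaded config may expose the float dtype as `dtype` (newer releases) or `torch_dtype` (older ones). A small version-tolerant sketch of the read performed above, with an illustrative model id:

```python
import torch
from transformers import AutoConfig

config = AutoConfig.from_pretrained("gpt2")  # illustrative model id
# Prefer the new attribute, fall back to the old one, then to float32.
dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None) or torch.float32
print(dtype)
```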
optimum/exporters/neuron/__main__.py: 4 changes (2 additions & 2 deletions)
@@ -581,7 +581,7 @@ def load_models_and_neuron_configs(
         "trust_remote_code": trust_remote_code,
         "framework": "pt",
         "library_name": library_name,
-        "torch_dtype": torch_dtype,
+        "dtype": torch_dtype,
     }
     if model is None:
         model = TasksManager.get_model_from_task(**model_kwargs)
@@ -878,7 +878,7 @@ def main():
         model_name_or_path=args.model,
         output=args.output,
         compiler_kwargs=compiler_kwargs,
-        torch_dtype=args.torch_dtype,
+        torch_dtype=args.dtype,
         tensor_parallel_size=args.tensor_parallel_size,
         task=task,
         dynamic_batch_size=args.dynamic_batch_size,
optimum/neuron/cache/entries/cache_entry.py: 2 changes (1 addition & 1 deletion)
@@ -28,7 +28,7 @@
     "bos_token_id",
     "pad_token_id",
     "torchscript",
-    "torch_dtype", # this has been renamed as `float_dtype` for the check
+    "dtype", # this has been renamed as `float_dtype` for the check
     "_commit_hash",
     "sample_size",
     "projection_dim",
optimum/neuron/cache/hub_cache.py: 2 changes (1 addition & 1 deletion)
@@ -427,7 +427,7 @@ def select_hub_cached_entries(
             continue
         if torch_dtype is not None:
             target_value = DTYPE_MAPPER.pt(torch_dtype) if isinstance(torch_dtype, str) else torch_dtype
-            entry_value = DTYPE_MAPPER.pt(entry.get("torch_dtype"))
+            entry_value = DTYPE_MAPPER.pt(entry.get("dtype"))
             if target_value != entry_value:
                 continue
         selected.append(entry)
optimum/neuron/generation/utils.py: 11 changes (5 additions & 6 deletions)
@@ -325,18 +325,17 @@ class NeuronGenerationMixin(GenerationMixin):
     The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
         - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and
           `do_sample=False`
-        - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0` and
-          `top_k>1`
         - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and
           `do_sample=True`
         - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and
           `do_sample=False`
         - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if `num_beams>1`
           and `do_sample=True`
-        - *diverse beam-search decoding* by calling [`~generation.GenerationMixin.group_beam_search`], if `num_beams>1`
-          and `num_beam_groups>1`
-        - *constrained beam-search decoding* by calling [`~generation.GenerationMixin.constrained_beam_search`], if
-          `constraints!=None` or `force_words_ids!=None`

+    Note: The following strategies have been removed in transformers 4.56.0+:
+        - constrained beam-search decoding (constraints and force_words_ids)
+        - group beam-search decoding (num_beam_groups > 1)
+        - contrastive search (penalty_alpha > 0)

     You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
     learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
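Not part of the diff: the strategies that remain are selected purely through `generate()` arguments, as the updated docstring describes. A small runnable sketch with an illustrative tiny model id:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")  # illustrative model id
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
inputs = tokenizer("Hello", return_tensors="pt")

# Greedy decoding: num_beams=1, do_sample=False
greedy = model.generate(**inputs, max_new_tokens=5, num_beams=1, do_sample=False)
# Multinomial sampling: num_beams=1, do_sample=True
sampled = model.generate(**inputs, max_new_tokens=5, num_beams=1, do_sample=True)
# Beam-search decoding: num_beams>1, do_sample=False
beam = model.generate(**inputs, max_new_tokens=5, num_beams=2, do_sample=False)
print(tokenizer.batch_decode(greedy))
```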
optimum/neuron/modeling_diffusion.py: 2 changes (1 addition & 1 deletion)
@@ -1193,7 +1193,7 @@ def forward(

         outputs = self.model(*inputs)
         if self.config.model_type == "t5" and isinstance(outputs, dict): # Flux text encoder 2
-            return [outputs["last_hidden_state"].to(self.config.torch_dtype)]
+            return [outputs["last_hidden_state"].to(self.config.dtype)]

         if return_dict and not isinstance(outputs, dict):
             outputs = ModelOutput(dict(zip(self.neuron_config.outputs, outputs)))
optimum/neuron/models/inference/backend/config.py: 6 changes (3 additions & 3 deletions)
@@ -83,9 +83,9 @@ def __init__(
         self.batch_size = batch_size
         self.sequence_length = sequence_length
         self.tp_degree = tp_degree
-        self.torch_dtype = torch_dtype
-        if isinstance(self.torch_dtype, str):
-            self.torch_dtype = DTYPE_MAPPER.pt(self.torch_dtype)
+        self.dtype = torch_dtype
+        if isinstance(self.dtype, str):
+            self.dtype = DTYPE_MAPPER.pt(self.dtype)
         self.n_active_tokens = self.sequence_length if n_active_tokens is None else n_active_tokens
         self.output_logits = output_logits

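Not part of the diff: the string-to-`torch.dtype` normalization kept here is small enough to sketch in isolation; `getattr(torch, ...)` below is an illustrative stand-in for the internal `DTYPE_MAPPER` helper, not its actual implementation:

```python
import torch

def normalize_dtype(dtype):
    """Accept either a torch.dtype or its string name (e.g. "bfloat16")."""
    if isinstance(dtype, str):
        return getattr(torch, dtype)  # "bfloat16" -> torch.bfloat16
    return dtype

assert normalize_dtype("bfloat16") is torch.bfloat16
assert normalize_dtype(torch.float32) is torch.float32
```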
(file name not shown)
@@ -99,7 +99,7 @@ def __init__(
         self.head_dim = self.hidden_size // self.num_attention_heads
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
-        self.torch_dtype = neuron_config.torch_dtype
+        self.dtype = neuron_config.dtype
         self.rms_norm_eps = config.rms_norm_eps
         self._qk_scale = qk_scale

@@ -111,7 +111,7 @@ def __init__(
             num_attention_heads=self.num_attention_heads,
             num_key_value_heads=self.num_key_value_heads,
             tp_degree=neuron_config.tp_degree,
-            dtype=self.torch_dtype,
+            dtype=self.dtype,
             bias=qkv_proj_bias,
             gather_output=False,
             fused_qkv=neuron_config.fused_qkv,
@@ -125,12 +125,12 @@ def __init__(
             num_attention_heads=self.num_attention_heads,
             num_key_value_heads=self.num_key_value_heads,
             tp_degree=neuron_config.tp_degree,
-            dtype=self.torch_dtype,
+            dtype=self.dtype,
             bias=o_proj_bias,
             input_is_parallel=True,
             layer_name=self.o_proj_layer_name,
             tensor_model_parallel_group=self.tensor_model_parallel_group,
-            rpl_reduce_dtype=neuron_config.torch_dtype,
+            rpl_reduce_dtype=neuron_config.dtype,
         )
         self.num_heads = utils.divide(self.qkv_proj.get_num_attention_heads(), neuron_config.tp_degree)
         self.num_key_value_heads = utils.divide(self.qkv_proj.get_num_key_value_heads(), neuron_config.tp_degree)
@@ -202,13 +202,13 @@ def perform_prefill(self, Q, K, V, q_len, bsz, attention_mask) -> Tensor:
         Q = (
             Q.permute(0, 1, 3, 2) # after permute: batch, num_heads, d_head, seqlen
             .reshape((bsz * self.num_heads, self.head_dim, q_len))
-            .to(self.torch_dtype)
+            .to(self.dtype)
         )
         Q = Q * self.qk_scale
         K_active = (
-            K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.torch_dtype)
+            K_active.permute(0, 1, 3, 2).reshape((bsz * self.num_heads, self.head_dim, q_len)).to(self.dtype)
         )
-        V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.torch_dtype)
+        V_active = V_active.reshape((bsz * self.num_heads, q_len, self.head_dim)).to(self.dtype)
         # shape: (B*H)DS
         attn_output = torch.zeros(bsz * self.num_heads, self.head_dim, q_len, dtype=Q.dtype, device=Q.device)

(file name not shown)
@@ -38,8 +38,8 @@ def __init__(
         self.max_tokens = max_tokens
         self.active_tokens = active_tokens

-        if not self.neuron_config.torch_dtype:
-            self.neuron_config.torch_dtype = torch.float32
+        if not self.neuron_config.dtype:
+            self.neuron_config.dtype = torch.float32

         if config.pad_token_id is None:
             config.pad_token_id = 0
@@ -88,9 +88,9 @@ def load_module(self):
         float_model = self.model_cls(self.config, self.neuron_config)
         float_model.eval()

-        if self.neuron_config.torch_dtype != torch.float32:
+        if self.neuron_config.dtype != torch.float32:
             float_model._apply(
-                lambda t: t.to(self.neuron_config.torch_dtype)
+                lambda t: t.to(self.neuron_config.dtype)
                 if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]
                 else t
             )
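Not part of the diff: a runnable sketch of the same selective `_apply` cast on a plain module, which moves floating-point tensors to the target dtype while leaving integer and float8 tensors untouched (assumes torch >= 2.1 for the float8 dtypes):

```python
import torch
import torch.nn as nn

target_dtype = torch.bfloat16
model = nn.Linear(4, 4)  # illustrative module

# Cast only regular floating-point tensors; skip float8 (and non-float) tensors.
model._apply(
    lambda t: t.to(target_dtype)
    if t.is_floating_point() and t.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]
    else t
)
print(model.weight.dtype)  # torch.bfloat16
```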
(file name not shown)
@@ -37,8 +37,8 @@ def __init__(
         self.model = model
         self.tag = tag

-        if not self.neuron_config.torch_dtype:
-            self.neuron_config.torch_dtype = torch.float32
+        if not self.neuron_config.dtype:
+            self.neuron_config.dtype = torch.float32

         if config.pad_token_id is None:
             config.pad_token_id = 0
(file name not shown)
@@ -47,7 +47,7 @@ def __init__(self, config: PretrainedConfig, neuron_config: NxDNeuronConfig, **k
         self._init_kv_shape(config, neuron_config)

         num_layer = config.num_hidden_layers
-        dtype = neuron_config.torch_dtype
+        dtype = neuron_config.dtype
         self.past_key_values = nn.ParameterList(
             [nn.Parameter(torch.zeros(self.kv_shape, dtype=dtype), requires_grad=False) for _ in range(num_layer * 2)]
         )